Pack palette indices

Pack two indices into each byte instead of storing them separately.

Reduces memory usage by up to 16 kB per sb128 in streams that use
screen content tools when frame-threading is enabled, at the cost
of some additional computational overhead for packing/unpacking.
This commit is contained in:
Henrik Gramner
2023-07-06 23:10:22 +02:00
committed by Henrik Gramner
co-authored by Henrik Gramner
parent 233a424c38
commit 72e9c7c095
12 changed files with 371 additions and 263 deletions
+1 -1
View File
@@ -322,5 +322,5 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
c->pal_pred = BF(dav1d_pal_pred, neon);
//c->pal_pred = BF(dav1d_pal_pred, neon);
}
+19 -14
View File
@@ -448,7 +448,8 @@ static void read_pal_indices(Dav1dTaskContext *const t,
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
assert(pal_idx);
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
pixel *const pal_tmp = t->scratch.pal_idx_uv;
pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
uint8_t (*const order)[8] = t->scratch.pal_order;
@@ -457,22 +458,26 @@ static void read_pal_indices(Dav1dTaskContext *const t,
// top/left-to-bottom/right diagonals ("wave-front")
const int first = imin(i, w4 * 4 - 1);
const int last = imax(0, i - h4 * 4 + 1);
order_palette(pal_idx, stride, i, first, last, order, ctx);
order_palette(pal_tmp, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
pal_tmp[(i - j) * stride + j] = order[m][color_idx];
}
}
// fill invisible edges
// fill invisible edges and pack to 4-bit (2 pixels per byte)
if (bw4 > w4)
for (int y = 0; y < 4 * h4; y++)
memset(&pal_idx[y * stride + 4 * w4],
pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
memset(&pal_tmp[y * stride + 4 * w4],
pal_tmp[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
int i;
for (i = 0; i < bw4 * h4 * 8; i++)
pal_idx[i] = pal_tmp[2*i+0] | (pal_tmp[2*i+1] << 4);
if (h4 < bh4) {
const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
const ptrdiff_t packed_stride = bw4 * 2;
const uint8_t *const src = &pal_idx[i - packed_stride];
for (int y = h4 * 4; y < bh4 * 4; y++)
memcpy(&pal_idx[y * stride], src, bw4 * 4);
memcpy(&pal_idx[y * packed_stride], src, packed_stride);
}
}
@@ -1205,9 +1210,9 @@ static int decode_b(Dav1dTaskContext *const t,
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
} else
pal_idx = t->scratch.pal_idx;
pal_idx = t->scratch.pal_idx_y;
read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
if (DEBUG_BLOCK_INFO)
printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
@@ -1219,9 +1224,9 @@ static int decode_b(Dav1dTaskContext *const t,
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
} else
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
pal_idx = t->scratch.pal_idx_uv;
read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
@@ -2488,7 +2493,7 @@ static void setup_tile(Dav1dTileState *const ts,
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
for (int p = 0; p < 2; p++) {
ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
NULL;
ts->frame_thread[p].cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
@@ -2893,7 +2898,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
dav1d_free_aligned(f->frame_thread.pal_idx);
f->frame_thread.pal_idx =
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
pal_idx_sz * 128 * 128 / 4, 64);
pal_idx_sz * 128 * 128 / 8, 64);
if (!f->frame_thread.pal_idx) {
f->frame_thread.pal_idx_sz = 0;
goto error;
+2 -1
View File
@@ -424,7 +424,8 @@ struct Dav1dTaskContext {
int16_t ac[32 * 32]; // intra-only
uint8_t txtp_map[32 * 32]; // inter-only
};
uint8_t pal_idx[2 * 64 * 64];
uint8_t pal_idx_y[32 * 64];
uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */
union {
struct {
uint8_t interintra_8bpc[64 * 64];
+6 -3
View File
@@ -719,9 +719,12 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
const int w, const int h)
{
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
dst[x] = pal[idx[x]];
idx += w;
for (int x = 0; x < w; x += 2) {
const int i = *idx++;
assert(!(i & 0x88));
dst[x + 0] = pal[i & 7];
dst[x + 1] = pal[i >> 4];
}
dst += PXSTRIDE(stride);
}
}
+4 -4
View File
@@ -1236,9 +1236,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
} else {
pal_idx = t->scratch.pal_idx;
pal_idx = t->scratch.pal_idx_y;
}
const pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
@@ -1437,10 +1437,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))];
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
} else {
pal = bytefn(t->scratch.pal);
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
pal_idx = t->scratch.pal_idx_uv;
}
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
+61 -48
View File
@@ -4885,24 +4885,26 @@ cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
jg .w32_wpad
jmp .w32_hpad
cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m3, [palq]
cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
lea r2, [pal_pred_16bpc_avx2_table]
tzcnt wd, wm
vbroadcasti128 m4, [pal_pred_shuf]
vbroadcasti128 m5, [pal_pred_shuf]
movifnidn hd, hm
movsxd wq, [r2+wq*4]
pshufb m3, m4
punpckhqdq m4, m3, m3
pshufb m4, m5
punpckhqdq m5, m4, m4
add wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
lea stride3q, [strideq*3]
jmp wq
.w4:
mova xm2, [idxq]
add idxq, 16
pshufb xm1, xm3, xm2
pshufb xm2, xm4, xm2
movq xm0, [idxq]
add idxq, 8
psrlw xm1, xm0, 4
punpcklbw xm0, xm1
pshufb xm1, xm4, xm0
pshufb xm2, xm5, xm0
punpcklbw xm0, xm1, xm2
punpckhbw xm1, xm2
movq [dstq+strideq*0], xm0
@@ -4914,10 +4916,12 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w4
RET
.w8:
movu m2, [idxq] ; only 16-byte alignment
add idxq, 32
pshufb m1, m3, m2
pshufb m2, m4, m2
pmovzxbw m2, [idxq]
add idxq, 16
psllw m1, m2, 4
por m2, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], xm0
@@ -4929,19 +4933,22 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w8
RET
.w16:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
@@ -4949,41 +4956,47 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w16
RET
.w32:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+32], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+32], m1
mova [dstq+ 0], m0
mova [dstq+32], m1
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+32], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+ 0], m0
mova [dstq+32], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+64], m0
mova [dstq+96], m1
mova [dstq+32*0], m0
mova [dstq+32*1], m1
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+32*2], m0
mova [dstq+32*3], m1
add dstq, strideq
dec hd
jg .w64
+52 -30
View File
@@ -38,10 +38,10 @@ smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
times 4 db 10, 11, 12, 13, 2, 3, -1, -1
filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
@@ -57,6 +57,8 @@ filter_shift: times 2 dw 6
dd 0
times 2 dw 4
dd 9
pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
db 16, 24, 20, 28, 48, 56, 52, 60
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@@ -610,20 +612,23 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_16bpc_avx512icl_table]
tzcnt wd, wm
mova m2, [pal_pred_perm]
movsxd wq, [r6+wq*4]
mova xm3, [palq]
mova m3, [pal_pred_perm]
movifnidn hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastq m4, [pal_unpack+0]
vpbroadcastq m5, [pal_unpack+8]
add wq, r6
vbroadcasti32x4 m6, [palq]
lea stride3q, [strideq*3]
jmp wq
.w4:
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
pmovzxbd ym0, [idxq]
add idxq, 8
vpmultishiftqb ym0, ym4, ym0
vpermw ym0, ym0, ym6
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
@@ -634,9 +639,10 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w4
RET
.w8:
pmovzxbw m0, [idxq]
add idxq, 32
vpermw m0, m0, m3
pmovzxbd m0, [idxq]
add idxq, 16
vpmultishiftqb m0, m4, m0
vpermw m0, m0, m6
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
@@ -646,11 +652,13 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w8
RET
.w16:
vpermb m1, m2, [idxq]
add idxq, 64
vpermw m0, m1, m3
movu ym1, [idxq]
add idxq, 32
vpermb m1, m3, m1
vpmultishiftqb m1, m4, m1
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
vpermw m1, m1, m6
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
@@ -660,27 +668,41 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w16
RET
.w32:
vpermb m1, m2, [idxq]
vpermb m2, m3, [idxq]
add idxq, 64
vpermw m0, m1, m3
vpmultishiftqb m1, m4, m2
vpmultishiftqb m2, m5, m2
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
vpermw m1, m1, m6
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
vpermw m0, m2, m6
psrlw m2, 8
vpermw m1, m2, m6
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
.w64:
vpermb m1, m2, [idxq]
vpermb m2, m3, [idxq]
add idxq, 64
vpermw m0, m1, m3
vpmultishiftqb m1, m4, m2
vpmultishiftqb m2, m5, m2
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, strideq
dec hd
vpermw m1, m1, m6
mova [dstq+ 0], m0
mova [dstq+64], m1
vpermw m0, m2, m6
psrlw m2, 8
vpermw m1, m2, m6
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+64], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET
+78 -51
View File
@@ -3964,25 +3964,27 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
jg .w32_hpad_loop
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
%define hd r2d
%endif
mova m3, [palq]
mova m4, [palq]
LEA r2, pal_pred_16bpc_ssse3_table
tzcnt wd, wm
pshufb m3, [base+pal_pred_shuf]
pshufb m4, [base+pal_pred_shuf]
movsxd wq, [r2+wq*4]
pshufd m4, m3, q1032
pshufd m5, m4, q1032
add wq, r2
movifnidn hd, hm
jmp wq
.w4:
mova m0, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
movq m0, [idxq]
add idxq, 8
psrlw m1, m0, 4
punpcklbw m0, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
@@ -3995,77 +3997,102 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
jg .w4
RET
.w8:
mova m0, [idxq]
mova m3, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w8
RET
.w16:
mova m0, [idxq]
mova m3, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
mova [dstq+ 0], m0
mova [dstq+16], m1
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+16], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
mova m3, [idxq]
add idxq, 16
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
add idxq, 16*2
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova [dstq+16*2], m2
mova [dstq+16*3], m0
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
dec hd
jg .w32
RET
.w64:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
mova m3, [idxq+16*0]
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova m1, [idxq+16*2]
mova [dstq+16*2], m2
pshufb m2, m3, m1
mova [dstq+16*3], m0
pshufb m0, m4, m1
punpcklbw m1, m2, m0
punpckhbw m2, m0
mova m0, [idxq+16*3]
add idxq, 16*4
mova [dstq+16*4], m1
pshufb m1, m3, m0
mova [dstq+16*5], m2
pshufb m2, m4, m0
pshufb m1, m4, m3
pshufb m2, m5, m3
mova m3, [idxq+16*1]
add idxq, 32
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*2], m0
mova [dstq+16*3], m1
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*4], m0
mova [dstq+16*5], m1
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*6], m0
+42 -35
View File
@@ -5316,8 +5316,11 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
lea r2, [strideq*3]
jmp wq
.w4:
pshufb xm0, xm4, [idxq]
add idxq, 16
movq xm0, [idxq]
add idxq, 8
psrlw xm1, xm0, 4
punpcklbw xm0, xm1
pshufb xm0, xm4, xm0
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
pextrd [dstq+strideq*2], xm0, 2
@@ -5326,11 +5329,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb xm0, xm4, [idxq+16*0]
pshufb xm1, xm4, [idxq+16*1]
add idxq, 16*2
movu xm2, [idxq]
add idxq, 16
pshufb xm1, xm4, xm2
psrlw xm2, 4
pshufb xm2, xm4, xm2
punpcklbw xm0, xm1, xm2
punpckhbw xm1, xm2
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
@@ -5339,47 +5345,48 @@ ALIGN function_align
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
add idxq, 32*2
movu m2, [idxq]
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], xm1
mova [dstq+strideq*1], xm1
vextracti128 [dstq+strideq*2], m0, 1
vextracti128 [dstq+r2 ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
vpermq m2, [idxq], q3120
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
mova [dstq+strideq*0+32*0], m0
mova [dstq+strideq*0+32*1], m1
mova [dstq+strideq*1+32*0], m2
mova [dstq+strideq*1+32*1], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
vpermq m2, [idxq], q3120
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+32*0], m0
mova [dstq+32*1], m1
add dstq, strideq
dec hd
jg .w64
RET
+47 -28
View File
@@ -95,6 +95,8 @@ smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_127_m127: times 2 db 127, -127
pb_128: times 4 db 128
@@ -126,7 +128,6 @@ JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64
SECTION .text
@@ -1111,18 +1112,20 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_8bpc_avx512icl_table]
tzcnt wd, wm
vpbroadcastq m4, [palq]
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
movifnidn wd, wm
movifnidn hd, hm
movsxd wq, [r6+wq*4]
add wq, r6
lea stride3q, [strideq*3]
jmp wq
cmp wd, 8
jg .w32
movq xmm3, [palq]
je .w8
.w4:
pshufb xmm0, xm4, [idxq]
add idxq, 16
movq xmm0, [idxq]
add idxq, 8
psrlw xmm1, xmm0, 4
punpcklbw xmm0, xmm1
pshufb xmm0, xmm3, xmm0
movd [dstq+strideq*0], xmm0
pextrd [dstq+strideq*1], xmm0, 1
pextrd [dstq+strideq*2], xmm0, 2
@@ -1132,9 +1135,13 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w4
RET
.w8:
pshufb xmm0, xm4, [idxq+16*0]
pshufb xmm1, xm4, [idxq+16*1]
add idxq, 16*2
movu xmm2, [idxq]
add idxq, 16
pshufb xmm1, xmm3, xmm2
psrlw xmm2, 4
pshufb xmm2, xmm3, xmm2
punpcklbw xmm0, xmm1, xmm2
punpckhbw xmm1, xmm2
movq [dstq+strideq*0], xmm0
movhps [dstq+strideq*1], xmm0
movq [dstq+strideq*2], xmm1
@@ -1144,8 +1151,10 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w8
RET
.w16:
pshufb m0, m4, [idxq]
add idxq, 64
pmovzxdq m0, [idxq]
add idxq, 32
vpmultishiftqb m0, m3, m0
pshufb m0, m5, m0
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
@@ -1155,29 +1164,39 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w16
RET
.w32:
pshufb m0, m4, [idxq+64*0]
pshufb m1, m4, [idxq+64*1]
add idxq, 64*2
vpbroadcastq m3, [pal_unpack+0]
vpbroadcastq m5, [palq]
cmp wd, 32
jl .w16
pmovzxbd m2, [pal_perm]
vpbroadcastq m4, [pal_unpack+8]
jg .w64
.w32_loop:
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
jg .w32_loop
RET
.w64:
pshufb m0, m4, [idxq+64*0]
pshufb m1, m4, [idxq+64*1]
pshufb m2, m4, [idxq+64*2]
pshufb m3, m4, [idxq+64*3]
add idxq, 64*4
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET
+56 -45
View File
@@ -3493,11 +3493,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
lea r2, [strideq*3]
jmp wq
.w4:
pshufb m0, m4, [idxq]
add idxq, 16
movd [dstq ], m0
movq m1, [idxq]
add idxq, 8
psrlw m0, m1, 4
punpcklbw m1, m0
pshufb m0, m4, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq ], m1
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
@@ -3506,60 +3509,68 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
add idxq, 32
movq [dstq ], m0
movhps [dstq+strideq ], m0
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+r2 ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+strideq ], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+16 ], m1
mova [dstq+strideq ], m2
mova [dstq+strideq+16], m3
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [idxq+16*0]
movu m2, [idxq+16*1]
add idxq, 32
pshufb m1, m4, m0
psrlw m0, 4
pshufb m3, m4, m0
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
pshufb m1, m4, m2
psrlw m2, 4
pshufb m3, m4, m2
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
sub hd, 1
jg .w64
+3 -3
View File
@@ -251,7 +251,7 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
PIXEL_RECT(c_dst, 64, 64);
PIXEL_RECT(a_dst, 64, 64);
ALIGN_STK_64(uint8_t, idx, 64 * 64,);
ALIGN_STK_64(uint8_t, idx, 32 * 64,);
ALIGN_STK_16(pixel, pal, 8,);
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *pal,
@@ -270,8 +270,8 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
for (int i = 0; i < 8; i++)
pal[i] = rnd() & bitdepth_max;
for (int i = 0; i < w * h; i++)
idx[i] = rnd() & 7;
for (int i = 0; i < w * h / 2; i++)
idx[i] = rnd() & 0x77;
CLEAR_PIXEL_RECT(c_dst);
CLEAR_PIXEL_RECT(a_dst);