mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 12:13:03 +00:00
Pack palette indices
Pack two indices into each byte instead of storing them separately. Reduces memory usage by up to 16 kB per sb128 in streams that use screen content tools when frame-threading is enabled, at the cost of some additional computational overhead for packing/unpacking.
This commit is contained in:
committed by
Henrik Gramner
co-authored by
Henrik Gramner
parent
233a424c38
commit
72e9c7c095
+1
-1
@@ -322,5 +322,5 @@ static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *cons
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
|
||||
c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon);
|
||||
|
||||
c->pal_pred = BF(dav1d_pal_pred, neon);
|
||||
//c->pal_pred = BF(dav1d_pal_pred, neon);
|
||||
}
|
||||
|
||||
+19
-14
@@ -448,7 +448,8 @@ static void read_pal_indices(Dav1dTaskContext *const t,
|
||||
Dav1dTileState *const ts = t->ts;
|
||||
const ptrdiff_t stride = bw4 * 4;
|
||||
assert(pal_idx);
|
||||
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
|
||||
pixel *const pal_tmp = t->scratch.pal_idx_uv;
|
||||
pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
|
||||
uint16_t (*const color_map_cdf)[8] =
|
||||
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
|
||||
uint8_t (*const order)[8] = t->scratch.pal_order;
|
||||
@@ -457,22 +458,26 @@ static void read_pal_indices(Dav1dTaskContext *const t,
|
||||
// top/left-to-bottom/right diagonals ("wave-front")
|
||||
const int first = imin(i, w4 * 4 - 1);
|
||||
const int last = imax(0, i - h4 * 4 + 1);
|
||||
order_palette(pal_idx, stride, i, first, last, order, ctx);
|
||||
order_palette(pal_tmp, stride, i, first, last, order, ctx);
|
||||
for (int j = first, m = 0; j >= last; j--, m++) {
|
||||
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
|
||||
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
|
||||
pal_idx[(i - j) * stride + j] = order[m][color_idx];
|
||||
pal_tmp[(i - j) * stride + j] = order[m][color_idx];
|
||||
}
|
||||
}
|
||||
// fill invisible edges
|
||||
// fill invisible edges and pack to 4-bit (2 pixels per byte)
|
||||
if (bw4 > w4)
|
||||
for (int y = 0; y < 4 * h4; y++)
|
||||
memset(&pal_idx[y * stride + 4 * w4],
|
||||
pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
|
||||
memset(&pal_tmp[y * stride + 4 * w4],
|
||||
pal_tmp[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
|
||||
int i;
|
||||
for (i = 0; i < bw4 * h4 * 8; i++)
|
||||
pal_idx[i] = pal_tmp[2*i+0] | (pal_tmp[2*i+1] << 4);
|
||||
if (h4 < bh4) {
|
||||
const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
|
||||
const ptrdiff_t packed_stride = bw4 * 2;
|
||||
const uint8_t *const src = &pal_idx[i - packed_stride];
|
||||
for (int y = h4 * 4; y < bh4 * 4; y++)
|
||||
memcpy(&pal_idx[y * stride], src, bw4 * 4);
|
||||
memcpy(&pal_idx[y * packed_stride], src, packed_stride);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1205,9 +1210,9 @@ static int decode_b(Dav1dTaskContext *const t,
|
||||
const int p = t->frame_thread.pass & 1;
|
||||
assert(ts->frame_thread[p].pal_idx);
|
||||
pal_idx = ts->frame_thread[p].pal_idx;
|
||||
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
|
||||
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
|
||||
} else
|
||||
pal_idx = t->scratch.pal_idx;
|
||||
pal_idx = t->scratch.pal_idx_y;
|
||||
read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
|
||||
@@ -1219,9 +1224,9 @@ static int decode_b(Dav1dTaskContext *const t,
|
||||
const int p = t->frame_thread.pass & 1;
|
||||
assert(ts->frame_thread[p].pal_idx);
|
||||
pal_idx = ts->frame_thread[p].pal_idx;
|
||||
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
|
||||
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
|
||||
} else
|
||||
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
|
||||
pal_idx = t->scratch.pal_idx_uv;
|
||||
read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
|
||||
if (DEBUG_BLOCK_INFO)
|
||||
printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
|
||||
@@ -2488,7 +2493,7 @@ static void setup_tile(Dav1dTileState *const ts,
|
||||
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
|
||||
for (int p = 0; p < 2; p++) {
|
||||
ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
|
||||
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
|
||||
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
|
||||
NULL;
|
||||
ts->frame_thread[p].cf = f->frame_thread.cf ?
|
||||
(uint8_t*)f->frame_thread.cf +
|
||||
@@ -2893,7 +2898,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
|
||||
dav1d_free_aligned(f->frame_thread.pal_idx);
|
||||
f->frame_thread.pal_idx =
|
||||
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
|
||||
pal_idx_sz * 128 * 128 / 4, 64);
|
||||
pal_idx_sz * 128 * 128 / 8, 64);
|
||||
if (!f->frame_thread.pal_idx) {
|
||||
f->frame_thread.pal_idx_sz = 0;
|
||||
goto error;
|
||||
|
||||
+2
-1
@@ -424,7 +424,8 @@ struct Dav1dTaskContext {
|
||||
int16_t ac[32 * 32]; // intra-only
|
||||
uint8_t txtp_map[32 * 32]; // inter-only
|
||||
};
|
||||
uint8_t pal_idx[2 * 64 * 64];
|
||||
uint8_t pal_idx_y[32 * 64];
|
||||
uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */
|
||||
union {
|
||||
struct {
|
||||
uint8_t interintra_8bpc[64 * 64];
|
||||
|
||||
+6
-3
@@ -719,9 +719,12 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
|
||||
const int w, const int h)
|
||||
{
|
||||
for (int y = 0; y < h; y++) {
|
||||
for (int x = 0; x < w; x++)
|
||||
dst[x] = pal[idx[x]];
|
||||
idx += w;
|
||||
for (int x = 0; x < w; x += 2) {
|
||||
const int i = *idx++;
|
||||
assert(!(i & 0x88));
|
||||
dst[x + 0] = pal[i & 7];
|
||||
dst[x + 1] = pal[i >> 4];
|
||||
}
|
||||
dst += PXSTRIDE(stride);
|
||||
}
|
||||
}
|
||||
|
||||
+4
-4
@@ -1236,9 +1236,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
|
||||
const int p = t->frame_thread.pass & 1;
|
||||
assert(ts->frame_thread[p].pal_idx);
|
||||
pal_idx = ts->frame_thread[p].pal_idx;
|
||||
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
|
||||
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
|
||||
} else {
|
||||
pal_idx = t->scratch.pal_idx;
|
||||
pal_idx = t->scratch.pal_idx_y;
|
||||
}
|
||||
const pixel *const pal = t->frame_thread.pass ?
|
||||
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
|
||||
@@ -1437,10 +1437,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
|
||||
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
|
||||
((t->bx >> 1) + (t->by & 1))];
|
||||
pal_idx = ts->frame_thread[p].pal_idx;
|
||||
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
|
||||
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
|
||||
} else {
|
||||
pal = bytefn(t->scratch.pal);
|
||||
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
|
||||
pal_idx = t->scratch.pal_idx_uv;
|
||||
}
|
||||
|
||||
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
|
||||
|
||||
+61
-48
@@ -4885,24 +4885,26 @@ cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
|
||||
jg .w32_wpad
|
||||
jmp .w32_hpad
|
||||
|
||||
cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
|
||||
vbroadcasti128 m3, [palq]
|
||||
cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
|
||||
vbroadcasti128 m4, [palq]
|
||||
lea r2, [pal_pred_16bpc_avx2_table]
|
||||
tzcnt wd, wm
|
||||
vbroadcasti128 m4, [pal_pred_shuf]
|
||||
vbroadcasti128 m5, [pal_pred_shuf]
|
||||
movifnidn hd, hm
|
||||
movsxd wq, [r2+wq*4]
|
||||
pshufb m3, m4
|
||||
punpckhqdq m4, m3, m3
|
||||
pshufb m4, m5
|
||||
punpckhqdq m5, m4, m4
|
||||
add wq, r2
|
||||
DEFINE_ARGS dst, stride, stride3, idx, w, h
|
||||
lea stride3q, [strideq*3]
|
||||
jmp wq
|
||||
.w4:
|
||||
mova xm2, [idxq]
|
||||
add idxq, 16
|
||||
pshufb xm1, xm3, xm2
|
||||
pshufb xm2, xm4, xm2
|
||||
movq xm0, [idxq]
|
||||
add idxq, 8
|
||||
psrlw xm1, xm0, 4
|
||||
punpcklbw xm0, xm1
|
||||
pshufb xm1, xm4, xm0
|
||||
pshufb xm2, xm5, xm0
|
||||
punpcklbw xm0, xm1, xm2
|
||||
punpckhbw xm1, xm2
|
||||
movq [dstq+strideq*0], xm0
|
||||
@@ -4914,10 +4916,12 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
|
||||
jg .w4
|
||||
RET
|
||||
.w8:
|
||||
movu m2, [idxq] ; only 16-byte alignment
|
||||
add idxq, 32
|
||||
pshufb m1, m3, m2
|
||||
pshufb m2, m4, m2
|
||||
pmovzxbw m2, [idxq]
|
||||
add idxq, 16
|
||||
psllw m1, m2, 4
|
||||
por m2, m1
|
||||
pshufb m1, m4, m2
|
||||
pshufb m2, m5, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], xm0
|
||||
@@ -4929,19 +4933,22 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
|
||||
jg .w8
|
||||
RET
|
||||
.w16:
|
||||
vpermq m2, [idxq+ 0], q3120
|
||||
vpermq m5, [idxq+32], q3120
|
||||
add idxq, 64
|
||||
pshufb m1, m3, m2
|
||||
pshufb m2, m4, m2
|
||||
pshufd m3, [idxq], q3120
|
||||
add idxq, 32
|
||||
vpermq m3, m3, q3120
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m2, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m2
|
||||
pshufb m2, m5, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
pshufb m1, m3, m5
|
||||
pshufb m2, m4, m5
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
pshufb m1, m4, m3
|
||||
pshufb m3, m5, m3
|
||||
punpcklbw m0, m1, m3
|
||||
punpckhbw m1, m3
|
||||
mova [dstq+strideq*2], m0
|
||||
mova [dstq+stride3q ], m1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
@@ -4949,41 +4956,47 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
|
||||
jg .w16
|
||||
RET
|
||||
.w32:
|
||||
vpermq m2, [idxq+ 0], q3120
|
||||
vpermq m5, [idxq+32], q3120
|
||||
add idxq, 64
|
||||
pshufb m1, m3, m2
|
||||
pshufb m2, m4, m2
|
||||
pshufd m3, [idxq], q3120
|
||||
add idxq, 32
|
||||
vpermq m3, m3, q3120
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m2, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m2
|
||||
pshufb m2, m5, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0+ 0], m0
|
||||
mova [dstq+strideq*0+32], m1
|
||||
pshufb m1, m3, m5
|
||||
pshufb m2, m4, m5
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*1+ 0], m0
|
||||
mova [dstq+strideq*1+32], m1
|
||||
mova [dstq+ 0], m0
|
||||
mova [dstq+32], m1
|
||||
pshufb m1, m4, m3
|
||||
pshufb m3, m5, m3
|
||||
punpcklbw m0, m1, m3
|
||||
punpckhbw m1, m3
|
||||
mova [dstq+strideq+ 0], m0
|
||||
mova [dstq+strideq+32], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w32
|
||||
RET
|
||||
.w64:
|
||||
vpermq m2, [idxq+ 0], q3120
|
||||
vpermq m5, [idxq+32], q3120
|
||||
add idxq, 64
|
||||
pshufb m1, m3, m2
|
||||
pshufb m2, m4, m2
|
||||
pshufd m3, [idxq], q3120
|
||||
add idxq, 32
|
||||
vpermq m3, m3, q3120
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m2, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m2
|
||||
pshufb m2, m5, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+ 0], m0
|
||||
mova [dstq+32], m1
|
||||
pshufb m1, m3, m5
|
||||
pshufb m2, m4, m5
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+64], m0
|
||||
mova [dstq+96], m1
|
||||
mova [dstq+32*0], m0
|
||||
mova [dstq+32*1], m1
|
||||
pshufb m1, m4, m3
|
||||
pshufb m3, m5, m3
|
||||
punpcklbw m0, m1, m3
|
||||
punpckhbw m1, m3
|
||||
mova [dstq+32*2], m0
|
||||
mova [dstq+32*3], m1
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
jg .w64
|
||||
|
||||
+52
-30
@@ -38,10 +38,10 @@ smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
|
||||
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
|
||||
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
|
||||
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
|
||||
pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
|
||||
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
|
||||
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
|
||||
db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
|
||||
pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
|
||||
db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
|
||||
db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
|
||||
db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
|
||||
filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
|
||||
times 4 db 10, 11, 12, 13, 2, 3, -1, -1
|
||||
filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
|
||||
@@ -57,6 +57,8 @@ filter_shift: times 2 dw 6
|
||||
dd 0
|
||||
times 2 dw 4
|
||||
dd 9
|
||||
pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
|
||||
db 16, 24, 20, 28, 48, 56, 52, 60
|
||||
|
||||
%macro JMP_TABLE 3-*
|
||||
%xdefine %1_%2_table (%%table - 2*4)
|
||||
@@ -610,20 +612,23 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
|
||||
jg .w64_loop
|
||||
RET
|
||||
|
||||
cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
|
||||
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
|
||||
lea r6, [pal_pred_16bpc_avx512icl_table]
|
||||
tzcnt wd, wm
|
||||
mova m2, [pal_pred_perm]
|
||||
movsxd wq, [r6+wq*4]
|
||||
mova xm3, [palq]
|
||||
mova m3, [pal_pred_perm]
|
||||
movifnidn hd, hm
|
||||
movsxd wq, [r6+wq*4]
|
||||
vpbroadcastq m4, [pal_unpack+0]
|
||||
vpbroadcastq m5, [pal_unpack+8]
|
||||
add wq, r6
|
||||
vbroadcasti32x4 m6, [palq]
|
||||
lea stride3q, [strideq*3]
|
||||
jmp wq
|
||||
.w4:
|
||||
pmovzxbw ym0, [idxq]
|
||||
add idxq, 16
|
||||
vpermw ym0, ym0, ym3
|
||||
pmovzxbd ym0, [idxq]
|
||||
add idxq, 8
|
||||
vpmultishiftqb ym0, ym4, ym0
|
||||
vpermw ym0, ym0, ym6
|
||||
vextracti32x4 xm1, ym0, 1
|
||||
movq [dstq+strideq*0], xm0
|
||||
movhps [dstq+strideq*1], xm0
|
||||
@@ -634,9 +639,10 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
|
||||
jg .w4
|
||||
RET
|
||||
.w8:
|
||||
pmovzxbw m0, [idxq]
|
||||
add idxq, 32
|
||||
vpermw m0, m0, m3
|
||||
pmovzxbd m0, [idxq]
|
||||
add idxq, 16
|
||||
vpmultishiftqb m0, m4, m0
|
||||
vpermw m0, m0, m6
|
||||
mova [dstq+strideq*0], xm0
|
||||
vextracti32x4 [dstq+strideq*1], ym0, 1
|
||||
vextracti32x4 [dstq+strideq*2], m0, 2
|
||||
@@ -646,11 +652,13 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
|
||||
jg .w8
|
||||
RET
|
||||
.w16:
|
||||
vpermb m1, m2, [idxq]
|
||||
add idxq, 64
|
||||
vpermw m0, m1, m3
|
||||
movu ym1, [idxq]
|
||||
add idxq, 32
|
||||
vpermb m1, m3, m1
|
||||
vpmultishiftqb m1, m4, m1
|
||||
vpermw m0, m1, m6
|
||||
psrlw m1, 8
|
||||
vpermw m1, m1, m3
|
||||
vpermw m1, m1, m6
|
||||
mova [dstq+strideq*0], ym0
|
||||
vextracti32x8 [dstq+strideq*1], m0, 1
|
||||
mova [dstq+strideq*2], ym1
|
||||
@@ -660,27 +668,41 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
|
||||
jg .w16
|
||||
RET
|
||||
.w32:
|
||||
vpermb m1, m2, [idxq]
|
||||
vpermb m2, m3, [idxq]
|
||||
add idxq, 64
|
||||
vpermw m0, m1, m3
|
||||
vpmultishiftqb m1, m4, m2
|
||||
vpmultishiftqb m2, m5, m2
|
||||
vpermw m0, m1, m6
|
||||
psrlw m1, 8
|
||||
vpermw m1, m1, m3
|
||||
vpermw m1, m1, m6
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
vpermw m0, m2, m6
|
||||
psrlw m2, 8
|
||||
vpermw m1, m2, m6
|
||||
mova [dstq+strideq*2], m0
|
||||
mova [dstq+stride3q ], m1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w32
|
||||
RET
|
||||
.w64:
|
||||
vpermb m1, m2, [idxq]
|
||||
vpermb m2, m3, [idxq]
|
||||
add idxq, 64
|
||||
vpermw m0, m1, m3
|
||||
vpmultishiftqb m1, m4, m2
|
||||
vpmultishiftqb m2, m5, m2
|
||||
vpermw m0, m1, m6
|
||||
psrlw m1, 8
|
||||
vpermw m1, m1, m3
|
||||
mova [dstq+64*0], m0
|
||||
mova [dstq+64*1], m1
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
vpermw m1, m1, m6
|
||||
mova [dstq+ 0], m0
|
||||
mova [dstq+64], m1
|
||||
vpermw m0, m2, m6
|
||||
psrlw m2, 8
|
||||
vpermw m1, m2, m6
|
||||
mova [dstq+strideq+ 0], m0
|
||||
mova [dstq+strideq+64], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w64
|
||||
RET
|
||||
|
||||
|
||||
+78
-51
@@ -3964,25 +3964,27 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
|
||||
jg .w32_hpad_loop
|
||||
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
|
||||
|
||||
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
|
||||
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
|
||||
%define base r2-pal_pred_16bpc_ssse3_table
|
||||
%if ARCH_X86_32
|
||||
%define hd r2d
|
||||
%endif
|
||||
mova m3, [palq]
|
||||
mova m4, [palq]
|
||||
LEA r2, pal_pred_16bpc_ssse3_table
|
||||
tzcnt wd, wm
|
||||
pshufb m3, [base+pal_pred_shuf]
|
||||
pshufb m4, [base+pal_pred_shuf]
|
||||
movsxd wq, [r2+wq*4]
|
||||
pshufd m4, m3, q1032
|
||||
pshufd m5, m4, q1032
|
||||
add wq, r2
|
||||
movifnidn hd, hm
|
||||
jmp wq
|
||||
.w4:
|
||||
mova m0, [idxq]
|
||||
add idxq, 16
|
||||
pshufb m1, m3, m0
|
||||
pshufb m2, m4, m0
|
||||
movq m0, [idxq]
|
||||
add idxq, 8
|
||||
psrlw m1, m0, 4
|
||||
punpcklbw m0, m1
|
||||
pshufb m1, m4, m0
|
||||
pshufb m2, m5, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
movq [dstq+strideq*0], m0
|
||||
@@ -3995,77 +3997,102 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
|
||||
jg .w4
|
||||
RET
|
||||
.w8:
|
||||
mova m0, [idxq]
|
||||
mova m3, [idxq]
|
||||
add idxq, 16
|
||||
pshufb m1, m3, m0
|
||||
pshufb m2, m4, m0
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m0, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m0
|
||||
pshufb m2, m5, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
pshufb m1, m4, m3
|
||||
pshufb m2, m5, m3
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 4
|
||||
jg .w8
|
||||
RET
|
||||
.w16:
|
||||
mova m0, [idxq]
|
||||
mova m3, [idxq]
|
||||
add idxq, 16
|
||||
pshufb m1, m3, m0
|
||||
pshufb m2, m4, m0
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m0, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m0
|
||||
pshufb m2, m5, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+16*0], m0
|
||||
mova [dstq+16*1], m1
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
mova [dstq+ 0], m0
|
||||
mova [dstq+16], m1
|
||||
pshufb m1, m4, m3
|
||||
pshufb m2, m5, m3
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq+ 0], m0
|
||||
mova [dstq+strideq+16], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w16
|
||||
RET
|
||||
.w32:
|
||||
mova m0, [idxq+16*0]
|
||||
pshufb m1, m3, m0
|
||||
pshufb m2, m4, m0
|
||||
mova m3, [idxq]
|
||||
add idxq, 16
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m0, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m0
|
||||
pshufb m2, m5, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova m2, [idxq+16*1]
|
||||
add idxq, 16*2
|
||||
mova [dstq+16*0], m0
|
||||
pshufb m0, m3, m2
|
||||
mova [dstq+16*1], m1
|
||||
pshufb m1, m4, m2
|
||||
punpcklbw m2, m0, m1
|
||||
punpckhbw m0, m1
|
||||
mova [dstq+16*2], m2
|
||||
mova [dstq+16*3], m0
|
||||
pshufb m1, m4, m3
|
||||
pshufb m2, m5, m3
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+16*2], m0
|
||||
mova [dstq+16*3], m1
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
jg .w32
|
||||
RET
|
||||
.w64:
|
||||
mova m0, [idxq+16*0]
|
||||
pshufb m1, m3, m0
|
||||
pshufb m2, m4, m0
|
||||
mova m3, [idxq+16*0]
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m0, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m0
|
||||
pshufb m2, m5, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova m2, [idxq+16*1]
|
||||
mova [dstq+16*0], m0
|
||||
pshufb m0, m3, m2
|
||||
mova [dstq+16*1], m1
|
||||
pshufb m1, m4, m2
|
||||
punpcklbw m2, m0, m1
|
||||
punpckhbw m0, m1
|
||||
mova m1, [idxq+16*2]
|
||||
mova [dstq+16*2], m2
|
||||
pshufb m2, m3, m1
|
||||
mova [dstq+16*3], m0
|
||||
pshufb m0, m4, m1
|
||||
punpcklbw m1, m2, m0
|
||||
punpckhbw m2, m0
|
||||
mova m0, [idxq+16*3]
|
||||
add idxq, 16*4
|
||||
mova [dstq+16*4], m1
|
||||
pshufb m1, m3, m0
|
||||
mova [dstq+16*5], m2
|
||||
pshufb m2, m4, m0
|
||||
pshufb m1, m4, m3
|
||||
pshufb m2, m5, m3
|
||||
mova m3, [idxq+16*1]
|
||||
add idxq, 32
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+16*2], m0
|
||||
mova [dstq+16*3], m1
|
||||
psrlw m1, m3, 4
|
||||
punpcklbw m0, m3, m1
|
||||
punpckhbw m3, m1
|
||||
pshufb m1, m4, m0
|
||||
pshufb m2, m5, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+16*4], m0
|
||||
mova [dstq+16*5], m1
|
||||
pshufb m1, m4, m3
|
||||
pshufb m2, m5, m3
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+16*6], m0
|
||||
|
||||
+42
-35
@@ -5316,8 +5316,11 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
|
||||
lea r2, [strideq*3]
|
||||
jmp wq
|
||||
.w4:
|
||||
pshufb xm0, xm4, [idxq]
|
||||
add idxq, 16
|
||||
movq xm0, [idxq]
|
||||
add idxq, 8
|
||||
psrlw xm1, xm0, 4
|
||||
punpcklbw xm0, xm1
|
||||
pshufb xm0, xm4, xm0
|
||||
movd [dstq+strideq*0], xm0
|
||||
pextrd [dstq+strideq*1], xm0, 1
|
||||
pextrd [dstq+strideq*2], xm0, 2
|
||||
@@ -5326,11 +5329,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
|
||||
sub hd, 4
|
||||
jg .w4
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w8:
|
||||
pshufb xm0, xm4, [idxq+16*0]
|
||||
pshufb xm1, xm4, [idxq+16*1]
|
||||
add idxq, 16*2
|
||||
movu xm2, [idxq]
|
||||
add idxq, 16
|
||||
pshufb xm1, xm4, xm2
|
||||
psrlw xm2, 4
|
||||
pshufb xm2, xm4, xm2
|
||||
punpcklbw xm0, xm1, xm2
|
||||
punpckhbw xm1, xm2
|
||||
movq [dstq+strideq*0], xm0
|
||||
movhps [dstq+strideq*1], xm0
|
||||
movq [dstq+strideq*2], xm1
|
||||
@@ -5339,47 +5345,48 @@ ALIGN function_align
|
||||
sub hd, 4
|
||||
jg .w8
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w16:
|
||||
pshufb m0, m4, [idxq+32*0]
|
||||
pshufb m1, m4, [idxq+32*1]
|
||||
add idxq, 32*2
|
||||
movu m2, [idxq]
|
||||
add idxq, 32
|
||||
pshufb m1, m4, m2
|
||||
psrlw m2, 4
|
||||
pshufb m2, m4, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], xm0
|
||||
vextracti128 [dstq+strideq*1], m0, 1
|
||||
mova [dstq+strideq*2], xm1
|
||||
mova [dstq+strideq*1], xm1
|
||||
vextracti128 [dstq+strideq*2], m0, 1
|
||||
vextracti128 [dstq+r2 ], m1, 1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w16
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w32:
|
||||
pshufb m0, m4, [idxq+32*0]
|
||||
pshufb m1, m4, [idxq+32*1]
|
||||
pshufb m2, m4, [idxq+32*2]
|
||||
pshufb m3, m4, [idxq+32*3]
|
||||
add idxq, 32*4
|
||||
vpermq m2, [idxq], q3120
|
||||
add idxq, 32
|
||||
pshufb m1, m4, m2
|
||||
psrlw m2, 4
|
||||
pshufb m2, m4, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
mova [dstq+strideq*2], m2
|
||||
mova [dstq+r2 ], m3
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w32
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w64:
|
||||
pshufb m0, m4, [idxq+32*0]
|
||||
pshufb m1, m4, [idxq+32*1]
|
||||
pshufb m2, m4, [idxq+32*2]
|
||||
pshufb m3, m4, [idxq+32*3]
|
||||
add idxq, 32*4
|
||||
mova [dstq+strideq*0+32*0], m0
|
||||
mova [dstq+strideq*0+32*1], m1
|
||||
mova [dstq+strideq*1+32*0], m2
|
||||
mova [dstq+strideq*1+32*1], m3
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w32
|
||||
RET
|
||||
.w64:
|
||||
vpermq m2, [idxq], q3120
|
||||
add idxq, 32
|
||||
pshufb m1, m4, m2
|
||||
psrlw m2, 4
|
||||
pshufb m2, m4, m2
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+32*0], m0
|
||||
mova [dstq+32*1], m1
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
jg .w64
|
||||
RET
|
||||
|
||||
|
||||
+47
-28
@@ -95,6 +95,8 @@ smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
|
||||
db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
|
||||
ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
|
||||
db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
|
||||
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
|
||||
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
|
||||
|
||||
pb_127_m127: times 2 db 127, -127
|
||||
pb_128: times 4 db 128
|
||||
@@ -126,7 +128,6 @@ JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
|
||||
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
|
||||
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
|
||||
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
|
||||
JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64
|
||||
|
||||
SECTION .text
|
||||
|
||||
@@ -1111,18 +1112,20 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
|
||||
jg .w64_loop
|
||||
RET
|
||||
|
||||
cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
|
||||
lea r6, [pal_pred_8bpc_avx512icl_table]
|
||||
tzcnt wd, wm
|
||||
vpbroadcastq m4, [palq]
|
||||
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
|
||||
movifnidn wd, wm
|
||||
movifnidn hd, hm
|
||||
movsxd wq, [r6+wq*4]
|
||||
add wq, r6
|
||||
lea stride3q, [strideq*3]
|
||||
jmp wq
|
||||
cmp wd, 8
|
||||
jg .w32
|
||||
movq xmm3, [palq]
|
||||
je .w8
|
||||
.w4:
|
||||
pshufb xmm0, xm4, [idxq]
|
||||
add idxq, 16
|
||||
movq xmm0, [idxq]
|
||||
add idxq, 8
|
||||
psrlw xmm1, xmm0, 4
|
||||
punpcklbw xmm0, xmm1
|
||||
pshufb xmm0, xmm3, xmm0
|
||||
movd [dstq+strideq*0], xmm0
|
||||
pextrd [dstq+strideq*1], xmm0, 1
|
||||
pextrd [dstq+strideq*2], xmm0, 2
|
||||
@@ -1132,9 +1135,13 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
|
||||
jg .w4
|
||||
RET
|
||||
.w8:
|
||||
pshufb xmm0, xm4, [idxq+16*0]
|
||||
pshufb xmm1, xm4, [idxq+16*1]
|
||||
add idxq, 16*2
|
||||
movu xmm2, [idxq]
|
||||
add idxq, 16
|
||||
pshufb xmm1, xmm3, xmm2
|
||||
psrlw xmm2, 4
|
||||
pshufb xmm2, xmm3, xmm2
|
||||
punpcklbw xmm0, xmm1, xmm2
|
||||
punpckhbw xmm1, xmm2
|
||||
movq [dstq+strideq*0], xmm0
|
||||
movhps [dstq+strideq*1], xmm0
|
||||
movq [dstq+strideq*2], xmm1
|
||||
@@ -1144,8 +1151,10 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
|
||||
jg .w8
|
||||
RET
|
||||
.w16:
|
||||
pshufb m0, m4, [idxq]
|
||||
add idxq, 64
|
||||
pmovzxdq m0, [idxq]
|
||||
add idxq, 32
|
||||
vpmultishiftqb m0, m3, m0
|
||||
pshufb m0, m5, m0
|
||||
mova [dstq+strideq*0], xm0
|
||||
vextracti32x4 [dstq+strideq*1], ym0, 1
|
||||
vextracti32x4 [dstq+strideq*2], m0, 2
|
||||
@@ -1155,29 +1164,39 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
|
||||
jg .w16
|
||||
RET
|
||||
.w32:
|
||||
pshufb m0, m4, [idxq+64*0]
|
||||
pshufb m1, m4, [idxq+64*1]
|
||||
add idxq, 64*2
|
||||
vpbroadcastq m3, [pal_unpack+0]
|
||||
vpbroadcastq m5, [palq]
|
||||
cmp wd, 32
|
||||
jl .w16
|
||||
pmovzxbd m2, [pal_perm]
|
||||
vpbroadcastq m4, [pal_unpack+8]
|
||||
jg .w64
|
||||
.w32_loop:
|
||||
vpermd m1, m2, [idxq]
|
||||
add idxq, 64
|
||||
vpmultishiftqb m0, m3, m1
|
||||
vpmultishiftqb m1, m4, m1
|
||||
pshufb m0, m5, m0
|
||||
pshufb m1, m5, m1
|
||||
mova [dstq+strideq*0], ym0
|
||||
vextracti32x8 [dstq+strideq*1], m0, 1
|
||||
mova [dstq+strideq*2], ym1
|
||||
vextracti32x8 [dstq+stride3q ], m1, 1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w32
|
||||
jg .w32_loop
|
||||
RET
|
||||
.w64:
|
||||
pshufb m0, m4, [idxq+64*0]
|
||||
pshufb m1, m4, [idxq+64*1]
|
||||
pshufb m2, m4, [idxq+64*2]
|
||||
pshufb m3, m4, [idxq+64*3]
|
||||
add idxq, 64*4
|
||||
vpermd m1, m2, [idxq]
|
||||
add idxq, 64
|
||||
vpmultishiftqb m0, m3, m1
|
||||
vpmultishiftqb m1, m4, m1
|
||||
pshufb m0, m5, m0
|
||||
pshufb m1, m5, m1
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
mova [dstq+strideq*2], m2
|
||||
mova [dstq+stride3q ], m3
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w64
|
||||
RET
|
||||
|
||||
|
||||
+56
-45
@@ -3493,11 +3493,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
|
||||
lea r2, [strideq*3]
|
||||
jmp wq
|
||||
.w4:
|
||||
pshufb m0, m4, [idxq]
|
||||
add idxq, 16
|
||||
movd [dstq ], m0
|
||||
movq m1, [idxq]
|
||||
add idxq, 8
|
||||
psrlw m0, m1, 4
|
||||
punpcklbw m1, m0
|
||||
pshufb m0, m4, m1
|
||||
movd [dstq+strideq*0], m0
|
||||
pshuflw m1, m0, q1032
|
||||
movd [dstq+strideq ], m1
|
||||
movd [dstq+strideq*1], m1
|
||||
punpckhqdq m0, m0
|
||||
movd [dstq+strideq*2], m0
|
||||
psrlq m0, 32
|
||||
@@ -3506,60 +3509,68 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
|
||||
sub hd, 4
|
||||
jg .w4
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w8:
|
||||
pshufb m0, m4, [idxq]
|
||||
pshufb m1, m4, [idxq+16]
|
||||
add idxq, 32
|
||||
movq [dstq ], m0
|
||||
movhps [dstq+strideq ], m0
|
||||
movu m0, [idxq]
|
||||
add idxq, 16
|
||||
pshufb m1, m4, m0
|
||||
psrlw m0, 4
|
||||
pshufb m2, m4, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
movq [dstq+strideq*0], m0
|
||||
movhps [dstq+strideq*1], m0
|
||||
movq [dstq+strideq*2], m1
|
||||
movhps [dstq+r2 ], m1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w8
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w16:
|
||||
pshufb m0, m4, [idxq]
|
||||
pshufb m1, m4, [idxq+16]
|
||||
pshufb m2, m4, [idxq+32]
|
||||
pshufb m3, m4, [idxq+48]
|
||||
add idxq, 64
|
||||
mova [dstq ], m0
|
||||
mova [dstq+strideq ], m1
|
||||
mova [dstq+strideq*2], m2
|
||||
mova [dstq+r2 ], m3
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w16
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w32:
|
||||
pshufb m0, m4, [idxq]
|
||||
pshufb m1, m4, [idxq+16]
|
||||
pshufb m2, m4, [idxq+32]
|
||||
pshufb m3, m4, [idxq+48]
|
||||
add idxq, 64
|
||||
mova [dstq ], m0
|
||||
mova [dstq+16 ], m1
|
||||
mova [dstq+strideq ], m2
|
||||
mova [dstq+strideq+16], m3
|
||||
movu m0, [idxq]
|
||||
add idxq, 16
|
||||
pshufb m1, m4, m0
|
||||
psrlw m0, 4
|
||||
pshufb m2, m4, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w16
|
||||
RET
|
||||
.w32:
|
||||
movu m0, [idxq]
|
||||
add idxq, 16
|
||||
pshufb m1, m4, m0
|
||||
psrlw m0, 4
|
||||
pshufb m2, m4, m0
|
||||
punpcklbw m0, m1, m2
|
||||
punpckhbw m1, m2
|
||||
mova [dstq+16*0], m0
|
||||
mova [dstq+16*1], m1
|
||||
add dstq, strideq
|
||||
dec hd
|
||||
jg .w32
|
||||
RET
|
||||
ALIGN function_align
|
||||
.w64:
|
||||
pshufb m0, m4, [idxq]
|
||||
pshufb m1, m4, [idxq+16]
|
||||
pshufb m2, m4, [idxq+32]
|
||||
pshufb m3, m4, [idxq+48]
|
||||
add idxq, 64
|
||||
mova [dstq ], m0
|
||||
mova [dstq+16], m1
|
||||
mova [dstq+32], m2
|
||||
mova [dstq+48], m3
|
||||
movu m0, [idxq+16*0]
|
||||
movu m2, [idxq+16*1]
|
||||
add idxq, 32
|
||||
pshufb m1, m4, m0
|
||||
psrlw m0, 4
|
||||
pshufb m3, m4, m0
|
||||
punpcklbw m0, m1, m3
|
||||
punpckhbw m1, m3
|
||||
mova [dstq+16*0], m0
|
||||
mova [dstq+16*1], m1
|
||||
pshufb m1, m4, m2
|
||||
psrlw m2, 4
|
||||
pshufb m3, m4, m2
|
||||
punpcklbw m0, m1, m3
|
||||
punpckhbw m1, m3
|
||||
mova [dstq+16*2], m0
|
||||
mova [dstq+16*3], m1
|
||||
add dstq, strideq
|
||||
sub hd, 1
|
||||
jg .w64
|
||||
|
||||
@@ -251,7 +251,7 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
|
||||
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
|
||||
PIXEL_RECT(c_dst, 64, 64);
|
||||
PIXEL_RECT(a_dst, 64, 64);
|
||||
ALIGN_STK_64(uint8_t, idx, 64 * 64,);
|
||||
ALIGN_STK_64(uint8_t, idx, 32 * 64,);
|
||||
ALIGN_STK_16(pixel, pal, 8,);
|
||||
|
||||
declare_func(void, pixel *dst, ptrdiff_t stride, const pixel *pal,
|
||||
@@ -270,8 +270,8 @@ static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
|
||||
for (int i = 0; i < 8; i++)
|
||||
pal[i] = rnd() & bitdepth_max;
|
||||
|
||||
for (int i = 0; i < w * h; i++)
|
||||
idx[i] = rnd() & 7;
|
||||
for (int i = 0; i < w * h / 2; i++)
|
||||
idx[i] = rnd() & 0x77;
|
||||
|
||||
CLEAR_PIXEL_RECT(c_dst);
|
||||
CLEAR_PIXEL_RECT(a_dst);
|
||||
|
||||
Reference in New Issue
Block a user