x86/filmgrain: simplify post-horizontal filter blending

This commit is contained in:
Ronald S. Bultje
2021-07-16 17:51:17 -04:00
parent 73db537834
commit 1944317ea6
4 changed files with 416 additions and 557 deletions
+54 -51
View File
@@ -29,8 +29,6 @@
%if ARCH_X86_64
SECTION_RODATA 32
pw_1024: times 16 dw 1024
pw_23_22: times 8 dw 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor: times 2 dw 0xb524
@@ -48,6 +46,7 @@ pw_27_17_17_27: dw 27, 17, 17, 27
; these two should be next to each other
pw_4: times 2 dw 4
pw_16: times 2 dw 16
pw_23_22: dw 23, 22, 0, 32
%macro JMP_TABLE 1-*
%xdefine %1_table %%table
@@ -1480,8 +1479,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
vpbroadcastd m9, [base+pw_4+r9*4]
pmullw m15, m9
%else
vpbroadcastd m14, [pw_1024]
vpbroadcastd m15, [pw_23_22]
vpbroadcastd m14, [pd_16]
vpbroadcastq m15, [pw_23_22]
%endif
movifnidn sbyd, sbym
@@ -1689,16 +1688,18 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
movu m9, [grain_lutq+offxyq*2]
movu m3, [grain_lutq+offxyq*2+82*2]
movd xm5, [grain_lutq+left_offxyq*2+ 0]
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 ; {left0, left1}
punpcklwd xm7, xm9, xm3 ; {cur0, cur1}
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
punpckldq xm7, xm9, xm3 ; {cur0, cur1}
punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1}
%if %1
pmaddwd xm5, [pw_23_22]
%else
pmaddwd xm5, xm15
%endif
vpbroadcastq xm8, [pw_23_22]
pmaddwd xm5, xm8
vpbroadcastd xm8, [pd_16]
paddd xm5, xm8
%else
pmaddwd xm5, xm15
paddd xm5, xm14
%endif
psrad xm5, 5
packssdw xm5, xm5
pcmpeqw xm8, xm8
@@ -1706,11 +1707,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
pxor xm8, xm7
pmaxsw xm5, xm8
pminsw xm5, xm7
vpblendw xm7, xm5, xm9, 11111110b
psrldq xm5, 2
vpblendw xm5, xm3, 11111110b
vpblendd m9, m7, 00001111b
vpblendd m3, m5, 00001111b
vpblendd m9, m9, m5, 00000001b
psrldq xm5, 4
vpblendd m3, m3, m5, 00000001b
; scaling[luma_src]
punpckhwd m5, m4, m2
@@ -1875,13 +1874,14 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
movu m5, [grain_lutq+top_offxyq*2]
punpckhwd m7, m5, m9
punpcklwd m5, m9 ; {top/cur interleaved}
vpbroadcastd m3, [pw_23_22]
REPX {pmaddwd x, m3}, m7, m5
%if %1
REPX {pmaddwd x, [pw_23_22]}, m7, m5
%else
REPX {pmaddwd x, m15}, m7, m5
%endif
vpbroadcastd m3, [pd_16]
REPX {paddd x, m3}, m7, m5
%else
REPX {paddd x, m14}, m7, m5
%endif
REPX {psrad x, 5}, m7, m5
packssdw m9, m5, m7
pcmpeqw m7, m7
@@ -1989,48 +1989,51 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
%%loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
movd xm5, [grain_lutq+left_offxyq*2]
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1
pinsrw xm5, [grain_lutq+topleft_offxyq*2], 2 ; { left0, left1, top/left }
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2
vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
movu m9, [grain_lutq+offxyq*2]
movu m3, [grain_lutq+offxyq*2+82*2]
movu m8, [grain_lutq+top_offxyq*2]
punpcklwd xm7, xm9, xm3 ; { cur0, cur1 }
punpckldq xm7, xm8 ; { cur0, cur1, top0 }
punpcklwd xm5, xm7 ; { cur/left } interleaved
pmaddwd xm5, [pw_23_22]
vpbroadcastd xm0, [pd_16]
paddd xm5, xm0
psrad xm5, 5
packssdw xm5, xm5
pcmpeqw xm0, xm0
psraw xm7, xm10, 1
pxor xm0, xm7
punpckldq xm7, xm9, xm3 ; { cur0, cur1 }
vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 }
punpcklwd m5, m7 ; { cur/left } interleaved
%if %1
vpbroadcastq m0, [pw_23_22]
pmaddwd m5, m0
vpbroadcastd m0, [pd_16]
paddd m5, m0
%else
pmaddwd m5, m15
paddd m5, m14
%endif
psrad m5, 5
vextracti128 xm0, m5, 1
packssdw xm5, xm0
pcmpeqw m0, m0
psraw m7, m10, 1
pxor m0, m7
pminsw xm5, xm7
pmaxsw xm5, xm0
pcmpeqw xm7, xm7
psrldq xm7, 14 ; 0xffff, 0.....
vpblendvb m9, m5, m7 ; line 0
psrldq xm5, 2
vpblendvb m3, m5, m7 ; line 1
psrldq xm5, 2
vpblendvb m5, m8, m5, m7 ; top line
vpblendd m9, m9, m5, 00000001b
psrldq xm5, 4
vpblendd m3, m3, m5, 00000001b
psrldq xm5, 4
vpblendd m5, m8, m5, 00000001b
punpckhwd m7, m5, m9
punpckhwd m8, m5, m9
punpcklwd m5, m9 ; {top/cur interleaved}
vpbroadcastd m9, [pw_23_22]
REPX {pmaddwd x, m9}, m8, m5
%if %1
REPX {pmaddwd x, [pw_23_22]}, m7, m5
%else
REPX {pmaddwd x, m15}, m7, m5
%endif
vpbroadcastd m9, [pd_16]
REPX {paddd x, m9}, m5, m7
REPX {psrad x, 5}, m5, m7
packssdw m9, m5, m7
pcmpeqw m5, m5
psraw m7, m10, 1
pxor m5, m7
pmaxsw m9, m5
REPX {paddd x, m9}, m5, m8
%else
REPX {paddd x, m14}, m5, m8
%endif
REPX {psrad x, 5}, m5, m8
packssdw m9, m5, m8
pminsw m9, m7
pmaxsw m9, m0
; src
mova m0, [srcq]
+79 -157
View File
@@ -31,8 +31,11 @@ pd_16: times 4 dd 16
pw_1: times 8 dw 1
pw_16384: times 8 dw 16384
pw_8192: times 8 dw 8192
pw_23_22: times 4 dw 23, 22
pw_23_22: dw 23, 22
times 3 dw 0, 32
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
pw_27_17_17_27: dw 27, 17, 17, 27
times 2 dw 0, 32
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
@@ -43,7 +46,6 @@ mul_bits: dw 256, 128, 64, 32, 16
round_vals: dw 32, 64, 128, 256, 512, 1024
max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
min: dw 0, 16*4, 16*16
pw_27_17_17_27: dw 27, 17, 17, 27
; these two should be next to each other
pw_4: times 2 dw 4
pw_16: times 2 dw 16
@@ -96,6 +98,13 @@ SECTION .text
%endrep
%endmacro
; PIC_ptr(a): wraps a reference to the constant label `a`.
;   x86-32:        expands to base+a
;   other targets: expands to plain a
; Seen in this file in forms such as [PIC_ptr(pw_27_17_17_27)+4].
%if ARCH_X86_32
; Remove any existing single-line macro named `base` before defining PIC_ptr.
; NOTE(review): presumably each function %defines `base` relative to its own
; pointer register before PIC_ptr is used — confirm against the full file.
%undef base
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
@@ -1429,13 +1438,6 @@ generate_grain_uv_fn 444, 0, 0
%endif
%endmacro
; PIC_ptr(a): wraps a reference to the constant label `a`.
;   x86-32:        expands to base+a
;   other targets: expands to plain a
%if ARCH_X86_32
; Remove any existing single-line macro named `base` before defining PIC_ptr.
%undef base
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
INIT_XMM ssse3
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
@@ -1520,10 +1522,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
SCRATCH 6, 14, 5
SCRATCH 7, 15, 6
%if !cpuflag(sse4)
pcmpeqw m6, m6
pslldq m6, 4
%endif
mova m6, [base+pw_27_17_17_27] ; for horizontal filter
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
@@ -1672,11 +1671,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
test dword r8m, 2
jz .loop_x_odd
%if ARCH_X86_32
mov r5, r5m
SPLATD m7, [base+pw_27_17_17_27]
add dword [rsp+8*mmsize+1*gprsize], 16
%else
SPLATD m7, [pw_27_17_17_27]
add r12d, 16 ; top_offxy += 16
%endif
jmp .loop_x_odd_v_overlap
@@ -1686,12 +1682,6 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
jz .loop_x
; r8m = sbym
%if ARCH_X86_32
mov r5, r5m
movq m7, [base+pw_27_17_17_27]
%else
movq m7, [pw_27_17_17_27]
%endif
test dword r8m, 2
jnz .loop_x_hv_overlap
@@ -1743,27 +1733,21 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
mov grain_lutq, grain_lutmp
.loop_y_h_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m4, [grain_lutq+offxyq*2]
movu m5, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+0*gprsize]
movd m5, [grain_lutq+r5*2]
movd m4, [grain_lutq+r5*2]
%else
movd m5, [grain_lutq+left_offxyq*2]
%endif
punpcklwd m5, m4
pmaddwd m5, m7
paddd m5, m14
psrad m5, 5
packssdw m5, m5
%if cpuflag(sse4)
pblendw m4, m5, 00000011b
%else
pand m4, m6
pandn m0, m6, m5
por m4, m0
movd m4, [grain_lutq+left_offxyq*2]
%endif
punpcklwd m4, m5
pmaddwd m4, m6
paddd m4, m14
psrad m4, 5
packssdw m4, m4
pminsw m4, m15
pmaxsw m4, m9
shufps m4, m5, q3210
; src
pand m0, m10, [srcq+ 0]
@@ -1822,11 +1806,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
test dword r8m, 2
jz .loop_x_odd
%if ARCH_X86_32
mov r5, r5m
SPLATD m7, [base+pw_27_17_17_27]
add dword [rsp+8*mmsize+1*gprsize], 16
%else
SPLATD m7, [pw_27_17_17_27]
add r12d, 16 ; top_offxy += 16
%endif
jmp .loop_x_odd_v_overlap
@@ -1941,6 +1922,10 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
shr offxyd, 16
.loop_x_odd_v_overlap:
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
mov hd, dword r7m
mov grain_lutq, grain_lutmp
.loop_y_v_overlap:
@@ -2009,18 +1994,16 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
mova [dstq+srcq+ 0], m0
mova [dstq+srcq+16], m1
%if ARCH_X86_32
mov r5, r5m
SPLATD m7, [base+pw_27_17_17_27+4]
%else
SPLATD m7, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
%endif
add srcq, r2mp
add grain_lutq, 82*2
dec hw
jz .end_y_v_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
xor hd, 0x10000
test hd, 0x10000
jnz .loop_y_v_overlap
@@ -2044,11 +2027,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
btc dword r8m, 2
jc .next_blk_v
%if ARCH_X86_32
mov r5, r5m
SPLATD m7, [base+pw_27_17_17_27]
add dword [rsp+8*mmsize+1*gprsize], 16
%else
SPLATD m7, [pw_27_17_17_27]
add top_offxyd, 16
%endif
add offxyd, 16
@@ -2059,20 +2039,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
%if ARCH_X86_32
mov r5, r5m
movq m7, [base+pw_27_17_17_27]
%else
movq m7, [pw_27_17_17_27]
%endif
.loop_x_hv_overlap:
%if ARCH_X86_32
mov r5, r5m
SPLATD m0, [base+pw_27_17_17_27]
mova [rsp+7*mmsize], m0
%define m8 [rsp+7*mmsize]
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
mov r0, [rsp+8*mmsize+1*gprsize]
@@ -2084,8 +2052,6 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
mov seed, r3m
xor r0, r0
%else
SPLATD m8, [pw_27_17_17_27]
; we assume from the block above that bits 8-15 of r7d are zero'ed
%endif
mov r6d, seed
@@ -2139,43 +2105,39 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
%endif
shr offxyd, 16
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
movzx hd, word r7m
mov grain_lutq, grain_lutmp
.loop_y_hv_overlap:
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq*2]
movu m2, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
movu m5, [grain_lutq+r0*2]
movd m4, [grain_lutq+r5*2]
movu m4, [grain_lutq+r0*2]
movd m5, [grain_lutq+r5*2]
mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
movd m2, [grain_lutq+r5*2]
movd m3, [grain_lutq+r5*2]
%else
movu m5, [grain_lutq+top_offxyq*2]
movd m4, [grain_lutq+left_offxyq*2]
movd m2, [grain_lutq+topleft_offxyq*2]
movu m4, [grain_lutq+top_offxyq*2]
movd m5, [grain_lutq+left_offxyq*2]
movd m3, [grain_lutq+topleft_offxyq*2]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklwd m4, m3
punpcklwd m2, m5
REPX {pmaddwd x, m7}, m4, m2
REPX {paddd x, m14}, m4, m2
REPX {psrad x, 5}, m4, m2
REPX {packssdw x, x}, m4, m2
REPX {pminsw x, m15}, m4, m2
REPX {pmaxsw x, m9}, m4, m2
%if cpuflag(sse4)
pblendw m3, m4, 00000011b
pblendw m5, m2, 00000011b
%else
pand m3, m6
pand m5, m6
pandn m0, m6, m4
pandn m1, m6, m2
por m3, m0
por m5, m1
%endif
punpcklwd m5, m2
punpcklwd m3, m4
REPX {pmaddwd x, m6}, m5, m3
REPX {paddd x, m14}, m5, m3
REPX {psrad x, 5}, m5, m3
packssdw m5, m3
pminsw m5, m15
pmaxsw m5, m9
shufps m3, m5, m2, q3210
shufps m5, m4, q3232
; followed by v interpolation (top | cur -> cur)
movu m0, [grain_lutq+offxyq*2+16]
%if ARCH_X86_32
@@ -2187,7 +2149,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
punpckhwd m5, m3
punpcklwd m3, m1, m0
punpckhwd m1, m0
REPX {pmaddwd x, m8}, m2, m5, m3, m1
REPX {pmaddwd x, m7}, m2, m5, m3, m1
REPX {paddd x, m14}, m2, m5, m3, m1
REPX {psrad x, 5}, m2, m5, m3, m1
packssdw m2, m5
@@ -2229,19 +2191,16 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
mova [dstq+srcq+ 0], m0
mova [dstq+srcq+16], m1
%if ARCH_X86_32
mov r5, r5m
SPLATD m0, [base+pw_27_17_17_27+4]
mova m8, m0
%else
SPLATD m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
%endif
add srcq, r2mp
add grain_lutq, 82*2
dec hw
jz .end_y_hv_overlap
; 2 lines get vertical overlap, then fall back to non-overlap code for
; remaining (up to) 30 lines
%if ARCH_X86_32
mov r5, r5m
%endif
SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
xor hd, 0x10000
test hd, 0x10000
jnz .loop_y_hv_overlap
@@ -2257,14 +2216,12 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
jge .end_hv
%if ARCH_X86_32
mov r5, r5m
SPLATD m7, [base+pw_27_17_17_27]
add offxyd, 16
add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
mov srcq, r9mp
add srcq, r4mp
add srcq, r4mp
%else
SPLATD m7, [pw_27_17_17_27]
add offxyd, 16
add top_offxyd, 16
mov src_bakq, r9mp
@@ -2370,12 +2327,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
SCRATCH 4, 12, 4
SCRATCH 5, 13, 5
%if cpuflag(sse4)
pxor m2, m2
%define mzero m2
%else
%define mzero m7
%endif
SPLATD m2, [base+pw_23_22]
%if ARCH_X86_32
mov scalingq, r5m
mov r5m, r5
@@ -2390,11 +2345,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
SCRATCH 0, 8, 0
SCRATCH 1, 9, 1
%if !cpuflag(sse4)
pcmpeqw m2, m2
pslldq m2, 2
%endif
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
@@ -2419,7 +2369,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
pmullw m5, m7
%else
SPLATD m6, [base+pd_16]
SPLATD m5, [base+pw_23_22]
mova m5, [base+pw_23_22]
%endif
SCRATCH 6, 14, 6
@@ -2529,9 +2479,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
mova m1, [srcq+16] ; m0-1: src as word
; luma_src
%if !cpuflag(sse4)
pxor mzero, mzero
%endif
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
@@ -2687,9 +2635,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
mova m1, [srcq+16]
; luma_src
%if !cpuflag(sse4)
pxor mzero, mzero
%endif
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
mov lumaq, r9m
@@ -2744,13 +2690,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
packssdw m5, m5
pmaxsw m5, m8
pminsw m5, m9
%if cpuflag(sse4)
pblendw m5, m7, 11111110b
%else
pand m7, m2
pandn m3, m2, m5
por m5, m7, m3
%endif
shufps m5, m7, q3210
movu m3, [grain_lutq+offxyq*2+16]
; scaling[luma_src]
@@ -2950,14 +2890,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
%endif
punpckhwd m7, m5, m3
punpcklwd m5, m3 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m7, m5
%if %1
%if ARCH_X86_32
mov r5, r5m
%endif
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5
REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
REPX {pmaddwd x, m15}, m7, m5
REPX {paddd x, m14}, m7, m5
%endif
REPX {psrad x, 5}, m7, m5
@@ -2974,11 +2913,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
%endif
punpckhwd m7, m5, m4
punpcklwd m5, m4 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m7, m5
%if %1
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5
REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
%else
REPX {pmaddwd x, m15}, m7, m5
REPX {paddd x, m14}, m7, m5
%endif
REPX {psrad x, 5}, m7, m5
@@ -2991,9 +2929,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
mova m1, [srcq+16]
; luma_src
%if !cpuflag(sse4)
pxor mzero, mzero
%endif
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
@@ -3021,9 +2957,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
REPX {pmaddwd x, m14}, m7, m6
REPX {psrad x, 6}, m7, m6
packssdw m6, m7
%if !cpuflag(sse4)
pxor mzero, mzero
%endif
REPX {paddw x, m15}, m5, m6
REPX {pmaxsw x, mzero}, m5, m6
REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
@@ -3176,52 +3110,45 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
%else
movd m5, [grain_lutq+left_offxyq*2]
%endif
movu m3, [grain_lutq+offxyq*2]
movu m7, [grain_lutq+offxyq*2]
%if ARCH_X86_32
mov r5, [rsp+8*mmsize+2*gprsize]
movu m4, [grain_lutq+r0*2]
pinsrw m5, [grain_lutq+r5*2], 1
pinsrw m5, [grain_lutq+r5*2], 2
%else
movu m4, [grain_lutq+top_offxyq*2]
pinsrw m5, [grain_lutq+topleft_offxyq*2], 1 ; { left, top/left }
pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
%endif
punpcklwd m7, m3, m4 ; { cur0, top0 }
punpcklwd m5, m7 ; { cur/left } interleaved
punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
%if %1
%if ARCH_X86_32
mov r5, r5m
%endif
pmaddwd m5, [PIC_ptr(pw_23_22)]
pshufd m0, [PIC_ptr(pw_23_22)], q1010
%else
pshufd m0, m15, q1010
%endif
pmaddwd m5, m0
%if %1
paddd m5, [PIC_ptr(pd_16)]
%else
pmaddwd m5, m15
paddd m5, m14
%endif
psrad m5, 5
packssdw m5, m5
pmaxsw m5, m8
pminsw m5, m9
%if cpuflag(sse4)
pblendw m3, m5, 00000001b
psrldq m5, 2
pblendw m5, m4, 11111110b
%else
pand m3, m2
pandn m7, m2, m5
por m3, m7
psrldq m5, 2
pand m4, m2
pandn m7, m2, m5
por m5, m4, m7
%endif
shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
shufps m5, m4, q3231 ; top0-7 post-h_filter
punpckhwd m7, m5, m3
punpcklwd m5, m3 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m7, m5
%if %1
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5
REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
%else
REPX {pmaddwd x, m15}, m7, m5
REPX {paddd x, m14}, m5, m7
%endif
REPX {psrad x, 5}, m5, m7
@@ -3238,11 +3165,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
%endif
punpckhwd m1, m0, m4
punpcklwd m0, m4 ; {top/cur interleaved}
REPX {pmaddwd x, m2}, m1, m0
%if %1
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m1, m0
REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
%else
REPX {pmaddwd x, m15}, m1, m0
REPX {paddd x, m14}, m1, m0
%endif
REPX {psrad x, 5}, m1, m0
@@ -3255,9 +3181,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
mova m1, [srcq+16]
; luma_src
%if !cpuflag(sse4)
pxor mzero, mzero
%endif
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
@@ -3285,9 +3209,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
REPX {pmaddwd x, m14}, m7, m5
REPX {psrad x, 6}, m7, m5
packssdw m5, m7
%if !cpuflag(sse4)
pxor mzero, mzero
%endif
REPX {paddw x, m15}, m6, m5
REPX {pmaxsw x, mzero}, m6, m5
REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
+70 -97
View File
@@ -38,7 +38,8 @@ byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pd_m65536: dd ~0xffff
pb_23_22: times 2 db 23, 22
pb_23_22: db 23, 22
times 3 db 0, 32
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
@@ -47,6 +48,7 @@ round_vals: dw 32, 64, 128, 256, 512
max: dw 255, 240, 235
min: dw 0, 16
pb_27_17_17_27: db 27, 17, 17, 27
times 2 db 0, 32
pw_1: dw 1
%macro JMP_TABLE 2-*
@@ -90,6 +92,14 @@ cextern gaussian_sequence
SECTION .text
; REPX {template}, arg1[, arg2, ...]
; Emits `template` once per trailing argument, substituting that argument for
; the placeholder `x` in the template.
; Example: REPX {psrld x, 24}, m8, m4 emits
;   psrld m8, 24
;   psrld m4, 24
%macro REPX 2-*
; %%f is a macro-local single-line macro whose body is the template (%1)
%xdefine %%f(x) %1
; one iteration per argument after the template
%rep %0 - 1
; shift the parameter list left so %1 names the next argument
%rotate 1
%%f(%1)
%endrep
%endmacro
INIT_XMM avx2
cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data
lea r4, [pb_mask]
@@ -1092,12 +1102,12 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
jz .loop_x
; r8m = sbym
movd xm15, [pb_27_17_17_27]
movq xm15, [pb_27_17_17_27]
cmp dword r8m, 0
jne .loop_x_hv_overlap
; horizontal overlap (without vertical overlap)
movd xm14, [pw_1024]
movq xm14, [pw_1024]
.loop_x_h_overlap:
mov r6d, seed
or seed, 0xEFF4
@@ -1156,8 +1166,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
pmaddubsw xm4, xm15, xm4
pmulhrsw xm4, xm14
packsswb xm4, xm4
vpblendw xm4, xm3, 11111110b
vpblendd m3, m4, 00001111b
vpblendd m3, m3, m4, 00000001b
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1329,7 +1338,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
; back to .loop_x_v_overlap, and instead always fall-through to
; h+v overlap
movd xm15, [pb_27_17_17_27]
movq xm15, [pb_27_17_17_27]
.loop_x_hv_overlap:
vpbroadcastw m8, [pb_27_17_17_27]
@@ -1409,10 +1418,8 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
pmulhrsw xm7, xm14
packsswb xm4, xm4
packsswb xm7, xm7
vpblendw xm4, xm3, 11111110b
vpblendw xm7, xm6, 11111110b
vpblendd m3, m4, 00001111b
vpblendd m6, m7, 00001111b
vpblendd m3, m4, 00000001b
vpblendd m6, m7, 00000001b
; followed by v interpolation (top | cur -> cur)
punpckhbw m7, m6, m3
punpcklbw m6, m3
@@ -1463,8 +1470,6 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, h, sby, luma, lstride, uv_pl, is_id
pcmpeqw m10, m10
psrld m10, 24
mov r7d, [fg_dataq+FGData.scaling_shift]
lea r8, [pb_mask]
%define base r8-pb_mask
@@ -1490,10 +1495,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%else
vpbroadcastd m14, [pw_1024]
%if %2
vpbroadcastd m15, [pb_23_22]
vpbroadcastq m15, [pb_23_22]
%else
vpbroadcastd xm15, [pb_27_17_17_27]
vpbroadcastq xm15, [pb_27_17_17_27]
%endif
%endif
%if %3
vpbroadcastw m10, [pb_23_22]
%elif %2
mova m10, [pb_8x_27_17_8x_17_27]
%endif
mov overlapd, [fg_dataq+FGData.overlap_flag]
@@ -1593,16 +1603,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[luma_src]
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m8, [scalingq+m4], m3
vpgatherdd m4, [scalingq+m5], m9
vpgatherdd m8, [scalingq-3+m4], m3
vpgatherdd m4, [scalingq-3+m5], m9
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m5, [scalingq+m6], m3
vpgatherdd m6, [scalingq+m7], m9
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m3
vpgatherdd m6, [scalingq-3+m7], m9
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@@ -1743,16 +1750,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[luma_src]
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m8, [scalingq+m4], m3
vpgatherdd m4, [scalingq+m5], m9
vpgatherdd m8, [scalingq-3+m4], m3
vpgatherdd m4, [scalingq-3+m5], m9
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m5, [scalingq+m6], m3
vpgatherdd m6, [scalingq+m7], m9
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m3
vpgatherdd m6, [scalingq-3+m7], m9
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@@ -1763,7 +1767,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; grain = grain_lut[offy+y][offx+x]
%if %2
%if %1
vpbroadcastd m6, [pb_23_22] ; FIXME
vpbroadcastq m6, [pb_23_22]
%endif
movu xm3, [grain_lutq+offxyq+ 0]
movd xm4, [grain_lutq+left_offxyq+ 0]
@@ -1778,12 +1782,10 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
pmulhrsw m4, m14
%endif
packsswb m4, m4
pcmpeqw m6, m6 ; FIXME
psrldq m6, 15 ; FIXME
vpblendvb m3, m3, m4, m6
vpblendd m3, m3, m4, 00010001b
%else
%if %1
vpbroadcastd xm6, [pb_27_17_17_27]
movq xm6, [pb_27_17_17_27]
%endif
movu m3, [grain_lutq+offxyq]
movd xm4, [grain_lutq+left_offxyq]
@@ -1796,9 +1798,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
pmulhrsw xm4, xm14
%endif
packsswb xm4, xm4
pcmpeqw xm6, xm6
psrldq xm6, 14
vpblendvb m3, m3, m4, m6
vpblendd m3, m3, m4, 00000001b
%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
@@ -1915,7 +1915,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
mov hd, hm
mov grain_lutq, grain_lutmp
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_v_overlap:
; src
@@ -1966,16 +1966,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[luma_src]
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m8, [scalingq+m4], m3
vpgatherdd m4, [scalingq+m5], m9
vpgatherdd m8, [scalingq-3+m4], m3
vpgatherdd m4, [scalingq-3+m5], m9
pcmpeqw m3, m3
pcmpeqw m9, m9
vpgatherdd m5, [scalingq+m6], m3
vpgatherdd m6, [scalingq+m7], m9
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m3
vpgatherdd m6, [scalingq-3+m7], m9
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@@ -1988,7 +1985,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
%if %2
mova m6, [pb_8x_27_17_8x_17_27]
movu xm3, [grain_lutq+offxyq]
movu xm4, [grain_lutq+top_offxyq]
vinserti128 m3, [grain_lutq+offxyq+82], 1
@@ -1999,13 +1995,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%endif
punpckhbw m9, m4, m3
punpcklbw m4, m3
%if %2
pmaddubsw m9, m6, m9
pmaddubsw m4, m6, m4
%else
pmaddubsw m9, m1, m9
pmaddubsw m4, m1, m4
%endif
pmaddubsw m9, m10, m9
pmaddubsw m4, m10, m4
%if %1
pmulhrsw m9, [pw_1024]
pmulhrsw m4, [pw_1024]
@@ -2015,19 +2006,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%endif
packsswb m3, m4, m9
%else
%if %1
vpbroadcastd m6, [pb_23_22]
%endif
movq xm3, [grain_lutq+offxyq]
movq xm4, [grain_lutq+top_offxyq]
vinserti128 m3, [grain_lutq+offxyq+8], 1
vinserti128 m4, [grain_lutq+top_offxyq+8], 1
punpcklbw m4, m3
pmaddubsw m4, m10, m4
%if %1
pmaddubsw m4, m6, m4
pmulhrsw m4, [pw_1024]
%else
pmaddubsw m4, m15, m4
pmulhrsw m4, m14
%endif
packsswb m4, m4
@@ -2084,7 +2071,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%endif
add grain_lutq, 82<<%2
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
btc hd, 16
jnc %%loop_y_v_overlap
%endif
@@ -2139,7 +2126,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
mov hd, hm
mov grain_lutq, grain_lutmp
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_hv_overlap:
; src
@@ -2190,16 +2177,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[src]
pcmpeqw m9, m9
pcmpeqw m3, m3
vpgatherdd m8, [scalingq+m4], m9
vpgatherdd m4, [scalingq+m5], m3
vpgatherdd m8, [scalingq-3+m4], m9
vpgatherdd m4, [scalingq-3+m5], m3
pcmpeqw m9, m9
pcmpeqw m3, m3
vpgatherdd m5, [scalingq+m6], m9
vpgatherdd m6, [scalingq+m7], m3
pand m8, m10
pand m4, m10
pand m5, m10
pand m6, m10
vpgatherdd m5, [scalingq-3+m6], m9
vpgatherdd m6, [scalingq-3+m7], m3
REPX {psrld x, 24}, m8, m4, m5, m6
packusdw m8, m4
packusdw m5, m6
@@ -2212,9 +2196,9 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; grain = grain_lut[offy+y][offx+x]
%if %1
%if %2
vpbroadcastd m9, [pb_23_22]
vpbroadcastq m9, [pb_23_22]
%else
vpbroadcastd xm9, [pb_27_17_17_27]
vpbroadcastq xm9, [pb_27_17_17_27]
%endif
%endif
@@ -2252,7 +2236,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%else
punpcklbw m7, m6
%endif
punpcklwd m4, m7
punpcklqdq m4, m7
%if %1
pmaddubsw m4, m9, m4
pmulhrsw m4, [pw_1024]
@@ -2261,18 +2245,17 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
pmulhrsw m4, m14
%endif
packsswb m4, m4
pcmpeqw m9, m9 ; this is kind of ugly
psrldq m9, 15
vpblendvb m3, m3, m4, m9
psrldq m4, 1
vpblendd m3, m4, 00010001b
psrldq m4, 4
%if %3
shufpd m9, m9, m9, 1110b ; clear upper lane
vpblendd m6, m6, m4, 00000001b
%else
vpblendd m6, m6, m4, 00010001b
%endif
vpblendvb m6, m6, m4, m9
%else
punpcklbw xm4, xm3
punpcklbw xm7, xm6
punpckldq xm4, xm7
punpcklqdq xm4, xm7
%if %1
pmaddubsw xm4, xm9, xm4
pmulhrsw xm4, [pw_1024]
@@ -2281,23 +2264,19 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
pmulhrsw xm4, xm14
%endif
packsswb xm4, xm4
pcmpeqw xm9, xm9 ; this is kind of ugly
psrldq xm9, 14
vpblendvb m3, m3, m4, m9
psrldq xm4, 2
vpblendvb m6, m6, m4, m9
vpblendd m3, m3, m4, 00000001b
psrldq xm4, 4
vpblendd m6, m6, m4, 00000001b
%endif
; followed by v interpolation (top | cur -> cur)
%if %3
vpermq m9, m3, q3120
punpcklbw m6, m9
pmaddubsw m6, m10, m6
%if %1
vpbroadcastd m9, [pb_23_22]
pmaddubsw m6, m9, m6
pmulhrsw m6, [pw_1024]
%else
pmaddubsw m6, m15, m6
pmulhrsw m6, m14
%endif
packsswb m6, m6
@@ -2306,14 +2285,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%else
punpckhbw m9, m6, m3
punpcklbw m6, m3
%if %2
mova m3, [pb_8x_27_17_8x_17_27]
pmaddubsw m9, m3, m9
pmaddubsw m6, m3, m6
%else
pmaddubsw m9, m1, m9
pmaddubsw m6, m1, m6
%endif
pmaddubsw m9, m10, m9
pmaddubsw m6, m10, m6
%if %1
pmulhrsw m9, [pw_1024]
pmulhrsw m6, [pw_1024]
@@ -2373,7 +2346,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jg %%loop_y_h_overlap
%else
je %%end_y_hv_overlap
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
btc hd, 16
jnc %%loop_y_hv_overlap
jmp %%loop_y_h_overlap
+213 -252
View File
@@ -29,14 +29,18 @@
SECTION_RODATA
pw_1024: times 8 dw 1024
pb_27_17_17_27: db 27, 17, 17, 27
times 6 db 0, 32
pb_23_22_h: db 23, 22
times 7 db 0, 32
pb_27_17: times 8 db 27, 17
pb_17_27: times 8 db 17, 27
pb_23_22: times 8 db 23, 22
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
pw_seed_xor: times 2 dw 0xb524
times 2 dw 0x49d8
pb_23_22: times 2 db 23, 22
pb_1: times 4 db 1
hmul_bits: dw 32768, 16384, 8192, 4096
round: dw 2048, 1024, 512
@@ -46,8 +50,6 @@ max: dw 255, 240, 235
min: dw 0, 16
pw_1: dw 1
%define pb_27_17_17_27 pb_17_27 - 2
%macro JMP_TABLE 2-*
%xdefine %1_8bpc_%2_table %%table
%xdefine %%base %1_8bpc_%2_table
@@ -88,6 +90,20 @@ cextern gaussian_sequence
SECTION .text
; REPX {template}, arg1[, arg2, ...]
; Emits `template` once per trailing argument, substituting that argument for
; the placeholder `x` in the template.
; Example: REPX {psrlw x, 8}, m4, m5 emits
;   psrlw m4, 8
;   psrlw m5, 8
%macro REPX 2-*
; %%f is a macro-local single-line macro whose body is the template (%1)
%xdefine %%f(x) %1
; one iteration per argument after the template
%rep %0 - 1
; shift the parameter list left so %1 names the next argument
%rotate 1
%%f(%1)
%endrep
%endmacro
; PIC_ptr(a): wraps a reference to the constant label `a`.
;   x86-32:        expands to base+a
;   other targets: expands to plain a
; NOTE(review): `base` is %defined inside the functions (e.g. as r5-pb_mask
; after LEA r5, pb_mask); confirm every x86-32 use site has it in scope.
%if ARCH_X86_32
%define PIC_ptr(a) base+a
%else
%define PIC_ptr(a) a
%endif
%macro SCRATCH 3
%if ARCH_X86_32
mova [rsp+%3*mmsize], m%1
@@ -1284,7 +1300,7 @@ INIT_XMM ssse3
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
dst, src, scaling, unused1, fg_data, picptr, unused2
; copy stack arguments to new position post-alignment, so that we
; don't have to keep the old stack location in a separate register
@@ -1295,29 +1311,29 @@ cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
mov r4, r7m
mov r5, r8m
mov [rsp+6*mmsize+ 3*gprsize], r0
mov [rsp+6*mmsize+ 5*gprsize], r1
mov [rsp+6*mmsize+ 7*gprsize], r2
mov [rsp+6*mmsize+ 9*gprsize], r3
mov [rsp+6*mmsize+10*gprsize], r4
mov [rsp+6*mmsize+11*gprsize], r5
mov [rsp+5*mmsize+ 4*gprsize], r0
mov [rsp+5*mmsize+ 6*gprsize], r1
mov [rsp+5*mmsize+ 8*gprsize], r2
mov [rsp+5*mmsize+10*gprsize], r3
mov [rsp+5*mmsize+11*gprsize], r4
mov [rsp+5*mmsize+12*gprsize], r5
%else
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
dst, src, scaling, unused1, fg_data, picptr, unused2
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
%if STACK_ALIGNMENT < mmsize
%define r0m [rsp+6*mmsize+ 3*gprsize]
%define r1m [rsp+6*mmsize+ 4*gprsize]
%define r2m [rsp+6*mmsize+ 5*gprsize]
%define r3m [rsp+6*mmsize+ 6*gprsize]
%define r4m [rsp+6*mmsize+ 7*gprsize]
%define r5m [rsp+6*mmsize+ 8*gprsize]
%define r6m [rsp+6*mmsize+ 9*gprsize]
%define r7m [rsp+6*mmsize+10*gprsize]
%define r8m [rsp+6*mmsize+11*gprsize]
%define r0m [rsp+5*mmsize+ 4*gprsize]
%define r1m [rsp+5*mmsize+ 5*gprsize]
%define r2m [rsp+5*mmsize+ 6*gprsize]
%define r3m [rsp+5*mmsize+ 7*gprsize]
%define r4m [rsp+5*mmsize+ 8*gprsize]
%define r5m [rsp+5*mmsize+ 9*gprsize]
%define r6m [rsp+5*mmsize+10*gprsize]
%define r7m [rsp+5*mmsize+11*gprsize]
%define r8m [rsp+5*mmsize+12*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
@@ -1330,8 +1346,6 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
mov r6d, [fg_dataq+FGData.scaling_shift]
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
pcmpeqw m2, m2
psrldq m2, 14
movd m4, [base+max+r6*4]
movd m5, [base+min+r6*2]
punpcklwd m3, m3
@@ -1340,10 +1354,9 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
pshufd m3, m3, q0000
pshufd m4, m4, q0000
pshufd m5, m5, q0000
SCRATCH 2, 10, 0
SCRATCH 3, 11, 1
SCRATCH 4, 12, 2
SCRATCH 5, 13, 3
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2
%if ARCH_X86_32
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
@@ -1356,9 +1369,9 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
test overlapd, overlapd
jz .no_vertical_overlap
mova m6, [base+pw_1024]
movd m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 4
SCRATCH 7, 15, 5
mova m7, [base+pb_27_17_17_27]
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
test sbyd, sbyd
jnz .vertical_overlap
; fall-through
@@ -1445,16 +1458,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
; scaling[src]
; look up one scaling byte per source pixel; m0/m1 carry the indices and the
; results land in m4/m5, one value per 16-bit lane
; NOTE(review): each gather appears twice, once with base scalingq and once
; with base scalingq-1; this looks like the before/after sides of a patch.
; assuming vpgatherdw (macro not visible here) loads a 16-bit word at
; base+index: with base scalingq the wanted byte is the low byte of the word,
; with base scalingq-1 it is the high byte - confirm against the macro
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
; low-byte extraction: all-ones shifted right by 8 gives 0x00ff in every word,
; and the pand keeps only the low byte of each gathered word
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
; high-byte extraction: shifting each word right by 8 leaves its high byte,
; zero-extended (REPX is defined elsewhere; presumably it repeats the
; instruction for every listed register)
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
@@ -1504,7 +1514,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
@@ -1525,7 +1535,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
add offxyd, 16 ; left_offxyd
mov [rsp+6*mmsize+0*gprsize], offxyd
mov [rsp+5*mmsize+0*gprsize], offxyd
DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
@@ -1578,21 +1588,18 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+0*gprsize]
mov r5, [rsp+5*mmsize+0*gprsize]
movd m7, [grain_lutq+r5]
%else
movd m7, [grain_lutq+left_offxyq]
@@ -1601,9 +1608,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
; horizontal overlap: filter the leading grain bytes, merge them back into the
; current grain row (m3), then widen the row to signed words
; m6 = per-word sum of adjacent byte products: unsigned bytes of m15 (weights)
; times signed bytes of m7
pmaddubsw m6, m15, m7
; m14 is presumably pw_1024 (set up via SCRATCH elsewhere in this listing);
; pmulhrsw by 1024 is a rounded arithmetic shift right by 5
pmulhrsw m6, m14
; saturate the words back to signed bytes (same 8 bytes in both halves)
packsswb m6, m6
; mask select: m6 = (m6 & m10) | (~m10 & m3), i.e. filtered bytes where the
; mask in m10 is set and unfiltered grain everywhere else
pand m6, m10
pandn m7, m10, m3
por m6, m7
; dword select: q3210 keeps the low 64 bits of m6 and takes the high 64 bits
; from m3
; NOTE(review): the mask select above and this shufps both merge the filtered
; bytes with m3; they look like the before/after sides of a patch - confirm
; which one is live
shufps m6, m3, q3210
; per-byte compare m2 > m6; with m2 == 0 this yields 0xff for negative grain
; bytes (assumes m2 was zeroed just before this fragment - not visible here)
pcmpgtb m2, m6
; interleave grain bytes with the compare result: low half -> words in m7,
; high half -> words in m6 (a sign extension when m2 holds the sign mask)
punpcklbw m7, m6, m2
punpckhbw m6, m2
@@ -1649,7 +1654,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
test dword r8m, 2 ; have_top_overlap
jz .loop_x_odd
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add r11d, 16 ; top_offxyd
%endif
@@ -1754,7 +1759,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
movzx top_offxyd, offxyw
%if ARCH_X86_32
mov [rsp+6*mmsize+1*gprsize], top_offxyd
mov [rsp+5*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
%endif
@@ -1764,7 +1769,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+8], r5
mov [rsp+5*mmsize+12], r5
%else
mova m8, [pb_27_17]
%endif
@@ -1779,21 +1784,18 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
; scaling[src]
%if ARCH_X86_32
vpgatherdw m4, m0, scalingq, r0, r5, m3
vpgatherdw m5, m1, scalingq, r0, r5, m3
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
%else
vpgatherdw m4, m0, scalingq, r12, r13, m3
vpgatherdw m5, m1, scalingq, r12, r13, m3
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
%endif
pcmpeqw m3, m3
psrlw m3, 8
pand m4, m3
pand m5, m3
REPX {psrlw x, 8}, m4, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+1*gprsize]
mov r5, [rsp+5*mmsize+1*gprsize]
movu m7, [grain_lutq+r5]
%else
movu m7, [grain_lutq+top_offxyq]
@@ -1801,7 +1803,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
punpckhbw m6, m7, m3
punpcklbw m7, m3
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+8]
mov r5, [rsp+5*mmsize+12]
pmaddubsw m3, [r5], m6
pmaddubsw m6, [r5], m7
%else
@@ -1833,7 +1835,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
mova [dstq+srcq], m0
%if ARCH_X86_32
add dword [rsp+5*mmsize+8], mmsize
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
@@ -1864,7 +1866,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
jc .loop_x_hv_overlap
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@@ -1874,16 +1876,16 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
%if ARCH_X86_32
mov r5, r5m
lea r5, [base+pb_27_17]
mov [rsp+5*mmsize+8], r5
mov [rsp+5*mmsize+12], r5
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
mov r5, [rsp+6*mmsize+1*gprsize]
mov r5, [rsp+5*mmsize+1*gprsize]
mov r4, offxyd
add r5, 16
add r4, 16
mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy
mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
@@ -1937,7 +1939,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
movzx r5, offxyw ; top_offxy
mov [rsp+6*mmsize+1*gprsize], r5
mov [rsp+5*mmsize+1*gprsize], r5
%else
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy
@@ -1952,10 +1954,10 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy
mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
movu m6, [grain_lutq+r5]
mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
movd m4, [grain_lutq+r0]
movd m7, [grain_lutq+r5]
%else
@@ -1972,17 +1974,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
pmulhrsw m4, m14
packsswb m2, m2
packsswb m4, m4
pand m2, m10
pand m4, m10
pandn m7, m10, m3
pandn m3, m10, m6
por m7, m2
por m3, m4
shufps m2, m3, q3210
shufps m4, m6, q3210
; followed by v interpolation (top | cur -> cur)
punpckhbw m4, m3, m7
punpcklbw m3, m7
punpcklbw m3, m4, m2
punpckhbw m4, m2
%if ARCH_X86_32
mov r5, [rsp+5*mmsize+8]
mov r5, [rsp+5*mmsize+12]
pmaddubsw m7, [r5], m4
pmaddubsw m4, [r5], m3
%else
@@ -2004,16 +2002,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
; scaling[src]
%if ARCH_X86_32
vpgatherdw m5, m0, scalingq, r0, r5, m7
vpgatherdw m6, m1, scalingq, r0, r5, m7
vpgatherdw m5, m0, scalingq-1, r0, r5, m7
vpgatherdw m6, m1, scalingq-1, r0, r5, m7
%else
vpgatherdw m5, m0, scalingq, r13, r14, m7
vpgatherdw m6, m1, scalingq, r13, r14, m7
vpgatherdw m5, m0, scalingq-1, r13, r14, m7
vpgatherdw m6, m1, scalingq-1, r13, r14, m7
%endif
pcmpeqw m7, m7
psrlw m7, 8
pand m5, m7
pand m6, m7
REPX {psrlw x, 8}, m5, m6
; noise = round2(scaling[src] * grain, scaling_shift)
pmullw m3, m5
@@ -2033,7 +2028,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
mova [dstq+srcq], m0
%if ARCH_X86_32
add dword [rsp+5*mmsize+8], mmsize
add dword [rsp+5*mmsize+12], mmsize
%else
mova m8, [pb_17_27]
%endif
@@ -2063,7 +2058,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
xor dword r8m, 4
add offxyd, 16
%if ARCH_X86_32
add dword [rsp+6*mmsize+1*gprsize], 16
add dword [rsp+5*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@@ -2079,49 +2074,49 @@ INIT_XMM ssse3
; sby, luma, lstride, uv_pl, is_id)
%if STACK_ALIGNMENT < mmsize
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
tmp, src, scaling, h, fg_data, picptr, unused
mov r0, r0m
mov r1, r2m
mov r2, r4m
mov r3, r6m
mov r4, r7m
mov [rsp+8*mmsize+3*gprsize], r0
mov [rsp+8*mmsize+5*gprsize], r1
mov [rsp+8*mmsize+7*gprsize], r2
mov [rsp+8*mmsize+9*gprsize], r3
mov [rsp+8*mmsize+10*gprsize], r4
mov [rsp+7*mmsize+3*gprsize], r0
mov [rsp+7*mmsize+5*gprsize], r1
mov [rsp+7*mmsize+7*gprsize], r2
mov [rsp+7*mmsize+9*gprsize], r3
mov [rsp+7*mmsize+10*gprsize], r4
mov r0, r8m
mov r1, r9m
mov r2, r10m
mov r4, r11m
mov r3, r12m
mov [rsp+8*mmsize+11*gprsize], r0
mov [rsp+8*mmsize+12*gprsize], r1
mov [rsp+8*mmsize+13*gprsize], r2
mov [rsp+8*mmsize+14*gprsize], r4
mov [rsp+7*mmsize+11*gprsize], r0
mov [rsp+7*mmsize+12*gprsize], r1
mov [rsp+7*mmsize+13*gprsize], r2
mov [rsp+7*mmsize+14*gprsize], r4
%else
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
tmp, src, scaling, h, fg_data, picptr, unused
%endif
mov srcq, srcm
mov fg_dataq, r3m
mov scalingq, r5m
; when STACK_ALIGNMENT < mmsize, make the argument names r0m..r12m refer to
; consecutive gprsize-wide slots on the local stack frame (slots 3..15)
; NOTE(review): two layouts for the same thirteen names follow back to back;
; this looks like the before/after sides of a patch - confirm which is live
%if STACK_ALIGNMENT < mmsize
; layout based at rsp+8*mmsize
%define r0m [rsp+8*mmsize+ 3*gprsize]
%define r1m [rsp+8*mmsize+ 4*gprsize]
%define r2m [rsp+8*mmsize+ 5*gprsize]
%define r3m [rsp+8*mmsize+ 6*gprsize]
%define r4m [rsp+8*mmsize+ 7*gprsize]
%define r5m [rsp+8*mmsize+ 8*gprsize]
%define r6m [rsp+8*mmsize+ 9*gprsize]
%define r7m [rsp+8*mmsize+10*gprsize]
%define r8m [rsp+8*mmsize+11*gprsize]
%define r9m [rsp+8*mmsize+12*gprsize]
%define r10m [rsp+8*mmsize+13*gprsize]
%define r11m [rsp+8*mmsize+14*gprsize]
%define r12m [rsp+8*mmsize+15*gprsize]
; layout based at rsp+7*mmsize: same slot indices, one mmsize less in front
%define r0m [rsp+7*mmsize+ 3*gprsize]
%define r1m [rsp+7*mmsize+ 4*gprsize]
%define r2m [rsp+7*mmsize+ 5*gprsize]
%define r3m [rsp+7*mmsize+ 6*gprsize]
%define r4m [rsp+7*mmsize+ 7*gprsize]
%define r5m [rsp+7*mmsize+ 8*gprsize]
%define r6m [rsp+7*mmsize+ 9*gprsize]
%define r7m [rsp+7*mmsize+10*gprsize]
%define r8m [rsp+7*mmsize+11*gprsize]
%define r9m [rsp+7*mmsize+12*gprsize]
%define r10m [rsp+7*mmsize+13*gprsize]
%define r11m [rsp+7*mmsize+14*gprsize]
%define r12m [rsp+7*mmsize+15*gprsize]
%endif
LEA r5, pb_mask
%define base r5-pb_mask
@@ -2133,7 +2128,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%define base r8-pb_mask
%endif
mov r6d, [fg_dataq+FGData.scaling_shift]
pcmpeqw m2, m2
movd m3, [base+mul_bits+r6*2-14]
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
lea tmpd, [r6d*2]
@@ -2145,17 +2139,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
movd m5, [base+min+r6*2]
cmovne r6d, tmpd
movd m4, [base+max+r6*2]
psrldq m2, 14+%2
punpcklwd m3, m3
punpcklwd m5, m5
punpcklwd m4, m4
pshufd m3, m3, q0000
pshufd m5, m5, q0000
pshufd m4, m4, q0000
SCRATCH 2, 10, 0
SCRATCH 3, 11, 1
SCRATCH 4, 12, 2
SCRATCH 5, 13, 3
SCRATCH 3, 11, 0
SCRATCH 4, 12, 1
SCRATCH 5, 13, 2
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
@@ -2177,8 +2169,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
punpcklwd m7, m7
pshufd m6, m6, q0000
pshufd m7, m7, q0000
SCRATCH 6, 14, 4
SCRATCH 7, 15, 5
SCRATCH 6, 14, 3
SCRATCH 7, 15, 4
%endif
mov sbyd, r8m
@@ -2187,22 +2179,21 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jz %%no_vertical_overlap
%if ARCH_X86_32
%if %2
movd m1, [base+pb_23_22]
mova m1, [base+pb_23_22_h]
%else
movd m1, [base+pb_27_17_17_27]
mova m1, [base+pb_27_17_17_27]
%endif
mova m0, [base+pw_1024]
%else
%if %2
movd m1, [pb_23_22]
mova m1, [pb_23_22_h]
%else
movd m1, [pb_27_17_17_27]
mova m1, [pb_27_17_17_27]
%endif
mova m0, [pw_1024]
%endif
pshufd m1, m1, q0000
SCRATCH 0, 8, 6
SCRATCH 1, 9, 7
SCRATCH 0, 8, 5
SCRATCH 1, 9, 6
test sbyd, sbyd
jnz %%vertical_overlap
; fall-through
@@ -2347,16 +2338,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
REPX {psrlw x, 8}, m7, m5
; unpack chroma_source
punpckhbw m1, m0, m2
@@ -2426,7 +2414,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%if %2 == 0
; adjust top_offxy
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
@@ -2450,9 +2438,9 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%if ARCH_X86_32
%if %2
lea r6, [offxyd+16]
mov [rsp+8*mmsize+0*gprsize], r6
mov [rsp+7*mmsize+0*gprsize], r6
%else
mov [rsp+8*mmsize+0*gprsize], offxyd
mov [rsp+7*mmsize+0*gprsize], offxyd
%endif
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
@@ -2558,36 +2546,31 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m1, m1
psrlw m1, 8
pand m7, m1
pand m5, m1
REPX {psrlw x, 8}, m7, m5
; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq+ 0]
movu m4, [grain_lutq+offxyq+ 0]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+0*gprsize]
movd m4, [grain_lutq+r0+ 0]
mov r0, [rsp+7*mmsize+0*gprsize]
movd m2, [grain_lutq+r0+ 0]
%else
movd m4, [grain_lutq+left_offxyq+ 0]
movd m2, [grain_lutq+left_offxyq+ 0]
%endif
punpcklbw m2, m4, m3
pmaddubsw m4, m9, m2
pmulhrsw m4, m8
packsswb m4, m4
pand m4, m10
pandn m2, m10, m3
por m3, m4, m2
punpcklbw m2, m4
pmaddubsw m3, m9, m2
pmulhrsw m3, m8
packsswb m3, m3
shufps m3, m4, q3210
pxor m4, m4
pcmpgtb m4, m3
punpcklbw m2, m3, m4
@@ -2652,7 +2635,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
xor dword r8m, 4
; adjust top_offxyd
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add r11d, 16
%endif
@@ -2780,7 +2763,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
mov [rsp+7*mmsize+1*gprsize], top_offxyd
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
%endif
@@ -2790,9 +2773,11 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_27_17]
%endif
%if %3
mova m1, [PIC_ptr(pb_23_22)]
%else
mova m1, [pb_27_17]
mova m1, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_v_overlap:
%if ARCH_X86_32
@@ -2848,34 +2833,26 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[luma_src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
vpgatherdw m7, m4, scalingq, r12, r2
vpgatherdw m5, m6, scalingq, r12, r2
vpgatherdw m7, m4, scalingq-1, r12, r2
vpgatherdw m5, m6, scalingq-1, r12, r2
%endif
pcmpeqw m4, m4
psrlw m4, 8
pand m7, m4
pand m5, m4
REPX {psrlw x, 8}, m7, m5
; grain = grain_lut[offy+y][offx+x]
movu m3, [grain_lutq+offxyq]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+1*gprsize]
mov r0, [rsp+7*mmsize+1*gprsize]
movu m4, [grain_lutq+r0]
%else
movu m4, [grain_lutq+top_offxyq]
%endif
punpckhbw m6, m4, m3
punpcklbw m4, m3
%if %3
pmaddubsw m2, m9, m6
pmaddubsw m3, m9, m4
%else
pmaddubsw m2, m1, m6
pmaddubsw m3, m1, m4
%endif
pmulhrsw m2, m8
pmulhrsw m3, m8
packsswb m3, m2
@@ -2928,10 +2905,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
btc hd, 16
%if ARCH_X86_32
mov r5, r5m
mova m1, [base+pb_17_27]
%else
mova m1, [pb_17_27]
%endif
mova m1, [PIC_ptr(pb_17_27)]
jnc %%loop_y_v_overlap
%endif
jmp %%loop_y
@@ -2963,7 +2938,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; h+v overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif
@@ -2976,15 +2951,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
%if ARCH_X86_32
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
mov r6, [rsp+8*mmsize+1*gprsize]
mov r6, [rsp+7*mmsize+1*gprsize]
%if %2
lea r0, [r3d+16]
add r6, 16
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
%else
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
%endif
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
@@ -3048,18 +3023,55 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
movzx top_offxyd, offxyw
shr offxyd, 16
%if ARCH_X86_32
mov [rsp+8*mmsize+1*gprsize], top_offxyd
mov [rsp+7*mmsize+1*gprsize], top_offxyd
%endif
mov hd, r7m
mov grain_lutq, grain_lutmp
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_27_17]
%endif
%if %3
mova m3, [PIC_ptr(pb_23_22)]
%else
mova m3, [pb_27_17]
mova m3, [PIC_ptr(pb_27_17)]
%endif
%%loop_y_hv_overlap:
; build one row of grain for a block that overlaps both its left and its top
; neighbour: blend horizontally first, then vertically; the finished row of
; grain bytes ends up in m1
; register roles below: m9 = horizontal weights and m8 = pmulhrsw multiplier
; (both prepared before this loop; presumably m8 is pw_1024, which makes each
; pmulhrsw a rounded shift right by 5), m3 = vertical weights for this row
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
; the neighbour offsets are kept in stack slots here; r0/r5 serve as temporaries
mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
%else
movd m1, [grain_lutq+topleft_offxyq]
%endif
; m1 = 4 bytes of top-left grain, m2 = current row, m6 = top row,
; m4 = 4 bytes of left grain
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
; pair every neighbour byte with the byte it is blended into
punpcklbw m1, m6
punpcklbw m4, m2
; weighted pair sums: m0 = filtered top row, m1 = filtered current row (words)
pmaddubsw m0, m9, m1
pmaddubsw m1, m9, m4
REPX {pmulhrsw x, m8}, m0, m1
; m0 = filtered top bytes in the low 64 bits, filtered cur bytes in the high 64
packsswb m0, m1
; m4 = filtered cur (high half of m0) followed by the untouched high half of m2
shufps m4, m0, m2, q3232
; m0 = filtered top (its own low half) followed by the untouched high half of m6
shufps m0, m6, q3210
; followed by v interpolation (top | cur -> cur)
; interleave top/cur bytes: low 8 pairs in m2, high 8 pairs in m0
punpcklbw m2, m0, m4
punpckhbw m0, m4
; weighted pair sums with the vertical weights, rounded like the h pass
pmaddubsw m4, m3, m0
pmaddubsw m1, m3, m2
pmulhrsw m4, m8
pmulhrsw m1, m8
; m1 = blended grain row as signed bytes (low pairs first, then high pairs)
packsswb m1, m4
; src
%if ARCH_X86_32
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@@ -3116,69 +3128,20 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
; scaling[src]
%if ARCH_X86_32
vpgatherdw m7, m4, scalingq, r0, r5
vpgatherdw m5, m6, scalingq, r0, r5
vpgatherdw m7, m4, scalingq-1, r0, r5
vpgatherdw m5, m6, scalingq-1, r0, r5
%else
movd m1, [grain_lutq+topleft_offxyq]
%if %3
vpgatherdw m7, m4, scalingq, r2, r12
vpgatherdw m5, m6, scalingq, r2, r12
vpgatherdw m7, m4, scalingq-1, r2, r12
vpgatherdw m5, m6, scalingq-1, r2, r12
%else
vpgatherdw m7, m4, scalingq, r2, r13
vpgatherdw m5, m6, scalingq, r2, r13
vpgatherdw m7, m4, scalingq-1, r2, r13
vpgatherdw m5, m6, scalingq-1, r2, r13
%endif
%endif
pcmpeqw m2, m2
psrlw m2, 8
pand m7, m2
pand m5, m2
REPX {psrlw x, 8}, m7, m5
; grain = grain_lut[offy+y][offx+x]
%if ARCH_X86_32
mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
movd m1, [grain_lutq+r0]
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
%endif
movu m2, [grain_lutq+offxyq]
%if ARCH_X86_32
movu m6, [grain_lutq+r5]
movd m4, [grain_lutq+r0]
%else
movu m6, [grain_lutq+top_offxyq]
movd m4, [grain_lutq+left_offxyq]
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
punpcklbw m1, m6
punpcklbw m4, m2
%if %2
punpcklwd m4, m1
%else
punpckldq m4, m1
%endif
pmaddubsw m1, m9, m4
pmulhrsw m1, m8
packsswb m1, m1
pandn m4, m10, m2
pandn m2, m10, m6
psrldq m6, m1, 2-%2
pand m1, m10
pand m6, m10
por m4, m1
por m2, m6
; followed by v interpolation (top | cur -> cur)
punpckhbw m1, m2, m4
punpcklbw m2, m4
%if %3
pmaddubsw m4, m9, m1
pmaddubsw m1, m9, m2
%else
pmaddubsw m4, m3, m1
pmaddubsw m1, m3, m2
%endif
pmulhrsw m4, m8
pmulhrsw m1, m8
packsswb m1, m4
; unpack grain
pxor m4, m4
pcmpgtb m4, m1
punpcklbw m2, m1, m4
@@ -3229,10 +3192,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jle %%end_y_hv_overlap
%if ARCH_X86_32
mov r5, r5m
mova m3, [base+pb_17_27]
%else
mova m3, [pb_17_27]
%endif
mova m3, [PIC_ptr(pb_17_27)]
btc hd, 16
jnc %%loop_y_hv_overlap
%if ARCH_X86_64
@@ -3268,7 +3229,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
jmp %%loop_x_hv_overlap
%else
%if ARCH_X86_32
add dword [rsp+8*mmsize+1*gprsize], 16
add dword [rsp+7*mmsize+1*gprsize], 16
%else
add top_offxyd, 16
%endif