mirror of
https://code.videolan.org/videolan/dav1d
synced 2026-06-11 04:03:05 +00:00
x86/filmgrain: simplify post-horizontal filter blending
This commit is contained in:
@@ -29,8 +29,6 @@
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION_RODATA 32
|
||||
pw_1024: times 16 dw 1024
|
||||
pw_23_22: times 8 dw 23, 22
|
||||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
|
||||
pw_seed_xor: times 2 dw 0xb524
|
||||
@@ -48,6 +46,7 @@ pw_27_17_17_27: dw 27, 17, 17, 27
|
||||
; these two should be next to each other
|
||||
pw_4: times 2 dw 4
|
||||
pw_16: times 2 dw 16
|
||||
pw_23_22: dw 23, 22, 0, 32
|
||||
|
||||
%macro JMP_TABLE 1-*
|
||||
%xdefine %1_table %%table
|
||||
@@ -1480,8 +1479,8 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
vpbroadcastd m9, [base+pw_4+r9*4]
|
||||
pmullw m15, m9
|
||||
%else
|
||||
vpbroadcastd m14, [pw_1024]
|
||||
vpbroadcastd m15, [pw_23_22]
|
||||
vpbroadcastd m14, [pd_16]
|
||||
vpbroadcastq m15, [pw_23_22]
|
||||
%endif
|
||||
|
||||
movifnidn sbyd, sbym
|
||||
@@ -1689,16 +1688,18 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
movu m9, [grain_lutq+offxyq*2]
|
||||
movu m3, [grain_lutq+offxyq*2+82*2]
|
||||
movd xm5, [grain_lutq+left_offxyq*2+ 0]
|
||||
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1 ; {left0, left1}
|
||||
punpcklwd xm7, xm9, xm3 ; {cur0, cur1}
|
||||
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2 ; {left0, left1}
|
||||
punpckldq xm7, xm9, xm3 ; {cur0, cur1}
|
||||
punpcklwd xm5, xm7 ; {left0, cur0, left1, cur1}
|
||||
%if %1
|
||||
pmaddwd xm5, [pw_23_22]
|
||||
%else
|
||||
pmaddwd xm5, xm15
|
||||
%endif
|
||||
vpbroadcastq xm8, [pw_23_22]
|
||||
pmaddwd xm5, xm8
|
||||
vpbroadcastd xm8, [pd_16]
|
||||
paddd xm5, xm8
|
||||
%else
|
||||
pmaddwd xm5, xm15
|
||||
paddd xm5, xm14
|
||||
%endif
|
||||
psrad xm5, 5
|
||||
packssdw xm5, xm5
|
||||
pcmpeqw xm8, xm8
|
||||
@@ -1706,11 +1707,9 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
pxor xm8, xm7
|
||||
pmaxsw xm5, xm8
|
||||
pminsw xm5, xm7
|
||||
vpblendw xm7, xm5, xm9, 11111110b
|
||||
psrldq xm5, 2
|
||||
vpblendw xm5, xm3, 11111110b
|
||||
vpblendd m9, m7, 00001111b
|
||||
vpblendd m3, m5, 00001111b
|
||||
vpblendd m9, m9, m5, 00000001b
|
||||
psrldq xm5, 4
|
||||
vpblendd m3, m3, m5, 00000001b
|
||||
|
||||
; scaling[luma_src]
|
||||
punpckhwd m5, m4, m2
|
||||
@@ -1875,13 +1874,14 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
movu m5, [grain_lutq+top_offxyq*2]
|
||||
punpckhwd m7, m5, m9
|
||||
punpcklwd m5, m9 ; {top/cur interleaved}
|
||||
vpbroadcastd m3, [pw_23_22]
|
||||
REPX {pmaddwd x, m3}, m7, m5
|
||||
%if %1
|
||||
REPX {pmaddwd x, [pw_23_22]}, m7, m5
|
||||
%else
|
||||
REPX {pmaddwd x, m15}, m7, m5
|
||||
%endif
|
||||
vpbroadcastd m3, [pd_16]
|
||||
REPX {paddd x, m3}, m7, m5
|
||||
%else
|
||||
REPX {paddd x, m14}, m7, m5
|
||||
%endif
|
||||
REPX {psrad x, 5}, m7, m5
|
||||
packssdw m9, m5, m7
|
||||
pcmpeqw m7, m7
|
||||
@@ -1989,48 +1989,51 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
%%loop_y_hv_overlap:
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movd xm5, [grain_lutq+left_offxyq*2]
|
||||
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 1
|
||||
pinsrw xm5, [grain_lutq+topleft_offxyq*2], 2 ; { left0, left1, top/left }
|
||||
pinsrw xm5, [grain_lutq+left_offxyq*2+82*2], 2
|
||||
vinserti128 m5, [grain_lutq+topleft_offxyq*2], 1 ; { left0, left1, top/left }
|
||||
movu m9, [grain_lutq+offxyq*2]
|
||||
movu m3, [grain_lutq+offxyq*2+82*2]
|
||||
movu m8, [grain_lutq+top_offxyq*2]
|
||||
punpcklwd xm7, xm9, xm3 ; { cur0, cur1 }
|
||||
punpckldq xm7, xm8 ; { cur0, cur1, top0 }
|
||||
punpcklwd xm5, xm7 ; { cur/left } interleaved
|
||||
pmaddwd xm5, [pw_23_22]
|
||||
vpbroadcastd xm0, [pd_16]
|
||||
paddd xm5, xm0
|
||||
psrad xm5, 5
|
||||
packssdw xm5, xm5
|
||||
pcmpeqw xm0, xm0
|
||||
psraw xm7, xm10, 1
|
||||
pxor xm0, xm7
|
||||
punpckldq xm7, xm9, xm3 ; { cur0, cur1 }
|
||||
vinserti128 m7, xm8, 1 ; { cur0, cur1, top0 }
|
||||
punpcklwd m5, m7 ; { cur/left } interleaved
|
||||
%if %1
|
||||
vpbroadcastq m0, [pw_23_22]
|
||||
pmaddwd m5, m0
|
||||
vpbroadcastd m0, [pd_16]
|
||||
paddd m5, m0
|
||||
%else
|
||||
pmaddwd m5, m15
|
||||
paddd m5, m14
|
||||
%endif
|
||||
psrad m5, 5
|
||||
vextracti128 xm0, m5, 1
|
||||
packssdw xm5, xm0
|
||||
pcmpeqw m0, m0
|
||||
psraw m7, m10, 1
|
||||
pxor m0, m7
|
||||
pminsw xm5, xm7
|
||||
pmaxsw xm5, xm0
|
||||
pcmpeqw xm7, xm7
|
||||
psrldq xm7, 14 ; 0xffff, 0.....
|
||||
vpblendvb m9, m5, m7 ; line 0
|
||||
psrldq xm5, 2
|
||||
vpblendvb m3, m5, m7 ; line 1
|
||||
psrldq xm5, 2
|
||||
vpblendvb m5, m8, m5, m7 ; top line
|
||||
vpblendd m9, m9, m5, 00000001b
|
||||
psrldq xm5, 4
|
||||
vpblendd m3, m3, m5, 00000001b
|
||||
psrldq xm5, 4
|
||||
vpblendd m5, m8, m5, 00000001b
|
||||
|
||||
punpckhwd m7, m5, m9
|
||||
punpckhwd m8, m5, m9
|
||||
punpcklwd m5, m9 ; {top/cur interleaved}
|
||||
vpbroadcastd m9, [pw_23_22]
|
||||
REPX {pmaddwd x, m9}, m8, m5
|
||||
%if %1
|
||||
REPX {pmaddwd x, [pw_23_22]}, m7, m5
|
||||
%else
|
||||
REPX {pmaddwd x, m15}, m7, m5
|
||||
%endif
|
||||
vpbroadcastd m9, [pd_16]
|
||||
REPX {paddd x, m9}, m5, m7
|
||||
REPX {psrad x, 5}, m5, m7
|
||||
packssdw m9, m5, m7
|
||||
pcmpeqw m5, m5
|
||||
psraw m7, m10, 1
|
||||
pxor m5, m7
|
||||
pmaxsw m9, m5
|
||||
REPX {paddd x, m9}, m5, m8
|
||||
%else
|
||||
REPX {paddd x, m14}, m5, m8
|
||||
%endif
|
||||
REPX {psrad x, 5}, m5, m8
|
||||
packssdw m9, m5, m8
|
||||
pminsw m9, m7
|
||||
pmaxsw m9, m0
|
||||
|
||||
; src
|
||||
mova m0, [srcq]
|
||||
|
||||
+79
-157
@@ -31,8 +31,11 @@ pd_16: times 4 dd 16
|
||||
pw_1: times 8 dw 1
|
||||
pw_16384: times 8 dw 16384
|
||||
pw_8192: times 8 dw 8192
|
||||
pw_23_22: times 4 dw 23, 22
|
||||
pw_23_22: dw 23, 22
|
||||
times 3 dw 0, 32
|
||||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
pw_27_17_17_27: dw 27, 17, 17, 27
|
||||
times 2 dw 0, 32
|
||||
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
|
||||
pw_seed_xor: times 2 dw 0xb524
|
||||
times 2 dw 0x49d8
|
||||
@@ -43,7 +46,6 @@ mul_bits: dw 256, 128, 64, 32, 16
|
||||
round_vals: dw 32, 64, 128, 256, 512, 1024
|
||||
max: dw 256*4-1, 240*4, 235*4, 256*16-1, 240*16, 235*16
|
||||
min: dw 0, 16*4, 16*16
|
||||
pw_27_17_17_27: dw 27, 17, 17, 27
|
||||
; these two should be next to each other
|
||||
pw_4: times 2 dw 4
|
||||
pw_16: times 2 dw 16
|
||||
@@ -96,6 +98,13 @@ SECTION .text
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
; PIC_ptr(a): wraps the address of a constant `a` for use inside a memory
; operand, e.g. [PIC_ptr(pw_27_17_17_27)].
;   - ARCH_X86_32: expands to `base+a`.
;   - otherwise:   expands to plain `a`.
; `base` is %undef'd first, so the `base` token in the expansion is resolved
; by whatever definition is active where PIC_ptr is used.
; NOTE(review): the x86-32 call sites visible here reload r5 from r5m right
; before using PIC_ptr, so `base` is presumably defined relative to r5 by the
; enclosing function -- confirm against the function prologues.
%if ARCH_X86_32
|
||||
%undef base
|
||||
%define PIC_ptr(a) base+a
|
||||
%else
|
||||
%define PIC_ptr(a) a
|
||||
%endif
|
||||
|
||||
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
|
||||
|
||||
%macro vpgatherdw 5-8 8, 1 ; dst, src, base, tmp_gpr[x2], cnt, stride, tmp_xmm_reg
|
||||
@@ -1429,13 +1438,6 @@ generate_grain_uv_fn 444, 0, 0
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; PIC_ptr(a): address expression for the constant `a`.
; Under ARCH_X86_32 it expands to `base+a`; in the other branch it is just `a`.
; `base` is undefined beforehand, so the expansion uses whichever `base`
; definition is in effect at the point of use.
; NOTE(review): what `base` is defined as is not shown in this excerpt --
; verify in the functions that use PIC_ptr.
%if ARCH_X86_32
|
||||
%undef base
|
||||
%define PIC_ptr(a) base+a
|
||||
%else
|
||||
%define PIC_ptr(a) a
|
||||
%endif
|
||||
|
||||
INIT_XMM ssse3
|
||||
%if ARCH_X86_32
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
@@ -1520,10 +1522,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
SCRATCH 6, 14, 5
|
||||
SCRATCH 7, 15, 6
|
||||
|
||||
%if !cpuflag(sse4)
|
||||
pcmpeqw m6, m6
|
||||
pslldq m6, 4
|
||||
%endif
|
||||
mova m6, [base+pw_27_17_17_27] ; for horizontal filter
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, unused2
|
||||
@@ -1672,11 +1671,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
test dword r8m, 2
|
||||
jz .loop_x_odd
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m7, [base+pw_27_17_17_27]
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
SPLATD m7, [pw_27_17_17_27]
|
||||
add r12d, 16 ; top_offxy += 16
|
||||
%endif
|
||||
jmp .loop_x_odd_v_overlap
|
||||
@@ -1686,12 +1682,6 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
jz .loop_x
|
||||
|
||||
; r8m = sbym
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
movq m7, [base+pw_27_17_17_27]
|
||||
%else
|
||||
movq m7, [pw_27_17_17_27]
|
||||
%endif
|
||||
test dword r8m, 2
|
||||
jnz .loop_x_hv_overlap
|
||||
|
||||
@@ -1743,27 +1733,21 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
mov grain_lutq, grain_lutmp
|
||||
.loop_y_h_overlap:
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m4, [grain_lutq+offxyq*2]
|
||||
movu m5, [grain_lutq+offxyq*2]
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+8*mmsize+0*gprsize]
|
||||
movd m5, [grain_lutq+r5*2]
|
||||
movd m4, [grain_lutq+r5*2]
|
||||
%else
|
||||
movd m5, [grain_lutq+left_offxyq*2]
|
||||
%endif
|
||||
punpcklwd m5, m4
|
||||
pmaddwd m5, m7
|
||||
paddd m5, m14
|
||||
psrad m5, 5
|
||||
packssdw m5, m5
|
||||
%if cpuflag(sse4)
|
||||
pblendw m4, m5, 00000011b
|
||||
%else
|
||||
pand m4, m6
|
||||
pandn m0, m6, m5
|
||||
por m4, m0
|
||||
movd m4, [grain_lutq+left_offxyq*2]
|
||||
%endif
|
||||
punpcklwd m4, m5
|
||||
pmaddwd m4, m6
|
||||
paddd m4, m14
|
||||
psrad m4, 5
|
||||
packssdw m4, m4
|
||||
pminsw m4, m15
|
||||
pmaxsw m4, m9
|
||||
shufps m4, m5, q3210
|
||||
|
||||
; src
|
||||
pand m0, m10, [srcq+ 0]
|
||||
@@ -1822,11 +1806,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
test dword r8m, 2
|
||||
jz .loop_x_odd
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m7, [base+pw_27_17_17_27]
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
SPLATD m7, [pw_27_17_17_27]
|
||||
add r12d, 16 ; top_offxy += 16
|
||||
%endif
|
||||
jmp .loop_x_odd_v_overlap
|
||||
@@ -1941,6 +1922,10 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
shr offxyd, 16
|
||||
|
||||
.loop_x_odd_v_overlap:
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
%endif
|
||||
SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
|
||||
mov hd, dword r7m
|
||||
mov grain_lutq, grain_lutmp
|
||||
.loop_y_v_overlap:
|
||||
@@ -2009,18 +1994,16 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
mova [dstq+srcq+ 0], m0
|
||||
mova [dstq+srcq+16], m1
|
||||
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m7, [base+pw_27_17_17_27+4]
|
||||
%else
|
||||
SPLATD m7, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
|
||||
%endif
|
||||
add srcq, r2mp
|
||||
add grain_lutq, 82*2
|
||||
dec hw
|
||||
jz .end_y_v_overlap
|
||||
; 2 lines get vertical overlap, then fall back to non-overlap code for
|
||||
; remaining (up to) 30 lines
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
%endif
|
||||
SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
|
||||
xor hd, 0x10000
|
||||
test hd, 0x10000
|
||||
jnz .loop_y_v_overlap
|
||||
@@ -2044,11 +2027,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
btc dword r8m, 2
|
||||
jc .next_blk_v
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m7, [base+pw_27_17_17_27]
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
%else
|
||||
SPLATD m7, [pw_27_17_17_27]
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
add offxyd, 16
|
||||
@@ -2059,20 +2039,8 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
; back to .loop_x_v_overlap, and instead always fall-through to
|
||||
; h+v overlap
|
||||
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
movq m7, [base+pw_27_17_17_27]
|
||||
%else
|
||||
movq m7, [pw_27_17_17_27]
|
||||
%endif
|
||||
|
||||
.loop_x_hv_overlap:
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m0, [base+pw_27_17_17_27]
|
||||
mova [rsp+7*mmsize], m0
|
||||
%define m8 [rsp+7*mmsize]
|
||||
|
||||
DEFINE_ARGS dst, src, scaling, see, w, picptr, src_bak
|
||||
|
||||
mov r0, [rsp+8*mmsize+1*gprsize]
|
||||
@@ -2084,8 +2052,6 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
mov seed, r3m
|
||||
xor r0, r0
|
||||
%else
|
||||
SPLATD m8, [pw_27_17_17_27]
|
||||
|
||||
; we assume from the block above that bits 8-15 of r7d are zero'ed
|
||||
%endif
|
||||
mov r6d, seed
|
||||
@@ -2139,43 +2105,39 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
%endif
|
||||
shr offxyd, 16
|
||||
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
%endif
|
||||
SPLATD m7, [PIC_ptr(pw_27_17_17_27)]
|
||||
|
||||
movzx hd, word r7m
|
||||
mov grain_lutq, grain_lutmp
|
||||
.loop_y_hv_overlap:
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq*2]
|
||||
movu m2, [grain_lutq+offxyq*2]
|
||||
%if ARCH_X86_32
|
||||
mov r0, [rsp+8*mmsize+1*gprsize] ; top_offxy
|
||||
mov r5, [rsp+8*mmsize+0*gprsize] ; left_offxy
|
||||
movu m5, [grain_lutq+r0*2]
|
||||
movd m4, [grain_lutq+r5*2]
|
||||
movu m4, [grain_lutq+r0*2]
|
||||
movd m5, [grain_lutq+r5*2]
|
||||
mov r5, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
|
||||
movd m2, [grain_lutq+r5*2]
|
||||
movd m3, [grain_lutq+r5*2]
|
||||
%else
|
||||
movu m5, [grain_lutq+top_offxyq*2]
|
||||
movd m4, [grain_lutq+left_offxyq*2]
|
||||
movd m2, [grain_lutq+topleft_offxyq*2]
|
||||
movu m4, [grain_lutq+top_offxyq*2]
|
||||
movd m5, [grain_lutq+left_offxyq*2]
|
||||
movd m3, [grain_lutq+topleft_offxyq*2]
|
||||
%endif
|
||||
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
|
||||
punpcklwd m4, m3
|
||||
punpcklwd m2, m5
|
||||
REPX {pmaddwd x, m7}, m4, m2
|
||||
REPX {paddd x, m14}, m4, m2
|
||||
REPX {psrad x, 5}, m4, m2
|
||||
REPX {packssdw x, x}, m4, m2
|
||||
REPX {pminsw x, m15}, m4, m2
|
||||
REPX {pmaxsw x, m9}, m4, m2
|
||||
%if cpuflag(sse4)
|
||||
pblendw m3, m4, 00000011b
|
||||
pblendw m5, m2, 00000011b
|
||||
%else
|
||||
pand m3, m6
|
||||
pand m5, m6
|
||||
pandn m0, m6, m4
|
||||
pandn m1, m6, m2
|
||||
por m3, m0
|
||||
por m5, m1
|
||||
%endif
|
||||
punpcklwd m5, m2
|
||||
punpcklwd m3, m4
|
||||
REPX {pmaddwd x, m6}, m5, m3
|
||||
REPX {paddd x, m14}, m5, m3
|
||||
REPX {psrad x, 5}, m5, m3
|
||||
packssdw m5, m3
|
||||
pminsw m5, m15
|
||||
pmaxsw m5, m9
|
||||
shufps m3, m5, m2, q3210
|
||||
shufps m5, m4, q3232
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
movu m0, [grain_lutq+offxyq*2+16]
|
||||
%if ARCH_X86_32
|
||||
@@ -2187,7 +2149,7 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
punpckhwd m5, m3
|
||||
punpcklwd m3, m1, m0
|
||||
punpckhwd m1, m0
|
||||
REPX {pmaddwd x, m8}, m2, m5, m3, m1
|
||||
REPX {pmaddwd x, m7}, m2, m5, m3, m1
|
||||
REPX {paddd x, m14}, m2, m5, m3, m1
|
||||
REPX {psrad x, 5}, m2, m5, m3, m1
|
||||
packssdw m2, m5
|
||||
@@ -2229,19 +2191,16 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
mova [dstq+srcq+ 0], m0
|
||||
mova [dstq+srcq+16], m1
|
||||
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m0, [base+pw_27_17_17_27+4]
|
||||
mova m8, m0
|
||||
%else
|
||||
SPLATD m8, [pw_27_17_17_27+4] ; swap weights for second v-overlap line
|
||||
%endif
|
||||
add srcq, r2mp
|
||||
add grain_lutq, 82*2
|
||||
dec hw
|
||||
jz .end_y_hv_overlap
|
||||
; 2 lines get vertical overlap, then fall back to non-overlap code for
|
||||
; remaining (up to) 30 lines
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
%endif
|
||||
SPLATD m7, [PIC_ptr(pw_27_17_17_27)+4]
|
||||
xor hd, 0x10000
|
||||
test hd, 0x10000
|
||||
jnz .loop_y_hv_overlap
|
||||
@@ -2257,14 +2216,12 @@ cglobal fgy_32x32xn_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, gra
|
||||
jge .end_hv
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
SPLATD m7, [base+pw_27_17_17_27]
|
||||
add offxyd, 16
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy += 16
|
||||
mov srcq, r9mp
|
||||
add srcq, r4mp
|
||||
add srcq, r4mp
|
||||
%else
|
||||
SPLATD m7, [pw_27_17_17_27]
|
||||
add offxyd, 16
|
||||
add top_offxyd, 16
|
||||
mov src_bakq, r9mp
|
||||
@@ -2370,12 +2327,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
SCRATCH 4, 12, 4
|
||||
SCRATCH 5, 13, 5
|
||||
|
||||
%if cpuflag(sse4)
|
||||
pxor m2, m2
|
||||
%define mzero m2
|
||||
%else
|
||||
%define mzero m7
|
||||
%endif
|
||||
|
||||
SPLATD m2, [base+pw_23_22]
|
||||
|
||||
%if ARCH_X86_32
|
||||
mov scalingq, r5m
|
||||
mov r5m, r5
|
||||
@@ -2390,11 +2345,6 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
SCRATCH 0, 8, 0
|
||||
SCRATCH 1, 9, 1
|
||||
|
||||
%if !cpuflag(sse4)
|
||||
pcmpeqw m2, m2
|
||||
pslldq m2, 2
|
||||
%endif
|
||||
|
||||
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
|
||||
jne .csfl
|
||||
|
||||
@@ -2419,7 +2369,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
pmullw m5, m7
|
||||
%else
|
||||
SPLATD m6, [base+pd_16]
|
||||
SPLATD m5, [base+pw_23_22]
|
||||
mova m5, [base+pw_23_22]
|
||||
%endif
|
||||
|
||||
SCRATCH 6, 14, 6
|
||||
@@ -2529,9 +2479,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
mova m1, [srcq+16] ; m0-1: src as word
|
||||
|
||||
; luma_src
|
||||
%if !cpuflag(sse4)
|
||||
pxor mzero, mzero
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
|
||||
|
||||
@@ -2687,9 +2635,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
mova m1, [srcq+16]
|
||||
|
||||
; luma_src
|
||||
%if !cpuflag(sse4)
|
||||
pxor mzero, mzero
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
|
||||
mov lumaq, r9m
|
||||
@@ -2744,13 +2690,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
packssdw m5, m5
|
||||
pmaxsw m5, m8
|
||||
pminsw m5, m9
|
||||
%if cpuflag(sse4)
|
||||
pblendw m5, m7, 11111110b
|
||||
%else
|
||||
pand m7, m2
|
||||
pandn m3, m2, m5
|
||||
por m5, m7, m3
|
||||
%endif
|
||||
shufps m5, m7, q3210
|
||||
movu m3, [grain_lutq+offxyq*2+16]
|
||||
|
||||
; scaling[luma_src]
|
||||
@@ -2950,14 +2890,13 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
%endif
|
||||
punpckhwd m7, m5, m3
|
||||
punpcklwd m5, m3 ; {top/cur interleaved}
|
||||
REPX {pmaddwd x, m2}, m7, m5
|
||||
%if %1
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
%endif
|
||||
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5
|
||||
REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
|
||||
%else
|
||||
REPX {pmaddwd x, m15}, m7, m5
|
||||
REPX {paddd x, m14}, m7, m5
|
||||
%endif
|
||||
REPX {psrad x, 5}, m7, m5
|
||||
@@ -2974,11 +2913,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
%endif
|
||||
punpckhwd m7, m5, m4
|
||||
punpcklwd m5, m4 ; {top/cur interleaved}
|
||||
REPX {pmaddwd x, m2}, m7, m5
|
||||
%if %1
|
||||
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5
|
||||
REPX {paddd x, [PIC_ptr(pd_16)]}, m7, m5
|
||||
%else
|
||||
REPX {pmaddwd x, m15}, m7, m5
|
||||
REPX {paddd x, m14}, m7, m5
|
||||
%endif
|
||||
REPX {psrad x, 5}, m7, m5
|
||||
@@ -2991,9 +2929,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
mova m1, [srcq+16]
|
||||
|
||||
; luma_src
|
||||
%if !cpuflag(sse4)
|
||||
pxor mzero, mzero
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
|
||||
|
||||
@@ -3021,9 +2957,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
REPX {pmaddwd x, m14}, m7, m6
|
||||
REPX {psrad x, 6}, m7, m6
|
||||
packssdw m6, m7
|
||||
%if !cpuflag(sse4)
|
||||
pxor mzero, mzero
|
||||
%endif
|
||||
REPX {paddw x, m15}, m5, m6
|
||||
REPX {pmaxsw x, mzero}, m5, m6
|
||||
REPX {pminsw x, m10}, m5, m6 ; clip_pixel()
|
||||
@@ -3176,52 +3110,45 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
%else
|
||||
movd m5, [grain_lutq+left_offxyq*2]
|
||||
%endif
|
||||
movu m3, [grain_lutq+offxyq*2]
|
||||
movu m7, [grain_lutq+offxyq*2]
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+8*mmsize+2*gprsize]
|
||||
movu m4, [grain_lutq+r0*2]
|
||||
pinsrw m5, [grain_lutq+r5*2], 1
|
||||
pinsrw m5, [grain_lutq+r5*2], 2
|
||||
%else
|
||||
movu m4, [grain_lutq+top_offxyq*2]
|
||||
pinsrw m5, [grain_lutq+topleft_offxyq*2], 1 ; { left, top/left }
|
||||
pinsrw m5, [grain_lutq+topleft_offxyq*2], 2 ; { left, _, top/left }
|
||||
%endif
|
||||
punpcklwd m7, m3, m4 ; { cur0, top0 }
|
||||
punpcklwd m5, m7 ; { cur/left } interleaved
|
||||
punpckldq m3, m7, m4 ; { cur0/1,top0/1,cur2/3,top2/3 }
|
||||
punpcklwd m5, m3 ; { left/cur0,_/cur1,topleft/top0,_/top1 }
|
||||
%if %1
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
%endif
|
||||
pmaddwd m5, [PIC_ptr(pw_23_22)]
|
||||
pshufd m0, [PIC_ptr(pw_23_22)], q1010
|
||||
%else
|
||||
pshufd m0, m15, q1010
|
||||
%endif
|
||||
pmaddwd m5, m0
|
||||
%if %1
|
||||
paddd m5, [PIC_ptr(pd_16)]
|
||||
%else
|
||||
pmaddwd m5, m15
|
||||
paddd m5, m14
|
||||
%endif
|
||||
psrad m5, 5
|
||||
packssdw m5, m5
|
||||
pmaxsw m5, m8
|
||||
pminsw m5, m9
|
||||
%if cpuflag(sse4)
|
||||
pblendw m3, m5, 00000001b
|
||||
psrldq m5, 2
|
||||
pblendw m5, m4, 11111110b
|
||||
%else
|
||||
pand m3, m2
|
||||
pandn m7, m2, m5
|
||||
por m3, m7
|
||||
psrldq m5, 2
|
||||
pand m4, m2
|
||||
pandn m7, m2, m5
|
||||
por m5, m4, m7
|
||||
%endif
|
||||
shufps m5, m3, q3210 ; cur0/1,top0/1,cur2/3,top2/3
|
||||
shufps m3, m5, m7, q3220 ; cur0-7 post-h_filter
|
||||
shufps m5, m4, q3231 ; top0-7 post-h_filter
|
||||
|
||||
punpckhwd m7, m5, m3
|
||||
punpcklwd m5, m3 ; {top/cur interleaved}
|
||||
REPX {pmaddwd x, m2}, m7, m5
|
||||
%if %1
|
||||
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m7, m5
|
||||
REPX {paddd x, [PIC_ptr(pd_16)]}, m5, m7
|
||||
%else
|
||||
REPX {pmaddwd x, m15}, m7, m5
|
||||
REPX {paddd x, m14}, m5, m7
|
||||
%endif
|
||||
REPX {psrad x, 5}, m5, m7
|
||||
@@ -3238,11 +3165,10 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
%endif
|
||||
punpckhwd m1, m0, m4
|
||||
punpcklwd m0, m4 ; {top/cur interleaved}
|
||||
REPX {pmaddwd x, m2}, m1, m0
|
||||
%if %1
|
||||
REPX {pmaddwd x, [PIC_ptr(pw_23_22)]}, m1, m0
|
||||
REPX {paddd x, [PIC_ptr(pd_16)]}, m1, m0
|
||||
%else
|
||||
REPX {pmaddwd x, m15}, m1, m0
|
||||
REPX {paddd x, m14}, m1, m0
|
||||
%endif
|
||||
REPX {psrad x, 5}, m1, m0
|
||||
@@ -3255,9 +3181,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
mova m1, [srcq+16]
|
||||
|
||||
; luma_src
|
||||
%if !cpuflag(sse4)
|
||||
pxor mzero, mzero
|
||||
%endif
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, offxy, h, luma, grain_lut
|
||||
|
||||
@@ -3285,9 +3209,7 @@ cglobal fguv_32x32xn_i420_16bpc, 6, 15, 16, dst, src, stride, fg_data, w, scalin
|
||||
REPX {pmaddwd x, m14}, m7, m5
|
||||
REPX {psrad x, 6}, m7, m5
|
||||
packssdw m5, m7
|
||||
%if !cpuflag(sse4)
|
||||
pxor mzero, mzero
|
||||
%endif
|
||||
REPX {paddw x, m15}, m6, m5
|
||||
REPX {pmaxsw x, mzero}, m6, m5
|
||||
REPX {pminsw x, m10}, m6, m5 ; clip_pixel()
|
||||
|
||||
+70
-97
@@ -38,7 +38,8 @@ byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
|
||||
pw_seed_xor: times 2 dw 0xb524
|
||||
times 2 dw 0x49d8
|
||||
pd_m65536: dd ~0xffff
|
||||
pb_23_22: times 2 db 23, 22
|
||||
pb_23_22: db 23, 22
|
||||
times 3 db 0, 32
|
||||
pb_1: times 4 db 1
|
||||
hmul_bits: dw 32768, 16384, 8192, 4096
|
||||
round: dw 2048, 1024, 512
|
||||
@@ -47,6 +48,7 @@ round_vals: dw 32, 64, 128, 256, 512
|
||||
max: dw 255, 240, 235
|
||||
min: dw 0, 16
|
||||
pb_27_17_17_27: db 27, 17, 17, 27
|
||||
times 2 db 0, 32
|
||||
pw_1: dw 1
|
||||
|
||||
%macro JMP_TABLE 2-*
|
||||
@@ -90,6 +92,14 @@ cextern gaussian_sequence
|
||||
|
||||
SECTION .text
|
||||
|
||||
; REPX {template}, arg1 [, arg2, ...]
; Expands `template` once for every argument after the first, with the token
; `x` in the template replaced by that argument.
; Example from this file: REPX {psrad x, 5}, m7, m5
;   -> psrad m7, 5
;      psrad m5, 5
; Takes at least two parameters (the template plus one argument).
%macro REPX 2-*
|
||||
; Macro-local single-parameter macro whose body is the template (%1).
%xdefine %%f(x) %1
|
||||
; One iteration per argument that follows the template.
%rep %0 - 1
|
||||
; Shift the parameter list left so the next argument becomes %1.
%rotate 1
|
||||
%%f(%1)
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
INIT_XMM avx2
|
||||
cglobal generate_grain_y_8bpc, 2, 9, 16, buf, fg_data
|
||||
lea r4, [pb_mask]
|
||||
@@ -1092,12 +1102,12 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
jz .loop_x
|
||||
|
||||
; r8m = sbym
|
||||
movd xm15, [pb_27_17_17_27]
|
||||
movq xm15, [pb_27_17_17_27]
|
||||
cmp dword r8m, 0
|
||||
jne .loop_x_hv_overlap
|
||||
|
||||
; horizontal overlap (without vertical overlap)
|
||||
movd xm14, [pw_1024]
|
||||
movq xm14, [pw_1024]
|
||||
.loop_x_h_overlap:
|
||||
mov r6d, seed
|
||||
or seed, 0xEFF4
|
||||
@@ -1156,8 +1166,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
pmaddubsw xm4, xm15, xm4
|
||||
pmulhrsw xm4, xm14
|
||||
packsswb xm4, xm4
|
||||
vpblendw xm4, xm3, 11111110b
|
||||
vpblendd m3, m4, 00001111b
|
||||
vpblendd m3, m3, m4, 00000001b
|
||||
pcmpgtb m7, m2, m3
|
||||
punpcklbw m2, m3, m7
|
||||
punpckhbw m3, m7
|
||||
@@ -1329,7 +1338,7 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
; back to .loop_x_v_overlap, and instead always fall-through to
|
||||
; h+v overlap
|
||||
|
||||
movd xm15, [pb_27_17_17_27]
|
||||
movq xm15, [pb_27_17_17_27]
|
||||
.loop_x_hv_overlap:
|
||||
vpbroadcastw m8, [pb_27_17_17_27]
|
||||
|
||||
@@ -1409,10 +1418,8 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
pmulhrsw xm7, xm14
|
||||
packsswb xm4, xm4
|
||||
packsswb xm7, xm7
|
||||
vpblendw xm4, xm3, 11111110b
|
||||
vpblendw xm7, xm6, 11111110b
|
||||
vpblendd m3, m4, 00001111b
|
||||
vpblendd m6, m7, 00001111b
|
||||
vpblendd m3, m4, 00000001b
|
||||
vpblendd m6, m7, 00000001b
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
punpckhbw m7, m6, m3
|
||||
punpcklbw m6, m3
|
||||
@@ -1463,8 +1470,6 @@ cglobal fgy_32x32xn_8bpc, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
|
||||
cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
|
||||
grain_lut, h, sby, luma, lstride, uv_pl, is_id
|
||||
pcmpeqw m10, m10
|
||||
psrld m10, 24
|
||||
mov r7d, [fg_dataq+FGData.scaling_shift]
|
||||
lea r8, [pb_mask]
|
||||
%define base r8-pb_mask
|
||||
@@ -1490,10 +1495,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%else
|
||||
vpbroadcastd m14, [pw_1024]
|
||||
%if %2
|
||||
vpbroadcastd m15, [pb_23_22]
|
||||
vpbroadcastq m15, [pb_23_22]
|
||||
%else
|
||||
vpbroadcastd xm15, [pb_27_17_17_27]
|
||||
vpbroadcastq xm15, [pb_27_17_17_27]
|
||||
%endif
|
||||
%endif
|
||||
%if %3
|
||||
vpbroadcastw m10, [pb_23_22]
|
||||
%elif %2
|
||||
mova m10, [pb_8x_27_17_8x_17_27]
|
||||
%endif
|
||||
|
||||
mov overlapd, [fg_dataq+FGData.overlap_flag]
|
||||
@@ -1593,16 +1603,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; scaling[luma_src]
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m8, [scalingq+m4], m3
|
||||
vpgatherdd m4, [scalingq+m5], m9
|
||||
vpgatherdd m8, [scalingq-3+m4], m3
|
||||
vpgatherdd m4, [scalingq-3+m5], m9
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m5, [scalingq+m6], m3
|
||||
vpgatherdd m6, [scalingq+m7], m9
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m3
|
||||
vpgatherdd m6, [scalingq-3+m7], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
@@ -1743,16 +1750,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; scaling[luma_src]
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m8, [scalingq+m4], m3
|
||||
vpgatherdd m4, [scalingq+m5], m9
|
||||
vpgatherdd m8, [scalingq-3+m4], m3
|
||||
vpgatherdd m4, [scalingq-3+m5], m9
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m5, [scalingq+m6], m3
|
||||
vpgatherdd m6, [scalingq+m7], m9
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m3
|
||||
vpgatherdd m6, [scalingq-3+m7], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
@@ -1763,7 +1767,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if %2
|
||||
%if %1
|
||||
vpbroadcastd m6, [pb_23_22] ; FIXME
|
||||
vpbroadcastq m6, [pb_23_22]
|
||||
%endif
|
||||
movu xm3, [grain_lutq+offxyq+ 0]
|
||||
movd xm4, [grain_lutq+left_offxyq+ 0]
|
||||
@@ -1778,12 +1782,10 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
pmulhrsw m4, m14
|
||||
%endif
|
||||
packsswb m4, m4
|
||||
pcmpeqw m6, m6 ; FIXME
|
||||
psrldq m6, 15 ; FIXME
|
||||
vpblendvb m3, m3, m4, m6
|
||||
vpblendd m3, m3, m4, 00010001b
|
||||
%else
|
||||
%if %1
|
||||
vpbroadcastd xm6, [pb_27_17_17_27]
|
||||
movq xm6, [pb_27_17_17_27]
|
||||
%endif
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
movd xm4, [grain_lutq+left_offxyq]
|
||||
@@ -1796,9 +1798,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
pmulhrsw xm4, xm14
|
||||
%endif
|
||||
packsswb xm4, xm4
|
||||
pcmpeqw xm6, xm6
|
||||
psrldq xm6, 14
|
||||
vpblendvb m3, m3, m4, m6
|
||||
vpblendd m3, m3, m4, 00000001b
|
||||
%endif
|
||||
pcmpgtb m7, m2, m3
|
||||
punpcklbw m2, m3, m7
|
||||
@@ -1915,7 +1915,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
mov hd, hm
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if %2 == 0
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
|
||||
%endif
|
||||
%%loop_y_v_overlap:
|
||||
; src
|
||||
@@ -1966,16 +1966,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; scaling[luma_src]
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m8, [scalingq+m4], m3
|
||||
vpgatherdd m4, [scalingq+m5], m9
|
||||
vpgatherdd m8, [scalingq-3+m4], m3
|
||||
vpgatherdd m4, [scalingq-3+m5], m9
|
||||
pcmpeqw m3, m3
|
||||
pcmpeqw m9, m9
|
||||
vpgatherdd m5, [scalingq+m6], m3
|
||||
vpgatherdd m6, [scalingq+m7], m9
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m3
|
||||
vpgatherdd m6, [scalingq-3+m7], m9
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
@@ -1988,7 +1985,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if %3 == 0
|
||||
%if %2
|
||||
mova m6, [pb_8x_27_17_8x_17_27]
|
||||
movu xm3, [grain_lutq+offxyq]
|
||||
movu xm4, [grain_lutq+top_offxyq]
|
||||
vinserti128 m3, [grain_lutq+offxyq+82], 1
|
||||
@@ -1999,13 +1995,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%endif
|
||||
punpckhbw m9, m4, m3
|
||||
punpcklbw m4, m3
|
||||
%if %2
|
||||
pmaddubsw m9, m6, m9
|
||||
pmaddubsw m4, m6, m4
|
||||
%else
|
||||
pmaddubsw m9, m1, m9
|
||||
pmaddubsw m4, m1, m4
|
||||
%endif
|
||||
pmaddubsw m9, m10, m9
|
||||
pmaddubsw m4, m10, m4
|
||||
%if %1
|
||||
pmulhrsw m9, [pw_1024]
|
||||
pmulhrsw m4, [pw_1024]
|
||||
@@ -2015,19 +2006,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%endif
|
||||
packsswb m3, m4, m9
|
||||
%else
|
||||
%if %1
|
||||
vpbroadcastd m6, [pb_23_22]
|
||||
%endif
|
||||
movq xm3, [grain_lutq+offxyq]
|
||||
movq xm4, [grain_lutq+top_offxyq]
|
||||
vinserti128 m3, [grain_lutq+offxyq+8], 1
|
||||
vinserti128 m4, [grain_lutq+top_offxyq+8], 1
|
||||
punpcklbw m4, m3
|
||||
pmaddubsw m4, m10, m4
|
||||
%if %1
|
||||
pmaddubsw m4, m6, m4
|
||||
pmulhrsw m4, [pw_1024]
|
||||
%else
|
||||
pmaddubsw m4, m15, m4
|
||||
pmulhrsw m4, m14
|
||||
%endif
|
||||
packsswb m4, m4
|
||||
@@ -2084,7 +2071,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%endif
|
||||
add grain_lutq, 82<<%2
|
||||
%if %2 == 0
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
|
||||
btc hd, 16
|
||||
jnc %%loop_y_v_overlap
|
||||
%endif
|
||||
@@ -2139,7 +2126,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
mov hd, hm
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if %2 == 0
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27]
|
||||
%endif
|
||||
%%loop_y_hv_overlap:
|
||||
; src
|
||||
@@ -2190,16 +2177,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; scaling[src]
|
||||
pcmpeqw m9, m9
|
||||
pcmpeqw m3, m3
|
||||
vpgatherdd m8, [scalingq+m4], m9
|
||||
vpgatherdd m4, [scalingq+m5], m3
|
||||
vpgatherdd m8, [scalingq-3+m4], m9
|
||||
vpgatherdd m4, [scalingq-3+m5], m3
|
||||
pcmpeqw m9, m9
|
||||
pcmpeqw m3, m3
|
||||
vpgatherdd m5, [scalingq+m6], m9
|
||||
vpgatherdd m6, [scalingq+m7], m3
|
||||
pand m8, m10
|
||||
pand m4, m10
|
||||
pand m5, m10
|
||||
pand m6, m10
|
||||
vpgatherdd m5, [scalingq-3+m6], m9
|
||||
vpgatherdd m6, [scalingq-3+m7], m3
|
||||
REPX {psrld x, 24}, m8, m4, m5, m6
|
||||
packusdw m8, m4
|
||||
packusdw m5, m6
|
||||
|
||||
@@ -2212,9 +2196,9 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if %1
|
||||
%if %2
|
||||
vpbroadcastd m9, [pb_23_22]
|
||||
vpbroadcastq m9, [pb_23_22]
|
||||
%else
|
||||
vpbroadcastd xm9, [pb_27_17_17_27]
|
||||
vpbroadcastq xm9, [pb_27_17_17_27]
|
||||
%endif
|
||||
%endif
|
||||
|
||||
@@ -2252,7 +2236,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%else
|
||||
punpcklbw m7, m6
|
||||
%endif
|
||||
punpcklwd m4, m7
|
||||
punpcklqdq m4, m7
|
||||
%if %1
|
||||
pmaddubsw m4, m9, m4
|
||||
pmulhrsw m4, [pw_1024]
|
||||
@@ -2261,18 +2245,17 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
pmulhrsw m4, m14
|
||||
%endif
|
||||
packsswb m4, m4
|
||||
pcmpeqw m9, m9 ; this is kind of ugly
|
||||
psrldq m9, 15
|
||||
vpblendvb m3, m3, m4, m9
|
||||
psrldq m4, 1
|
||||
vpblendd m3, m4, 00010001b
|
||||
psrldq m4, 4
|
||||
%if %3
|
||||
shufpd m9, m9, m9, 1110b ; clear upper lane
|
||||
vpblendd m6, m6, m4, 00000001b
|
||||
%else
|
||||
vpblendd m6, m6, m4, 00010001b
|
||||
%endif
|
||||
vpblendvb m6, m6, m4, m9
|
||||
%else
|
||||
punpcklbw xm4, xm3
|
||||
punpcklbw xm7, xm6
|
||||
punpckldq xm4, xm7
|
||||
punpcklqdq xm4, xm7
|
||||
%if %1
|
||||
pmaddubsw xm4, xm9, xm4
|
||||
pmulhrsw xm4, [pw_1024]
|
||||
@@ -2281,23 +2264,19 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
pmulhrsw xm4, xm14
|
||||
%endif
|
||||
packsswb xm4, xm4
|
||||
pcmpeqw xm9, xm9 ; this is kind of ugly
|
||||
psrldq xm9, 14
|
||||
vpblendvb m3, m3, m4, m9
|
||||
psrldq xm4, 2
|
||||
vpblendvb m6, m6, m4, m9
|
||||
vpblendd m3, m3, m4, 00000001b
|
||||
psrldq xm4, 4
|
||||
vpblendd m6, m6, m4, 00000001b
|
||||
%endif
|
||||
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
%if %3
|
||||
vpermq m9, m3, q3120
|
||||
punpcklbw m6, m9
|
||||
pmaddubsw m6, m10, m6
|
||||
%if %1
|
||||
vpbroadcastd m9, [pb_23_22]
|
||||
pmaddubsw m6, m9, m6
|
||||
pmulhrsw m6, [pw_1024]
|
||||
%else
|
||||
pmaddubsw m6, m15, m6
|
||||
pmulhrsw m6, m14
|
||||
%endif
|
||||
packsswb m6, m6
|
||||
@@ -2306,14 +2285,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%else
|
||||
punpckhbw m9, m6, m3
|
||||
punpcklbw m6, m3
|
||||
%if %2
|
||||
mova m3, [pb_8x_27_17_8x_17_27]
|
||||
pmaddubsw m9, m3, m9
|
||||
pmaddubsw m6, m3, m6
|
||||
%else
|
||||
pmaddubsw m9, m1, m9
|
||||
pmaddubsw m6, m1, m6
|
||||
%endif
|
||||
pmaddubsw m9, m10, m9
|
||||
pmaddubsw m6, m10, m6
|
||||
%if %1
|
||||
pmulhrsw m9, [pw_1024]
|
||||
pmulhrsw m6, [pw_1024]
|
||||
@@ -2373,7 +2346,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
jg %%loop_y_h_overlap
|
||||
%else
|
||||
je %%end_y_hv_overlap
|
||||
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
|
||||
vbroadcasti128 m10, [pb_8x_27_17_8x_17_27+16]
|
||||
btc hd, 16
|
||||
jnc %%loop_y_hv_overlap
|
||||
jmp %%loop_y_h_overlap
|
||||
|
||||
+213
-252
@@ -29,14 +29,18 @@
|
||||
SECTION_RODATA
|
||||
|
||||
pw_1024: times 8 dw 1024
|
||||
pb_27_17_17_27: db 27, 17, 17, 27
|
||||
times 6 db 0, 32
|
||||
pb_23_22_h: db 23, 22
|
||||
times 7 db 0, 32
|
||||
pb_27_17: times 8 db 27, 17
|
||||
pb_17_27: times 8 db 17, 27
|
||||
pb_23_22: times 8 db 23, 22
|
||||
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
|
||||
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
|
||||
byte_blend: db 0, 0, 0, 0xff, 0, 0, 0, 0
|
||||
pw_seed_xor: times 2 dw 0xb524
|
||||
times 2 dw 0x49d8
|
||||
pb_23_22: times 2 db 23, 22
|
||||
pb_1: times 4 db 1
|
||||
hmul_bits: dw 32768, 16384, 8192, 4096
|
||||
round: dw 2048, 1024, 512
|
||||
@@ -46,8 +50,6 @@ max: dw 255, 240, 235
|
||||
min: dw 0, 16
|
||||
pw_1: dw 1
|
||||
|
||||
%define pb_27_17_17_27 pb_17_27 - 2
|
||||
|
||||
%macro JMP_TABLE 2-*
|
||||
%xdefine %1_8bpc_%2_table %%table
|
||||
%xdefine %%base %1_8bpc_%2_table
|
||||
@@ -88,6 +90,20 @@ cextern gaussian_sequence
|
||||
|
||||
SECTION .text
|
||||
|
||||
; REPX {template}, arg1 [, arg2 ...]
; Expands the instruction template given as the first argument once for each
; of the remaining arguments, substituting the placeholder `x` in the template
; with the current argument.
; Example from this file: REPX {psrlw x, 8}, m4, m5
;   expands to: psrlw m4, 8 / psrlw m5, 8
%macro REPX 2-*
|
||||
; wrap the template in a single-parameter macro so `x` can be substituted
%xdefine %%f(x) %1
|
||||
; one iteration per argument after the template
%rep %0 - 1
|
||||
; shift the argument list so %1 now names the next operand
%rotate 1
|
||||
%%f(%1)
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
; PIC_ptr(a): form the address expression used to reference constant `a`.
; On 32-bit x86 the symbol is prefixed with `base+`; `base` is %define'd inside
; each function (e.g. as `r5-pb_mask` right after `LEA r5, pb_mask`), so the
; reference becomes register-relative. On other targets the symbol is used
; unchanged.
%if ARCH_X86_32
|
||||
%define PIC_ptr(a) base+a
|
||||
%else
|
||||
%define PIC_ptr(a) a
|
||||
%endif
|
||||
|
||||
%macro SCRATCH 3
|
||||
%if ARCH_X86_32
|
||||
mova [rsp+%3*mmsize], m%1
|
||||
@@ -1284,7 +1300,7 @@ INIT_XMM ssse3
|
||||
; fgy_32x32xn(dst, src, stride, fg_data, w, scaling, grain_lut, h, sby)
|
||||
%if ARCH_X86_32
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
|
||||
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (5 * mmsize + 16 * gprsize), \
|
||||
dst, src, scaling, unused1, fg_data, picptr, unused2
|
||||
; copy stack arguments to new position post-alignment, so that we
|
||||
; don't have to keep the old stack location in a separate register
|
||||
@@ -1295,29 +1311,29 @@ cglobal fgy_32x32xn_8bpc, 0, 7, 16, 0 - (6 * mmsize + (9 + 3) * gprsize), \
|
||||
mov r4, r7m
|
||||
mov r5, r8m
|
||||
|
||||
mov [rsp+6*mmsize+ 3*gprsize], r0
|
||||
mov [rsp+6*mmsize+ 5*gprsize], r1
|
||||
mov [rsp+6*mmsize+ 7*gprsize], r2
|
||||
mov [rsp+6*mmsize+ 9*gprsize], r3
|
||||
mov [rsp+6*mmsize+10*gprsize], r4
|
||||
mov [rsp+6*mmsize+11*gprsize], r5
|
||||
mov [rsp+5*mmsize+ 4*gprsize], r0
|
||||
mov [rsp+5*mmsize+ 6*gprsize], r1
|
||||
mov [rsp+5*mmsize+ 8*gprsize], r2
|
||||
mov [rsp+5*mmsize+10*gprsize], r3
|
||||
mov [rsp+5*mmsize+11*gprsize], r4
|
||||
mov [rsp+5*mmsize+12*gprsize], r5
|
||||
%else
|
||||
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 6 * mmsize + (3 + 1) * gprsize, \
|
||||
cglobal fgy_32x32xn_8bpc, 0, 7, 16, 5 * mmsize + 4 * gprsize, \
|
||||
dst, src, scaling, unused1, fg_data, picptr, unused2
|
||||
%endif
|
||||
mov srcq, srcm
|
||||
mov fg_dataq, r3m
|
||||
mov scalingq, r5m
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
%define r0m [rsp+6*mmsize+ 3*gprsize]
|
||||
%define r1m [rsp+6*mmsize+ 4*gprsize]
|
||||
%define r2m [rsp+6*mmsize+ 5*gprsize]
|
||||
%define r3m [rsp+6*mmsize+ 6*gprsize]
|
||||
%define r4m [rsp+6*mmsize+ 7*gprsize]
|
||||
%define r5m [rsp+6*mmsize+ 8*gprsize]
|
||||
%define r6m [rsp+6*mmsize+ 9*gprsize]
|
||||
%define r7m [rsp+6*mmsize+10*gprsize]
|
||||
%define r8m [rsp+6*mmsize+11*gprsize]
|
||||
%define r0m [rsp+5*mmsize+ 4*gprsize]
|
||||
%define r1m [rsp+5*mmsize+ 5*gprsize]
|
||||
%define r2m [rsp+5*mmsize+ 6*gprsize]
|
||||
%define r3m [rsp+5*mmsize+ 7*gprsize]
|
||||
%define r4m [rsp+5*mmsize+ 8*gprsize]
|
||||
%define r5m [rsp+5*mmsize+ 9*gprsize]
|
||||
%define r6m [rsp+5*mmsize+10*gprsize]
|
||||
%define r7m [rsp+5*mmsize+11*gprsize]
|
||||
%define r8m [rsp+5*mmsize+12*gprsize]
|
||||
%endif
|
||||
LEA r5, pb_mask
|
||||
%define base r5-pb_mask
|
||||
@@ -1330,8 +1346,6 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
mov r6d, [fg_dataq+FGData.scaling_shift]
|
||||
movd m3, [base+mul_bits+r6*2-14]
|
||||
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
|
||||
pcmpeqw m2, m2
|
||||
psrldq m2, 14
|
||||
movd m4, [base+max+r6*4]
|
||||
movd m5, [base+min+r6*2]
|
||||
punpcklwd m3, m3
|
||||
@@ -1340,10 +1354,9 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
pshufd m3, m3, q0000
|
||||
pshufd m4, m4, q0000
|
||||
pshufd m5, m5, q0000
|
||||
SCRATCH 2, 10, 0
|
||||
SCRATCH 3, 11, 1
|
||||
SCRATCH 4, 12, 2
|
||||
SCRATCH 5, 13, 3
|
||||
SCRATCH 3, 11, 0
|
||||
SCRATCH 4, 12, 1
|
||||
SCRATCH 5, 13, 2
|
||||
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
|
||||
@@ -1356,9 +1369,9 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
test overlapd, overlapd
|
||||
jz .no_vertical_overlap
|
||||
mova m6, [base+pw_1024]
|
||||
movd m7, [base+pb_27_17_17_27]
|
||||
SCRATCH 6, 14, 4
|
||||
SCRATCH 7, 15, 5
|
||||
mova m7, [base+pb_27_17_17_27]
|
||||
SCRATCH 6, 14, 3
|
||||
SCRATCH 7, 15, 4
|
||||
test sbyd, sbyd
|
||||
jnz .vertical_overlap
|
||||
; fall-through
|
||||
@@ -1445,16 +1458,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
|
||||
; scaling[src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m4, m0, scalingq, r0, r5, m3
|
||||
vpgatherdw m5, m1, scalingq, r0, r5, m3
|
||||
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
|
||||
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
|
||||
%else
|
||||
vpgatherdw m4, m0, scalingq, r12, r13, m3
|
||||
vpgatherdw m5, m1, scalingq, r12, r13, m3
|
||||
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
|
||||
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
|
||||
%endif
|
||||
pcmpeqw m3, m3
|
||||
psrlw m3, 8
|
||||
pand m4, m3
|
||||
pand m5, m3
|
||||
REPX {psrlw x, 8}, m4, m5
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
@@ -1504,7 +1514,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
jz .loop_x_odd
|
||||
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+6*mmsize+1*gprsize], 16
|
||||
add dword [rsp+5*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add r11d, 16 ; top_offxyd
|
||||
%endif
|
||||
@@ -1525,7 +1535,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
DEFINE_ARGS dst, src, scaling, offxy, unused1, unused2, unused3
|
||||
|
||||
add offxyd, 16 ; left_offxyd
|
||||
mov [rsp+6*mmsize+0*gprsize], offxyd
|
||||
mov [rsp+5*mmsize+0*gprsize], offxyd
|
||||
|
||||
DEFINE_ARGS dst, src, scaling, see, unused1, unused2, unused3
|
||||
|
||||
@@ -1578,21 +1588,18 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
|
||||
; scaling[src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m4, m0, scalingq, r0, r5, m3
|
||||
vpgatherdw m5, m1, scalingq, r0, r5, m3
|
||||
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
|
||||
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
|
||||
%else
|
||||
vpgatherdw m4, m0, scalingq, r12, r13, m3
|
||||
vpgatherdw m5, m1, scalingq, r12, r13, m3
|
||||
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
|
||||
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
|
||||
%endif
|
||||
pcmpeqw m3, m3
|
||||
psrlw m3, 8
|
||||
pand m4, m3
|
||||
pand m5, m3
|
||||
REPX {psrlw x, 8}, m4, m5
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+6*mmsize+0*gprsize]
|
||||
mov r5, [rsp+5*mmsize+0*gprsize]
|
||||
movd m7, [grain_lutq+r5]
|
||||
%else
|
||||
movd m7, [grain_lutq+left_offxyq]
|
||||
@@ -1601,9 +1608,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
pmaddubsw m6, m15, m7
|
||||
pmulhrsw m6, m14
|
||||
packsswb m6, m6
|
||||
pand m6, m10
|
||||
pandn m7, m10, m3
|
||||
por m6, m7
|
||||
shufps m6, m3, q3210
|
||||
pcmpgtb m2, m6
|
||||
punpcklbw m7, m6, m2
|
||||
punpckhbw m6, m2
|
||||
@@ -1649,7 +1654,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
test dword r8m, 2 ; have_top_overlap
|
||||
jz .loop_x_odd
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+6*mmsize+1*gprsize], 16
|
||||
add dword [rsp+5*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add r11d, 16 ; top_offxyd
|
||||
%endif
|
||||
@@ -1754,7 +1759,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
|
||||
movzx top_offxyd, offxyw
|
||||
%if ARCH_X86_32
|
||||
mov [rsp+6*mmsize+1*gprsize], top_offxyd
|
||||
mov [rsp+5*mmsize+1*gprsize], top_offxyd
|
||||
|
||||
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
|
||||
%endif
|
||||
@@ -1764,7 +1769,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
lea r5, [base+pb_27_17]
|
||||
mov [rsp+5*mmsize+8], r5
|
||||
mov [rsp+5*mmsize+12], r5
|
||||
%else
|
||||
mova m8, [pb_27_17]
|
||||
%endif
|
||||
@@ -1779,21 +1784,18 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
|
||||
; scaling[src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m4, m0, scalingq, r0, r5, m3
|
||||
vpgatherdw m5, m1, scalingq, r0, r5, m3
|
||||
vpgatherdw m4, m0, scalingq-1, r0, r5, m3
|
||||
vpgatherdw m5, m1, scalingq-1, r0, r5, m3
|
||||
%else
|
||||
vpgatherdw m4, m0, scalingq, r12, r13, m3
|
||||
vpgatherdw m5, m1, scalingq, r12, r13, m3
|
||||
vpgatherdw m4, m0, scalingq-1, r12, r13, m3
|
||||
vpgatherdw m5, m1, scalingq-1, r12, r13, m3
|
||||
%endif
|
||||
pcmpeqw m3, m3
|
||||
psrlw m3, 8
|
||||
pand m4, m3
|
||||
pand m5, m3
|
||||
REPX {psrlw x, 8}, m4, m5
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+6*mmsize+1*gprsize]
|
||||
mov r5, [rsp+5*mmsize+1*gprsize]
|
||||
movu m7, [grain_lutq+r5]
|
||||
%else
|
||||
movu m7, [grain_lutq+top_offxyq]
|
||||
@@ -1801,7 +1803,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
punpckhbw m6, m7, m3
|
||||
punpcklbw m7, m3
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+5*mmsize+8]
|
||||
mov r5, [rsp+5*mmsize+12]
|
||||
pmaddubsw m3, [r5], m6
|
||||
pmaddubsw m6, [r5], m7
|
||||
%else
|
||||
@@ -1833,7 +1835,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
mova [dstq+srcq], m0
|
||||
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+5*mmsize+8], mmsize
|
||||
add dword [rsp+5*mmsize+12], mmsize
|
||||
%else
|
||||
mova m8, [pb_17_27]
|
||||
%endif
|
||||
@@ -1864,7 +1866,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
jc .loop_x_hv_overlap
|
||||
add offxyd, 16
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+6*mmsize+1*gprsize], 16
|
||||
add dword [rsp+5*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
@@ -1874,16 +1876,16 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
lea r5, [base+pb_27_17]
|
||||
mov [rsp+5*mmsize+8], r5
|
||||
mov [rsp+5*mmsize+12], r5
|
||||
|
||||
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, src_bak
|
||||
|
||||
mov r5, [rsp+6*mmsize+1*gprsize]
|
||||
mov r5, [rsp+5*mmsize+1*gprsize]
|
||||
mov r4, offxyd
|
||||
add r5, 16
|
||||
add r4, 16
|
||||
mov [rsp+6*mmsize+2*gprsize], r5 ; topleft_offxy
|
||||
mov [rsp+6*mmsize+0*gprsize], r4 ; left_offxy
|
||||
mov [rsp+5*mmsize+2*gprsize], r5 ; topleft_offxy
|
||||
mov [rsp+5*mmsize+0*gprsize], r4 ; left_offxy
|
||||
|
||||
DEFINE_ARGS tmp, src, scaling, see, w, picptr, src_bak
|
||||
|
||||
@@ -1937,7 +1939,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
DEFINE_ARGS dst, src, scaling, offxy, h, picptr, grain_lut
|
||||
|
||||
movzx r5, offxyw ; top_offxy
|
||||
mov [rsp+6*mmsize+1*gprsize], r5
|
||||
mov [rsp+5*mmsize+1*gprsize], r5
|
||||
%else
|
||||
DEFINE_ARGS dst, src, stride, src_bak, w, scaling, grain_lut, \
|
||||
h, offxy, see, left_offxy, top_offxy, topleft_offxy
|
||||
@@ -1952,10 +1954,10 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+6*mmsize+1*gprsize] ; top_offxy
|
||||
mov r0, [rsp+6*mmsize+0*gprsize] ; left_offxy
|
||||
mov r5, [rsp+5*mmsize+1*gprsize] ; top_offxy
|
||||
mov r0, [rsp+5*mmsize+0*gprsize] ; left_offxy
|
||||
movu m6, [grain_lutq+r5]
|
||||
mov r5, [rsp+6*mmsize+2*gprsize] ; topleft_offxy
|
||||
mov r5, [rsp+5*mmsize+2*gprsize] ; topleft_offxy
|
||||
movd m4, [grain_lutq+r0]
|
||||
movd m7, [grain_lutq+r5]
|
||||
%else
|
||||
@@ -1972,17 +1974,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
pmulhrsw m4, m14
|
||||
packsswb m2, m2
|
||||
packsswb m4, m4
|
||||
pand m2, m10
|
||||
pand m4, m10
|
||||
pandn m7, m10, m3
|
||||
pandn m3, m10, m6
|
||||
por m7, m2
|
||||
por m3, m4
|
||||
shufps m2, m3, q3210
|
||||
shufps m4, m6, q3210
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
punpckhbw m4, m3, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m3, m4, m2
|
||||
punpckhbw m4, m2
|
||||
%if ARCH_X86_32
|
||||
mov r5, [rsp+5*mmsize+8]
|
||||
mov r5, [rsp+5*mmsize+12]
|
||||
pmaddubsw m7, [r5], m4
|
||||
pmaddubsw m4, [r5], m3
|
||||
%else
|
||||
@@ -2004,16 +2002,13 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
|
||||
; scaling[src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m5, m0, scalingq, r0, r5, m7
|
||||
vpgatherdw m6, m1, scalingq, r0, r5, m7
|
||||
vpgatherdw m5, m0, scalingq-1, r0, r5, m7
|
||||
vpgatherdw m6, m1, scalingq-1, r0, r5, m7
|
||||
%else
|
||||
vpgatherdw m5, m0, scalingq, r13, r14, m7
|
||||
vpgatherdw m6, m1, scalingq, r13, r14, m7
|
||||
vpgatherdw m5, m0, scalingq-1, r13, r14, m7
|
||||
vpgatherdw m6, m1, scalingq-1, r13, r14, m7
|
||||
%endif
|
||||
pcmpeqw m7, m7
|
||||
psrlw m7, 8
|
||||
pand m5, m7
|
||||
pand m6, m7
|
||||
REPX {psrlw x, 8}, m5, m6
|
||||
|
||||
; noise = round2(scaling[src] * grain, scaling_shift)
|
||||
pmullw m3, m5
|
||||
@@ -2033,7 +2028,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
mova [dstq+srcq], m0
|
||||
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+5*mmsize+8], mmsize
|
||||
add dword [rsp+5*mmsize+12], mmsize
|
||||
%else
|
||||
mova m8, [pb_17_27]
|
||||
%endif
|
||||
@@ -2063,7 +2058,7 @@ cglobal fgy_32x32xn_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grai
|
||||
xor dword r8m, 4
|
||||
add offxyd, 16
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+6*mmsize+1*gprsize], 16
|
||||
add dword [rsp+5*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
@@ -2079,49 +2074,49 @@ INIT_XMM ssse3
|
||||
; sby, luma, lstride, uv_pl, is_id)
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
|
||||
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 0 - (7 * mmsize + (13 + 3) * gprsize), \
|
||||
tmp, src, scaling, h, fg_data, picptr, unused
|
||||
mov r0, r0m
|
||||
mov r1, r2m
|
||||
mov r2, r4m
|
||||
mov r3, r6m
|
||||
mov r4, r7m
|
||||
mov [rsp+8*mmsize+3*gprsize], r0
|
||||
mov [rsp+8*mmsize+5*gprsize], r1
|
||||
mov [rsp+8*mmsize+7*gprsize], r2
|
||||
mov [rsp+8*mmsize+9*gprsize], r3
|
||||
mov [rsp+8*mmsize+10*gprsize], r4
|
||||
mov [rsp+7*mmsize+3*gprsize], r0
|
||||
mov [rsp+7*mmsize+5*gprsize], r1
|
||||
mov [rsp+7*mmsize+7*gprsize], r2
|
||||
mov [rsp+7*mmsize+9*gprsize], r3
|
||||
mov [rsp+7*mmsize+10*gprsize], r4
|
||||
|
||||
mov r0, r8m
|
||||
mov r1, r9m
|
||||
mov r2, r10m
|
||||
mov r4, r11m
|
||||
mov r3, r12m
|
||||
mov [rsp+8*mmsize+11*gprsize], r0
|
||||
mov [rsp+8*mmsize+12*gprsize], r1
|
||||
mov [rsp+8*mmsize+13*gprsize], r2
|
||||
mov [rsp+8*mmsize+14*gprsize], r4
|
||||
mov [rsp+7*mmsize+11*gprsize], r0
|
||||
mov [rsp+7*mmsize+12*gprsize], r1
|
||||
mov [rsp+7*mmsize+13*gprsize], r2
|
||||
mov [rsp+7*mmsize+14*gprsize], r4
|
||||
%else
|
||||
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
|
||||
cglobal fguv_32x32xn_i%1_8bpc, 0, 7, 8, 7 * mmsize + (4) * gprsize, \
|
||||
tmp, src, scaling, h, fg_data, picptr, unused
|
||||
%endif
|
||||
mov srcq, srcm
|
||||
mov fg_dataq, r3m
|
||||
mov scalingq, r5m
|
||||
%if STACK_ALIGNMENT < mmsize
|
||||
%define r0m [rsp+8*mmsize+ 3*gprsize]
|
||||
%define r1m [rsp+8*mmsize+ 4*gprsize]
|
||||
%define r2m [rsp+8*mmsize+ 5*gprsize]
|
||||
%define r3m [rsp+8*mmsize+ 6*gprsize]
|
||||
%define r4m [rsp+8*mmsize+ 7*gprsize]
|
||||
%define r5m [rsp+8*mmsize+ 8*gprsize]
|
||||
%define r6m [rsp+8*mmsize+ 9*gprsize]
|
||||
%define r7m [rsp+8*mmsize+10*gprsize]
|
||||
%define r8m [rsp+8*mmsize+11*gprsize]
|
||||
%define r9m [rsp+8*mmsize+12*gprsize]
|
||||
%define r10m [rsp+8*mmsize+13*gprsize]
|
||||
%define r11m [rsp+8*mmsize+14*gprsize]
|
||||
%define r12m [rsp+8*mmsize+15*gprsize]
|
||||
%define r0m [rsp+7*mmsize+ 3*gprsize]
|
||||
%define r1m [rsp+7*mmsize+ 4*gprsize]
|
||||
%define r2m [rsp+7*mmsize+ 5*gprsize]
|
||||
%define r3m [rsp+7*mmsize+ 6*gprsize]
|
||||
%define r4m [rsp+7*mmsize+ 7*gprsize]
|
||||
%define r5m [rsp+7*mmsize+ 8*gprsize]
|
||||
%define r6m [rsp+7*mmsize+ 9*gprsize]
|
||||
%define r7m [rsp+7*mmsize+10*gprsize]
|
||||
%define r8m [rsp+7*mmsize+11*gprsize]
|
||||
%define r9m [rsp+7*mmsize+12*gprsize]
|
||||
%define r10m [rsp+7*mmsize+13*gprsize]
|
||||
%define r11m [rsp+7*mmsize+14*gprsize]
|
||||
%define r12m [rsp+7*mmsize+15*gprsize]
|
||||
%endif
|
||||
LEA r5, pb_mask
|
||||
%define base r5-pb_mask
|
||||
@@ -2133,7 +2128,6 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%define base r8-pb_mask
|
||||
%endif
|
||||
mov r6d, [fg_dataq+FGData.scaling_shift]
|
||||
pcmpeqw m2, m2
|
||||
movd m3, [base+mul_bits+r6*2-14]
|
||||
mov r6d, [fg_dataq+FGData.clip_to_restricted_range]
|
||||
lea tmpd, [r6d*2]
|
||||
@@ -2145,17 +2139,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
movd m5, [base+min+r6*2]
|
||||
cmovne r6d, tmpd
|
||||
movd m4, [base+max+r6*2]
|
||||
psrldq m2, 14+%2
|
||||
punpcklwd m3, m3
|
||||
punpcklwd m5, m5
|
||||
punpcklwd m4, m4
|
||||
pshufd m3, m3, q0000
|
||||
pshufd m5, m5, q0000
|
||||
pshufd m4, m4, q0000
|
||||
SCRATCH 2, 10, 0
|
||||
SCRATCH 3, 11, 1
|
||||
SCRATCH 4, 12, 2
|
||||
SCRATCH 5, 13, 3
|
||||
SCRATCH 3, 11, 0
|
||||
SCRATCH 4, 12, 1
|
||||
SCRATCH 5, 13, 2
|
||||
|
||||
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
|
||||
jne .csfl
|
||||
@@ -2177,8 +2169,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
punpcklwd m7, m7
|
||||
pshufd m6, m6, q0000
|
||||
pshufd m7, m7, q0000
|
||||
SCRATCH 6, 14, 4
|
||||
SCRATCH 7, 15, 5
|
||||
SCRATCH 6, 14, 3
|
||||
SCRATCH 7, 15, 4
|
||||
%endif
|
||||
|
||||
mov sbyd, r8m
|
||||
@@ -2187,22 +2179,21 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
jz %%no_vertical_overlap
|
||||
%if ARCH_X86_32
|
||||
%if %2
|
||||
movd m1, [base+pb_23_22]
|
||||
mova m1, [base+pb_23_22_h]
|
||||
%else
|
||||
movd m1, [base+pb_27_17_17_27]
|
||||
mova m1, [base+pb_27_17_17_27]
|
||||
%endif
|
||||
mova m0, [base+pw_1024]
|
||||
%else
|
||||
%if %2
|
||||
movd m1, [pb_23_22]
|
||||
mova m1, [pb_23_22_h]
|
||||
%else
|
||||
movd m1, [pb_27_17_17_27]
|
||||
mova m1, [pb_27_17_17_27]
|
||||
%endif
|
||||
mova m0, [pw_1024]
|
||||
%endif
|
||||
pshufd m1, m1, q0000
|
||||
SCRATCH 0, 8, 6
|
||||
SCRATCH 1, 9, 7
|
||||
SCRATCH 0, 8, 5
|
||||
SCRATCH 1, 9, 6
|
||||
test sbyd, sbyd
|
||||
jnz %%vertical_overlap
|
||||
; fall-through
|
||||
@@ -2347,16 +2338,13 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
|
||||
; scaling[luma_src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m7, m4, scalingq, r0, r5
|
||||
vpgatherdw m5, m6, scalingq, r0, r5
|
||||
vpgatherdw m7, m4, scalingq-1, r0, r5
|
||||
vpgatherdw m5, m6, scalingq-1, r0, r5
|
||||
%else
|
||||
vpgatherdw m7, m4, scalingq, r12, r2
|
||||
vpgatherdw m5, m6, scalingq, r12, r2
|
||||
vpgatherdw m7, m4, scalingq-1, r12, r2
|
||||
vpgatherdw m5, m6, scalingq-1, r12, r2
|
||||
%endif
|
||||
pcmpeqw m1, m1
|
||||
psrlw m1, 8
|
||||
pand m7, m1
|
||||
pand m5, m1
|
||||
REPX {psrlw x, 8}, m7, m5
|
||||
|
||||
; unpack chroma_source
|
||||
punpckhbw m1, m0, m2
|
||||
@@ -2426,7 +2414,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%if %2 == 0
|
||||
; adjust top_offxy
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
add dword [rsp+7*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add r11d, 16
|
||||
%endif
|
||||
@@ -2450,9 +2438,9 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%if ARCH_X86_32
|
||||
%if %2
|
||||
lea r6, [offxyd+16]
|
||||
mov [rsp+8*mmsize+0*gprsize], r6
|
||||
mov [rsp+7*mmsize+0*gprsize], r6
|
||||
%else
|
||||
mov [rsp+8*mmsize+0*gprsize], offxyd
|
||||
mov [rsp+7*mmsize+0*gprsize], offxyd
|
||||
%endif
|
||||
|
||||
DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
|
||||
@@ -2558,36 +2546,31 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
|
||||
; scaling[luma_src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m7, m4, scalingq, r0, r5
|
||||
vpgatherdw m5, m6, scalingq, r0, r5
|
||||
vpgatherdw m7, m4, scalingq-1, r0, r5
|
||||
vpgatherdw m5, m6, scalingq-1, r0, r5
|
||||
%else
|
||||
vpgatherdw m7, m4, scalingq, r12, r2
|
||||
vpgatherdw m5, m6, scalingq, r12, r2
|
||||
vpgatherdw m7, m4, scalingq-1, r12, r2
|
||||
vpgatherdw m5, m6, scalingq-1, r12, r2
|
||||
%endif
|
||||
pcmpeqw m1, m1
|
||||
psrlw m1, 8
|
||||
pand m7, m1
|
||||
pand m5, m1
|
||||
REPX {psrlw x, 8}, m7, m5
|
||||
|
||||
; unpack chroma_source
|
||||
punpckhbw m1, m0, m2
|
||||
punpcklbw m0, m2 ; m0-1: src as word
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq+ 0]
|
||||
movu m4, [grain_lutq+offxyq+ 0]
|
||||
%if ARCH_X86_32
|
||||
mov r0, [rsp+8*mmsize+0*gprsize]
|
||||
movd m4, [grain_lutq+r0+ 0]
|
||||
mov r0, [rsp+7*mmsize+0*gprsize]
|
||||
movd m2, [grain_lutq+r0+ 0]
|
||||
%else
|
||||
movd m4, [grain_lutq+left_offxyq+ 0]
|
||||
movd m2, [grain_lutq+left_offxyq+ 0]
|
||||
%endif
|
||||
punpcklbw m2, m4, m3
|
||||
pmaddubsw m4, m9, m2
|
||||
pmulhrsw m4, m8
|
||||
packsswb m4, m4
|
||||
pand m4, m10
|
||||
pandn m2, m10, m3
|
||||
por m3, m4, m2
|
||||
punpcklbw m2, m4
|
||||
pmaddubsw m3, m9, m2
|
||||
pmulhrsw m3, m8
|
||||
packsswb m3, m3
|
||||
shufps m3, m4, q3210
|
||||
pxor m4, m4
|
||||
pcmpgtb m4, m3
|
||||
punpcklbw m2, m3, m4
|
||||
@@ -2652,7 +2635,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
xor dword r8m, 4
|
||||
; adjust top_offxyd
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
add dword [rsp+7*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add r11d, 16
|
||||
%endif
|
||||
@@ -2780,7 +2763,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
movzx top_offxyd, offxyw
|
||||
shr offxyd, 16
|
||||
%if ARCH_X86_32
|
||||
mov [rsp+8*mmsize+1*gprsize], top_offxyd
|
||||
mov [rsp+7*mmsize+1*gprsize], top_offxyd
|
||||
|
||||
DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
|
||||
%endif
|
||||
@@ -2790,9 +2773,11 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m1, [base+pb_27_17]
|
||||
%endif
|
||||
%if %3
|
||||
mova m1, [PIC_ptr(pb_23_22)]
|
||||
%else
|
||||
mova m1, [pb_27_17]
|
||||
mova m1, [PIC_ptr(pb_27_17)]
|
||||
%endif
|
||||
%%loop_y_v_overlap:
|
||||
%if ARCH_X86_32
|
||||
@@ -2848,34 +2833,26 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
|
||||
; scaling[luma_src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m7, m4, scalingq, r0, r5
|
||||
vpgatherdw m5, m6, scalingq, r0, r5
|
||||
vpgatherdw m7, m4, scalingq-1, r0, r5
|
||||
vpgatherdw m5, m6, scalingq-1, r0, r5
|
||||
%else
|
||||
vpgatherdw m7, m4, scalingq, r12, r2
|
||||
vpgatherdw m5, m6, scalingq, r12, r2
|
||||
vpgatherdw m7, m4, scalingq-1, r12, r2
|
||||
vpgatherdw m5, m6, scalingq-1, r12, r2
|
||||
%endif
|
||||
pcmpeqw m4, m4
|
||||
psrlw m4, 8
|
||||
pand m7, m4
|
||||
pand m5, m4
|
||||
REPX {psrlw x, 8}, m7, m5
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
movu m3, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
mov r0, [rsp+8*mmsize+1*gprsize]
|
||||
mov r0, [rsp+7*mmsize+1*gprsize]
|
||||
movu m4, [grain_lutq+r0]
|
||||
%else
|
||||
movu m4, [grain_lutq+top_offxyq]
|
||||
%endif
|
||||
punpckhbw m6, m4, m3
|
||||
punpcklbw m4, m3
|
||||
%if %3
|
||||
pmaddubsw m2, m9, m6
|
||||
pmaddubsw m3, m9, m4
|
||||
%else
|
||||
pmaddubsw m2, m1, m6
|
||||
pmaddubsw m3, m1, m4
|
||||
%endif
|
||||
pmulhrsw m2, m8
|
||||
pmulhrsw m3, m8
|
||||
packsswb m3, m2
|
||||
@@ -2928,10 +2905,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
btc hd, 16
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m1, [base+pb_17_27]
|
||||
%else
|
||||
mova m1, [pb_17_27]
|
||||
%endif
|
||||
mova m1, [PIC_ptr(pb_17_27)]
|
||||
jnc %%loop_y_v_overlap
|
||||
%endif
|
||||
jmp %%loop_y
|
||||
@@ -2963,7 +2938,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
; h+v overlap
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
add dword [rsp+7*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
@@ -2976,15 +2951,15 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
|
||||
|
||||
mov r6, [rsp+8*mmsize+1*gprsize]
|
||||
mov r6, [rsp+7*mmsize+1*gprsize]
|
||||
%if %2
|
||||
lea r0, [r3d+16]
|
||||
add r6, 16
|
||||
mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy
|
||||
mov [rsp+7*mmsize+0*gprsize], r0 ; left_offxy
|
||||
%else
|
||||
mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy
|
||||
mov [rsp+7*mmsize+0*gprsize], r3 ; left_offxy
|
||||
%endif
|
||||
mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy
|
||||
mov [rsp+7*mmsize+2*gprsize], r6 ; topleft_offxy
|
||||
|
||||
DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
|
||||
|
||||
@@ -3048,18 +3023,55 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
movzx top_offxyd, offxyw
|
||||
shr offxyd, 16
|
||||
%if ARCH_X86_32
|
||||
mov [rsp+8*mmsize+1*gprsize], top_offxyd
|
||||
mov [rsp+7*mmsize+1*gprsize], top_offxyd
|
||||
%endif
|
||||
|
||||
mov hd, r7m
|
||||
mov grain_lutq, grain_lutmp
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m3, [base+pb_27_17]
|
||||
%endif
|
||||
%if %3
|
||||
mova m3, [PIC_ptr(pb_23_22)]
|
||||
%else
|
||||
mova m3, [pb_27_17]
|
||||
mova m3, [PIC_ptr(pb_27_17)]
|
||||
%endif
|
||||
%%loop_y_hv_overlap:
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if ARCH_X86_32
|
||||
mov r0, [rsp+7*mmsize+2*gprsize] ; topleft_offxy
|
||||
mov r5, [rsp+7*mmsize+1*gprsize] ; top_offxy
|
||||
movd m1, [grain_lutq+r0]
|
||||
mov r0, [rsp+7*mmsize+0*gprsize] ; left_offxy
|
||||
%else
|
||||
movd m1, [grain_lutq+topleft_offxyq]
|
||||
%endif
|
||||
movu m2, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
movu m6, [grain_lutq+r5]
|
||||
movd m4, [grain_lutq+r0]
|
||||
%else
|
||||
movu m6, [grain_lutq+top_offxyq]
|
||||
movd m4, [grain_lutq+left_offxyq]
|
||||
%endif
|
||||
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
|
||||
punpcklbw m1, m6
|
||||
punpcklbw m4, m2
|
||||
pmaddubsw m0, m9, m1
|
||||
pmaddubsw m1, m9, m4
|
||||
REPX {pmulhrsw x, m8}, m0, m1
|
||||
packsswb m0, m1
|
||||
shufps m4, m0, m2, q3232
|
||||
shufps m0, m6, q3210
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
punpcklbw m2, m0, m4
|
||||
punpckhbw m0, m4
|
||||
pmaddubsw m4, m3, m0
|
||||
pmaddubsw m1, m3, m2
|
||||
pmulhrsw m4, m8
|
||||
pmulhrsw m1, m8
|
||||
packsswb m1, m4
|
||||
|
||||
; src
|
||||
%if ARCH_X86_32
|
||||
DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
|
||||
@@ -3116,69 +3128,20 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
|
||||
; scaling[src]
|
||||
%if ARCH_X86_32
|
||||
vpgatherdw m7, m4, scalingq, r0, r5
|
||||
vpgatherdw m5, m6, scalingq, r0, r5
|
||||
vpgatherdw m7, m4, scalingq-1, r0, r5
|
||||
vpgatherdw m5, m6, scalingq-1, r0, r5
|
||||
%else
|
||||
movd m1, [grain_lutq+topleft_offxyq]
|
||||
%if %3
|
||||
vpgatherdw m7, m4, scalingq, r2, r12
|
||||
vpgatherdw m5, m6, scalingq, r2, r12
|
||||
vpgatherdw m7, m4, scalingq-1, r2, r12
|
||||
vpgatherdw m5, m6, scalingq-1, r2, r12
|
||||
%else
|
||||
vpgatherdw m7, m4, scalingq, r2, r13
|
||||
vpgatherdw m5, m6, scalingq, r2, r13
|
||||
vpgatherdw m7, m4, scalingq-1, r2, r13
|
||||
vpgatherdw m5, m6, scalingq-1, r2, r13
|
||||
%endif
|
||||
%endif
|
||||
pcmpeqw m2, m2
|
||||
psrlw m2, 8
|
||||
pand m7, m2
|
||||
pand m5, m2
|
||||
REPX {psrlw x, 8}, m7, m5
|
||||
|
||||
; grain = grain_lut[offy+y][offx+x]
|
||||
%if ARCH_X86_32
|
||||
mov r0, [rsp+8*mmsize+2*gprsize] ; topleft_offxy
|
||||
mov r5, [rsp+8*mmsize+1*gprsize] ; top_offxy
|
||||
movd m1, [grain_lutq+r0]
|
||||
mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy
|
||||
%endif
|
||||
movu m2, [grain_lutq+offxyq]
|
||||
%if ARCH_X86_32
|
||||
movu m6, [grain_lutq+r5]
|
||||
movd m4, [grain_lutq+r0]
|
||||
%else
|
||||
movu m6, [grain_lutq+top_offxyq]
|
||||
movd m4, [grain_lutq+left_offxyq]
|
||||
%endif
|
||||
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
|
||||
punpcklbw m1, m6
|
||||
punpcklbw m4, m2
|
||||
%if %2
|
||||
punpcklwd m4, m1
|
||||
%else
|
||||
punpckldq m4, m1
|
||||
%endif
|
||||
pmaddubsw m1, m9, m4
|
||||
pmulhrsw m1, m8
|
||||
packsswb m1, m1
|
||||
pandn m4, m10, m2
|
||||
pandn m2, m10, m6
|
||||
psrldq m6, m1, 2-%2
|
||||
pand m1, m10
|
||||
pand m6, m10
|
||||
por m4, m1
|
||||
por m2, m6
|
||||
; followed by v interpolation (top | cur -> cur)
|
||||
punpckhbw m1, m2, m4
|
||||
punpcklbw m2, m4
|
||||
%if %3
|
||||
pmaddubsw m4, m9, m1
|
||||
pmaddubsw m1, m9, m2
|
||||
%else
|
||||
pmaddubsw m4, m3, m1
|
||||
pmaddubsw m1, m3, m2
|
||||
%endif
|
||||
pmulhrsw m4, m8
|
||||
pmulhrsw m1, m8
|
||||
packsswb m1, m4
|
||||
; unpack grain
|
||||
pxor m4, m4
|
||||
pcmpgtb m4, m1
|
||||
punpcklbw m2, m1, m4
|
||||
@@ -3229,10 +3192,8 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
jle %%end_y_hv_overlap
|
||||
%if ARCH_X86_32
|
||||
mov r5, r5m
|
||||
mova m3, [base+pb_17_27]
|
||||
%else
|
||||
mova m3, [pb_17_27]
|
||||
%endif
|
||||
mova m3, [PIC_ptr(pb_17_27)]
|
||||
btc hd, 16
|
||||
jnc %%loop_y_hv_overlap
|
||||
%if ARCH_X86_64
|
||||
@@ -3268,7 +3229,7 @@ cglobal fguv_32x32xn_i%1_8bpc, 6, 15, 16, dst, src, stride, fg_data, w, scaling,
|
||||
jmp %%loop_x_hv_overlap
|
||||
%else
|
||||
%if ARCH_X86_32
|
||||
add dword [rsp+8*mmsize+1*gprsize], 16
|
||||
add dword [rsp+7*mmsize+1*gprsize], 16
|
||||
%else
|
||||
add top_offxyd, 16
|
||||
%endif
|
||||
|
||||
Reference in New Issue
Block a user