x86: add AVX2 versions for filmgrain.fguv_32x32xn[422/444]

fguv_32x32xn_8bpc_420_csfl0_c: 14568.2
fguv_32x32xn_8bpc_420_csfl0_avx2: 940.2
fguv_32x32xn_8bpc_420_csfl1_c: 10682.0
fguv_32x32xn_8bpc_420_csfl1_avx2: 783.3
fguv_32x32xn_8bpc_422_csfl0_c: 16370.5
fguv_32x32xn_8bpc_422_csfl0_avx2: 1557.3
fguv_32x32xn_8bpc_422_csfl1_c: 11333.8
fguv_32x32xn_8bpc_422_csfl1_avx2: 902.1
fguv_32x32xn_8bpc_444_csfl0_c: 12950.1
fguv_32x32xn_8bpc_444_csfl0_avx2: 822.9
fguv_32x32xn_8bpc_444_csfl1_c: 8806.7
fguv_32x32xn_8bpc_444_csfl1_avx2: 708.2
This commit is contained in:
Ronald S. Bultje
2020-04-01 10:50:02 -04:00
parent fcc94fa905
commit 275e91de9e
2 changed files with 320 additions and 54 deletions
+316 -54
View File
@@ -28,6 +28,8 @@
%if ARCH_X86_64
SECTION_RODATA 32
; Byte weights for the vertical overlap blend when chroma is not vertically
; subsampled. They are applied with pmaddubsw to interleaved {top, cur} grain
; bytes and the sums are then rounded with pmulhrsw by 1024.
pb_8x_27_17_8x_17_27: times 8 db 27, 17 ; first 16 bytes: 27*top + 17*cur (first overlapped row)
times 8 db 17, 27 ; next 16 bytes (+16): 17*top + 27*cur (second overlapped row)
pw_1024: times 16 dw 1024
pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
@@ -1457,8 +1459,9 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
.end_hv:
RET
cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, h, sby, luma, lstride, uv_pl, is_id
%macro FGUV_FN 3 ; name, ss_hor, ss_ver
cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
grain_lut, h, sby, luma, lstride, uv_pl, is_id
pcmpeqw m10, m10
psrld m10, 24
mov r7d, [fg_dataq+FGData.scaling_shift]
@@ -1474,7 +1477,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
jne .csfl
%macro FGUV_32x32xN_LOOP 1 ; not-csfl
%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
%if %1
@@ -1485,7 +1488,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4]
%else
vpbroadcastd m14, [pw_1024]
%if %2
vpbroadcastd m15, [pb_23_22]
%else
vpbroadcastd xm15, [pb_27_17_17_27]
%endif
%endif
mov overlapd, [fg_dataq+FGData.overlap_flag]
@@ -1507,7 +1514,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov lumaq, r9mp
lea r12, [srcq+wq]
lea r13, [dstq+wq]
lea r14, [lumaq+wq*2]
lea r14, [lumaq+wq*(1+%2)]
mov r11mp, r12
mov r12mp, r13
mov lstrideq, r10mp
@@ -1528,8 +1535,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
rorx offyd, seed, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 82
lea offyq, [offyq+offxq+498] ; offy*stride+offx
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, overlap, unused1, unused2, lstride
@@ -1538,21 +1545,29 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov grain_lutq, grain_lutmp
%%loop_y:
; src
%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
vinserti128 m4, [lumaq+lstrideq*2 +0], 1
vinserti128 m6, [lumaq+lstrideq*2+16], 1
vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
pxor m2, m2
mova m4, [lumaq]
mova m0, [srcq]
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -1564,6 +1579,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -1592,8 +1610,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
%if %2
movu xm3, [grain_lutq+offxyq+ 0]
vinserti128 m3, [grain_lutq+offxyq+82], 1
%else
movu m3, [grain_lutq+offxyq]
%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1612,21 +1634,31 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
%if %2
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
%else
mova [dstq], m0
%endif
%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
lea lumaq, [lumaq+lstrideq*4]
add grain_lutq, 82*2
sub hb, 2
lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
add srcq, strideq
add dstq, strideq
add lumaq, lstrideq
%endif
add grain_lutq, 82<<%2
sub hb, 1+%2
jg %%loop_y
add wq, 16
add wq, 32>>%2
jge %%end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*2]
lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
test overlapd, overlapd
@@ -1648,13 +1680,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, unused1, unused2, lstride
lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx
lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx
mov offxd, seed
rorx offyd, seed, 8
shr offxd, 12
and offyd, 0xf
imul offyd, 82
lea offyq, [offyq+offxq+498] ; offy*stride+offx
imul offyd, 164>>%3
lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, left_offxy, unused1, unused2, lstride
@@ -1663,21 +1695,29 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov grain_lutq, grain_lutmp
%%loop_y_h_overlap:
; src
%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
vinserti128 m4, [lumaq+lstrideq*2 +0], 1
vinserti128 m6, [lumaq+lstrideq*2+16], 1
vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -1689,6 +1729,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -1717,6 +1760,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
punpcklbw m0, m2 ; m0-1: src as word
; grain = grain_lut[offy+y][offx+x]
%if %2
%if %1
vpbroadcastd m6, [pb_23_22] ; FIXME
%endif
@@ -1736,6 +1780,25 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pcmpeqw m6, m6 ; FIXME
psrldq m6, 15 ; FIXME
vpblendvb m3, m3, m4, m6
%else
%if %1
vpbroadcastd xm6, [pb_27_17_17_27]
%endif
movu m3, [grain_lutq+offxyq]
movd xm4, [grain_lutq+left_offxyq]
punpcklbw xm4, xm3
%if %1
pmaddubsw xm4, xm6, xm4
pmulhrsw xm4, [pw_1024]
%else
pmaddubsw xm4, xm15, xm4
pmulhrsw xm4, xm14
%endif
packsswb xm4, xm4
pcmpeqw xm6, xm6
psrldq xm6, 14
vpblendvb m3, m3, m4, m6
%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1754,21 +1817,31 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pminsw m0, m12
pminsw m1, m12
packuswb m0, m1
%if %2
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
%else
mova [dstq], m0
%endif
%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
lea lumaq, [lumaq+lstrideq*4]
add grain_lutq, 82*2
sub hb, 2
lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
add srcq, strideq
add dstq, strideq
add lumaq, lstrideq
%endif
add grain_lutq, 82*(1+%2)
sub hb, 1+%2
jg %%loop_y_h_overlap
add wq, 16
add wq, 32>>%2
jge %%end
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*2]
lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
@@ -1801,7 +1874,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov lumaq, r9mp
lea r12, [srcq+wq]
lea r13, [dstq+wq]
lea r14, [lumaq+wq*2]
lea r14, [lumaq+wq*(1+%2)]
mov r11mp, r12
mov r12mp, r13
mov lstrideq, r10mp
@@ -1828,9 +1901,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
rorx offxd, seed, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 82
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq+0x10001*498+16*82]
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, overlap, top_offxy, unused, lstride
@@ -1840,23 +1913,34 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov hd, hm
mov grain_lutq, grain_lutmp
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_v_overlap:
; src
%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
vinserti128 m4, [lumaq+lstrideq*2 +0], 1
vinserti128 m6, [lumaq+lstrideq*2+16], 1
vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -1868,6 +1952,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -1891,11 +1978,42 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packusdw m8, m4
packusdw m5, m6
%if %2
; unpack chroma_source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
%endif
; grain = grain_lut[offy+y][offx+x]
%if %3 == 0
%if %2
mova m6, [pb_8x_27_17_8x_17_27]
movu xm3, [grain_lutq+offxyq]
movu xm4, [grain_lutq+top_offxyq]
vinserti128 m3, [grain_lutq+offxyq+82], 1
vinserti128 m4, [grain_lutq+top_offxyq+82], 1
%else
movu m3, [grain_lutq+offxyq]
movu m4, [grain_lutq+top_offxyq]
%endif
punpckhbw m9, m4, m3
punpcklbw m4, m3
%if %2
pmaddubsw m9, m6, m9
pmaddubsw m4, m6, m4
%else
pmaddubsw m9, m1, m9
pmaddubsw m4, m1, m4
%endif
%if %1
pmulhrsw m9, [pw_1024]
pmulhrsw m4, [pw_1024]
%else
pmulhrsw m9, m14
pmulhrsw m4, m14
%endif
packsswb m3, m4, m9
%else
%if %1
vpbroadcastd m6, [pb_23_22]
%endif
@@ -1915,6 +2033,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
vpermq m4, m4, q3120
; only interpolate first line, insert second line unmodified
vinserti128 m3, m4, [grain_lutq+offxyq+82], 1
%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -1926,6 +2045,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmulhrsw m3, m11
; dst = clip_pixel(src, noise)
%if %2
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
@@ -1935,21 +2055,46 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m0, m1
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
%else
pxor m6, m6
punpckhbw m9, m0, m6
punpcklbw m0, m6 ; m0-1: src as word
sub hb, 2
paddw m0, m2
paddw m9, m3
pmaxsw m0, m13
pmaxsw m9, m13
pminsw m0, m12
pminsw m9, m12
packuswb m0, m9
mova [dstq], m0
%endif
sub hb, 1+%2
jl %%end_y_v_overlap
%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
lea lumaq, [lumaq+lstrideq*4]
add grain_lutq, 82*2
lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
add srcq, strideq
add dstq, strideq
add lumaq, lstrideq
%endif
add grain_lutq, 82<<%2
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
btc hd, 16
jnc %%loop_y_v_overlap
%endif
jmp %%loop_y
%%end_y_v_overlap:
add wq, 16
add wq, 32>>%2
jge %%end_hv
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*2]
lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
@@ -1974,15 +2119,15 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
lea topleft_offxyq, [top_offxyq+16]
lea left_offxyq, [offyq+16]
lea topleft_offxyq, [top_offxyq+(32>>%2)]
lea left_offxyq, [offyq+(32>>%2)]
rorx offyd, seed, 8
rorx offxd, seed, 12
and offyd, 0xf000f
and offxd, 0xf000f
imul offyd, 82
imul offyd, 164>>%3
; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
lea offyq, [offyq+offxq+0x10001*498+16*82]
lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
@@ -1992,23 +2137,34 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
mov hd, hm
mov grain_lutq, grain_lutmp
%if %2 == 0
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27]
%endif
%%loop_y_hv_overlap:
; src
%if %2
mova xm4, [lumaq+lstrideq*0+ 0]
mova xm6, [lumaq+lstrideq*0+16]
mova xm0, [srcq]
vpbroadcastd m7, [pb_1]
vinserti128 m4, [lumaq+lstrideq*2 +0], 1
vinserti128 m6, [lumaq+lstrideq*2+16], 1
vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1
vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1
vinserti128 m0, [srcq+strideq], 1
pxor m2, m2
pmaddubsw m4, m7
pmaddubsw m6, m7
pavgw m4, m2
pavgw m6, m2
%else
mova m4, [lumaq]
mova m0, [srcq]
pxor m2, m2
%endif
%if %1
%if %2
packuswb m4, m6 ; luma
%endif
punpckhbw m6, m4, m0
punpcklbw m4, m0 ; { luma, chroma }
pmaddubsw m6, m14
@@ -2020,6 +2176,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m4, m6 ; pack+unpack = clip
punpckhbw m6, m4, m2
punpcklbw m4, m2
%elif %2 == 0
punpckhbw m6, m4, m2
punpcklbw m4, m2
%endif
punpckhwd m5, m4, m2
@@ -2043,44 +2202,94 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packusdw m8, m4
packusdw m5, m6
%if %2
; unpack chroma source
punpckhbw m1, m0, m2
punpcklbw m0, m2 ; m0-1: src as word
%endif
; grain = grain_lut[offy+y][offx+x]
%if %1
%if %2
vpbroadcastd m9, [pb_23_22]
%else
vpbroadcastd xm9, [pb_27_17_17_27]
%endif
%endif
%if %2
movu xm3, [grain_lutq+offxyq]
%if %3
movq xm6, [grain_lutq+top_offxyq]
%else
movu xm6, [grain_lutq+top_offxyq]
%endif
vinserti128 m3, [grain_lutq+offxyq+82], 1
%if %3
vinserti128 m6, [grain_lutq+top_offxyq+8], 1
%else
vinserti128 m6, [grain_lutq+top_offxyq+82], 1
%endif
%else
movu m3, [grain_lutq+offxyq]
movu m6, [grain_lutq+top_offxyq]
%endif
movd xm4, [grain_lutq+left_offxyq]
movd xm7, [grain_lutq+topleft_offxyq]
%if %2
vinserti128 m4, [grain_lutq+left_offxyq+82], 1
%if %3 == 0
vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1
%endif
%endif
; do h interpolation first (so top | top/left -> top, left | cur -> cur)
%if %2
punpcklbw m4, m3
%if %3
punpcklbw xm7, xm6
%else
punpcklbw m7, m6
%endif
punpcklwd m4, m7
%if %1
pmaddubsw m4, m9, m4
pmaddubsw xm7, xm9, xm7
pmulhrsw m4, [pw_1024]
pmulhrsw xm7, [pw_1024]
%else
pmaddubsw m4, m15, m4
pmaddubsw xm7, xm15, xm7
pmulhrsw m4, m14
pmulhrsw xm7, xm14
%endif
packsswb m4, m4
packsswb xm7, xm7
pcmpeqw m9, m9 ; this is kind of ugly
psrldq m9, 15
vpblendvb m3, m3, m4, m9
shufpd m9, m9, m9, 1110b
vpblendvb m6, m6, m7, m9
vpermq m9, m3, q3120
psrldq m4, 1
%if %3
shufpd m9, m9, m9, 1110b ; clear upper lane
%endif
vpblendvb m6, m6, m4, m9
%else
punpcklbw xm4, xm3
punpcklbw xm7, xm6
punpckldq xm4, xm7
%if %1
pmaddubsw xm4, xm9, xm4
pmulhrsw xm4, [pw_1024]
%else
pmaddubsw xm4, xm15, xm4
pmulhrsw xm4, xm14
%endif
packsswb xm4, xm4
pcmpeqw xm9, xm9 ; this is kind of ugly
psrldq xm9, 14
vpblendvb m3, m3, m4, m9
psrldq xm4, 2
vpblendvb m6, m6, m4, m9
%endif
; followed by v interpolation (top | cur -> cur)
%if %3
vpermq m9, m3, q3120
punpcklbw m6, m9
%if %1
vpbroadcastd m9, [pb_23_22]
@@ -2093,6 +2302,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packsswb m6, m6
vpermq m6, m6, q3120
vpblendd m3, m3, m6, 00001111b
%else
punpckhbw m9, m6, m3
punpcklbw m6, m3
%if %2
mova m3, [pb_8x_27_17_8x_17_27]
pmaddubsw m9, m3, m9
pmaddubsw m6, m3, m6
%else
pmaddubsw m9, m1, m9
pmaddubsw m6, m1, m6
%endif
%if %1
pmulhrsw m9, [pw_1024]
pmulhrsw m6, [pw_1024]
%else
pmulhrsw m9, m14
pmulhrsw m6, m14
%endif
packsswb m3, m6, m9
%endif
pcmpgtb m7, m2, m3
punpcklbw m2, m3, m7
punpckhbw m3, m7
@@ -2104,6 +2333,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
pmulhrsw m3, m11
; dst = clip_pixel(src, noise)
%if %2
paddw m0, m2
paddw m1, m3
pmaxsw m0, m13
@@ -2113,20 +2343,47 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
packuswb m0, m1
mova [dstq], xm0
vextracti128 [dstq+strideq], m0, 1
%else
pxor m6, m6
punpckhbw m9, m0, m6
punpcklbw m0, m6 ; m0-1: src as word
paddw m0, m2
paddw m9, m3
pmaxsw m0, m13
pmaxsw m9, m13
pminsw m0, m12
pminsw m9, m12
packuswb m0, m9
mova [dstq], m0
%endif
%if %2
lea srcq, [srcq+strideq*2]
lea dstq, [dstq+strideq*2]
lea lumaq, [lumaq+lstrideq*4]
add grain_lutq, 82*2
sub hb, 2
lea lumaq, [lumaq+lstrideq*(2<<%3)]
%else
add srcq, strideq
add dstq, strideq
add lumaq, lstrideq
%endif
add grain_lutq, 82<<%2
sub hb, 1+%2
%if %2
jg %%loop_y_h_overlap
%else
je %%end_y_hv_overlap
vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16]
btc hd, 16
jnc %%loop_y_hv_overlap
jmp %%loop_y_h_overlap
%endif
%%end_y_hv_overlap:
add wq, 16
add wq, 32>>%2
jge %%end_hv
mov srcq, r11mp
mov dstq, r12mp
lea lumaq, [r14+wq*2]
lea lumaq, [r14+wq*(1+%2)]
add srcq, wq
add dstq, wq
jmp %%loop_x_hv_overlap
@@ -2135,8 +2392,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
RET
%endmacro
FGUV_32x32xN_LOOP 1
%%FGUV_32x32xN_LOOP 1, %2, %3
.csfl:
FGUV_32x32xN_LOOP 0
%%FGUV_32x32xN_LOOP 0, %2, %3
%endmacro
; Instantiate one fguv_32x32xn_i<name> function per chroma layout.
; Arguments follow the FGUV_FN macro signature: name, ss_hor, ss_ver.
FGUV_FN 420, 1, 1 ; ss_hor=1, ss_ver=1
FGUV_FN 422, 1, 0 ; ss_hor=1, ss_ver=0
FGUV_FN 444, 0, 0 ; ss_hor=0, ss_ver=0
%endif ; ARCH_X86_64
+4
View File
@@ -41,6 +41,8 @@ decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -65,5 +67,7 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
#endif
}