arm: Fix up code style slightly

The existing code strives to align operand columns so that the
widest register names fit, e.g. r10 on ARM (and similarly for x10
or q10 on AArch64), or v31.16b for AArch64 vectors.

Fix some cases where the current forms were clearly
inconsistent/wrong. Not all cases have been fixed up to match this
norm; only some individual ones that were clearly wrong have been
fixed.
This commit is contained in:
Martin Storsjö
2026-05-06 15:32:26 +00:00
parent 7b9ab8373e
commit 037430193a
10 changed files with 57 additions and 57 deletions
+2 -2
View File
@@ -540,7 +540,7 @@ L(ipred_dc_left_w64):
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
vst1.8 {d0, d1, d2, d3}, [r12, :128], r1
subs r4, r4, #4
subs r4, r4, #4
vst1.8 {d0, d1, d2, d3}, [r0, :128]!
vst1.8 {d0, d1, d2, d3}, [r12, :128]!
vst1.8 {d0, d1, d2, d3}, [r0, :128], r1
@@ -692,7 +692,7 @@ L(ipred_dc_w16):
2:
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
subs r4, r4, #4
subs r4, r4, #4
vst1.8 {d0, d1}, [r0, :128], r1
vst1.8 {d0, d1}, [r12, :128], r1
bgt 2b
+2 -2
View File
@@ -47,8 +47,8 @@ function sgr_box3_row_v_neon, export=1
vld1.16 {q15}, [r9]!
subs r4, r4, #8
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
vadd.i32 q8, q8, q10
vadd.i32 q9, q9, q11
vld1.32 {q12, q13}, [r0]!
+33 -33
View File
@@ -109,7 +109,7 @@ L(\type\()_tbl):
vst1.32 {d17[0]}, [r0, :32], r1
vst1.32 {d17[1]}, [r6, :32], r1
beq 0f
\type d18, d19, q0, q1, q2, q3
\type d18, d19, q0, q1, q2, q3
cmp r5, #8
vst1.32 {d18[0]}, [r0, :32], r1
vst1.32 {d18[1]}, [r6, :32], r1
@@ -119,7 +119,7 @@ L(\type\()_tbl):
\type d16, d17, q0, q1, q2, q3
vst1.32 {d16[0]}, [r0, :32], r1
vst1.32 {d16[1]}, [r6, :32], r1
\type d18, d19, q0, q1, q2, q3
\type d18, d19, q0, q1, q2, q3
vst1.32 {d17[0]}, [r0, :32], r1
vst1.32 {d17[1]}, [r6, :32], r1
vst1.32 {d18[0]}, [r0, :32], r1
@@ -288,7 +288,7 @@ L(w_mask_\type\()_tbl):
vadd.s16 d21, d22, d23
vpadd.s16 d20, d20, d21 // (128 - m) + (128 - n) (column wise addition)
vsub.s16 d20, d30, d20 // (256 - sign) - ((128 - m) + (128 - n))
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vrshrn.u16 d20, q10, #2 // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
vst1.32 {d20[0]}, [r6, :32]!
.endif
vst1.32 {d24[0]}, [r0, :32], r1
@@ -611,7 +611,7 @@ L(blend_h_tbl):
vld2.u8 {d2[], d3[]}, [r5, :16]!
vld1.u8 {d1}, [r2, :64]!
subs r4, r4, #2
vext.u8 d2, d2, d3, #4
vext.u8 d2, d2, d3, #4
vld1.32 {d0[]}, [r0, :32]
vsub.i8 d6, d22, d2
vld1.32 {d0[1]}, [r12, :32]
@@ -623,7 +623,7 @@ L(blend_h_tbl):
bgt 4b
pop {r4-r5,pc}
80:
vmov.i8 q8, #64
vmov.i8 q8, #64
add r12, r0, r1
lsl r1, r1, #1
8:
@@ -2498,8 +2498,8 @@ L(\type\()_bilin_h_tbl):
2:
vld1.32 {d4[]}, [\src], \s_strd
vld1.32 {d6[]}, [\sr2], \s_strd
vext.8 d5, d4, d4, #1
vext.8 d7, d6, d6, #1
vext.8 d5, d4, d4, #1
vext.8 d7, d6, d6, #1
vtrn.16 q2, q3
subs \h, \h, #2
vmull.u8 q3, d4, d0
@@ -2519,8 +2519,8 @@ L(\type\()_bilin_h_tbl):
4:
vld1.8 {d4}, [\src], \s_strd
vld1.8 {d6}, [\sr2], \s_strd
vext.8 d5, d4, d4, #1
vext.8 d7, d6, d6, #1
vext.8 d5, d4, d4, #1
vext.8 d7, d6, d6, #1
vtrn.32 q2, q3
subs \h, \h, #2
vmull.u8 q3, d4, d0
@@ -2552,8 +2552,8 @@ L(\type\()_bilin_h_tbl):
vmlal.u8 q8, d18, d1
vmlal.u8 q10, d22, d1
.ifc \type, put
vqrshrn.u16 d16, q8, #4
vqrshrn.u16 d18, q10, #4
vqrshrn.u16 d16, q8, #4
vqrshrn.u16 d18, q10, #4
vst1.8 {d16}, [\dst, :64], \d_strd
vst1.8 {d18}, [\ds2, :64], \d_strd
.else
@@ -2712,7 +2712,7 @@ L(\type\()_bilin_v_tbl):
vst1.16 {d5}, [\ds2, :64], \d_strd
.endif
ble 0f
vmov d16, d18
vmov d16, d18
b 4b
0:
pop {r4-r11,pc}
@@ -2881,8 +2881,8 @@ L(\type\()_bilin_hv_tbl):
vmov d17, d18
vmul.u16 q10, q8, q2
vmla.u16 q10, q9, q3
vmul.u16 q10, q8, q2
vmla.u16 q10, q9, q3
subs \h, \h, #2
.ifc \type, put
vqrshrn.u16 d20, q10, #8
@@ -3054,8 +3054,8 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
ldr r6, [sp, #108]
ldrd r8, r9, [r4]
sxth r7, r8
asr r8, r8, #16
asr r4, r9, #16
asr r8, r8, #16
asr r4, r9, #16
sxth r9, r9
mov r10, #8
sub r2, r2, r3, lsl #1
@@ -3107,26 +3107,26 @@ function warp_affine_8x8\t\()_8bpc_neon, export=1
// This ordering of vmull/vmlal is highly beneficial for
// Cortex A8/A9/A53 here, but harmful for Cortex A7.
vmull.s16 q0, d16, d2
vmlal.s16 q0, d18, d4
vmlal.s16 q0, d20, d6
vmlal.s16 q0, d22, d8
vmlal.s16 q0, d24, d10
vmlal.s16 q0, d26, d12
vmull.s16 q1, d17, d3
vmlal.s16 q1, d19, d5
vmlal.s16 q1, d21, d7
vmlal.s16 q1, d23, d9
vmlal.s16 q1, d25, d11
vmlal.s16 q1, d27, d13
vmull.s16 q0, d16, d2
vmlal.s16 q0, d18, d4
vmlal.s16 q0, d20, d6
vmlal.s16 q0, d22, d8
vmlal.s16 q0, d24, d10
vmlal.s16 q0, d26, d12
vmull.s16 q1, d17, d3
vmlal.s16 q1, d19, d5
vmlal.s16 q1, d21, d7
vmlal.s16 q1, d23, d9
vmlal.s16 q1, d25, d11
vmlal.s16 q1, d27, d13
vmovl.s8 q2, d14
vmovl.s8 q3, d15
vmlal.s16 q0, d28, d4
vmlal.s16 q0, d30, d6
vmlal.s16 q1, d29, d5
vmlal.s16 q1, d31, d7
vmlal.s16 q0, d28, d4
vmlal.s16 q0, d30, d6
vmlal.s16 q1, d29, d5
vmlal.s16 q1, d31, d7
.ifb \t
vmov.i16 q7, #128
@@ -3318,7 +3318,7 @@ function emu_edge_8bpc_neon, export=1
subs r3, r3, #1
vst1.8 {q0, q1}, [r6, :128], r7
bgt 2b
mls r6, r7, r10, r6 // dst -= bottom_ext * stride
mls r6, r7, r10, r6 // dst -= bottom_ext * stride
subs r4, r4, #32 // bw -= 32
add r6, r6, #32 // dst += 32
bgt 1b
+1 -1
View File
@@ -366,7 +366,7 @@ function msac_decode_hi_tok_neon, export=1
add r5, r0, #DIF + 2
vld1.16 {q8}, [r4, :128]
mov r2, #-24
vand d20, d0, d30 // cdf & 0xffc0
vand d20, d0, d30 // cdf & 0xffc0
ldr r10, [r0, #ALLOW_UPDATE_CDF]
vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16)
sub sp, sp, #48
+5 -5
View File
@@ -133,10 +133,10 @@ function save_tmvs_neon, export=1
and r9, r7, #30 // (y & 15) * 2
ldr r9, [r2, r9, lsl #2] // b = rr[(y & 15) * 2]
add r9, r9, #12 // &b[... + 1]
mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
mla r10, r4, r11, r9 // end_cand_b = &b[col_end8*2 + 1]
mla r9, r6, r11, r9 // cand_b = &b[x*2 + 1]
mla r3, r6, r3, r0 // &rp[x]
mla r3, r6, r3, r0 // &rp[x]
push {r2,r4,r6}
@@ -175,8 +175,8 @@ function save_tmvs_neon, export=1
vmov.u16 r6, d2[1]
ldr r11, [r11, #4] // Fetch jump table entry
ldr r2, [r2, #4]
add r4, r12, r4, lsl #4
add r6, r12, r6, lsl #4
add r4, r12, r4, lsl #4
add r6, r12, r6, lsl #4
vld1.8 {d2, d3}, [r4] // Load permutation table base on case
vld1.8 {d4, d5}, [r6]
add r11, r8, r11 // Find jump table target
+2 -2
View File
@@ -695,7 +695,7 @@ function gen_grain_uv_420_lag0_4_neon
str x30, [sp, #-16]!
ld1 {v16.4h, v17.4h}, [x19]
ld1 {v18.4h, v19.4h}, [x12]
add x19, x19, #32
add x19, x19, #32
addp v16.4h, v16.4h, v17.4h
addp v17.4h, v18.4h, v19.4h
add v16.4h, v16.4h, v17.4h
@@ -708,7 +708,7 @@ function gen_grain_uv_422_lag0_4_neon
AARCH64_SIGN_LINK_REGISTER
str x30, [sp, #-16]!
ld1 {v16.4h, v17.4h}, [x19]
add x19, x19, #32
add x19, x19, #32
addp v16.4h, v16.4h, v17.4h
srshr v4.4h, v16.4h, #1
get_grain_4 v0
+2 -2
View File
@@ -545,7 +545,7 @@ endfunc
sqrshrn2 \o0\().8h, v17.4s, #12
.ifc \o2, v17
mov v17.16b, v18.16b
mov v17.16b, v18.16b
.endif
sqrshrn \o1\().4h, v6.4s, #12
@@ -1993,7 +1993,7 @@ function inv_dct32_odd_8h_x16_neon, export=1
smull_smlal v4, v5, v25, v27, v0.h[0], v0.h[0], .8h // -> t26a
smull_smlsl v6, v7, v25, v27, v0.h[0], v0.h[0], .8h // -> t21a
mov v27.16b, v22.16b // t27
mov v27.16b, v22.16b // t27
sqrshrn_sz v26, v4, v5, #12, .8h // t26a
smull_smlsl v24, v25, v21, v23, v0.h[0], v0.h[0], .8h // -> t22
+4 -4
View File
@@ -305,7 +305,7 @@ function lpf_16_wd\wd\()_neon
rshrn2 v0.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h , v9.8h, v3.8h
add v9.8h, v9.8h, v3.8h
bit v21.16b, v10.16b, v14.16b
bit v22.16b, v11.16b, v14.16b
@@ -696,7 +696,7 @@ function lpf_v_8_16_neon
lpf_16_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
sub x16, x16, x1
st1 {v21.16b}, [x16], x1 // p2
st1 {v24.16b}, [x0], x1 // q0
st1 {v22.16b}, [x16], x1 // p1
@@ -817,7 +817,7 @@ function lpf_v_16_16_neon
lpf_16_wd16
sub x16, x0, x1, lsl #2
sub x16, x16, x1, lsl #1
sub x16, x16, x1, lsl #1
st1 {v0.16b}, [x16], x1 // p5
st1 {v6.16b}, [x0], x1 // q0
st1 {v1.16b}, [x16], x1 // p4
@@ -1051,7 +1051,7 @@ function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
adds x16, x16, x17
b.eq 7f // if (!L) continue;
neg v5.16b, v5.16b // -sharp[0]
movrel x16, word_1248
movrel x16, word_1248
ushr v12.16b, v1.16b, #4 // H
ld1 {v16.4s}, [x16]
sshl v3.16b, v1.16b, v5.16b // L >> sharp[0]
+3 -3
View File
@@ -550,7 +550,7 @@ function lpf_v_8_8_neon
lpf_8_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
sub x16, x16, x1
st1 {v21.8h}, [x16], x1 // p2
st1 {v24.8h}, [x0], x1 // q0
st1 {v22.8h}, [x16], x1 // p1
@@ -781,7 +781,7 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
mov w8, w7 // bitdepth_max
clz w9, w8
mov w10, #24
sub w9, w10, w9 // bitdepth_min_8
sub w9, w10, w9 // bitdepth_min_8
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
@@ -836,7 +836,7 @@ function lpf_\dir\()_sb_\type\()_16bpc_neon, export=1
cmp x16, #0
b.eq 7f // if (!L) continue;
neg v5.8b, v5.8b // -sharp[0]
movrel x16, word_12
movrel x16, word_12
ushr v12.8b, v1.8b, #4 // H
ld1 {v16.2s}, [x16]
sshl v3.8b, v1.8b, v5.8b // L >> sharp[0]
+3 -3
View File
@@ -133,10 +133,10 @@ function save_tmvs_neon, export=1
and w9, w7, #30 // (y & 15) * 2
ldr x9, [x2, w9, uxtw #3] // b = rr[(y & 15) * 2]
add x9, x9, #12 // &b[... + 1]
madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]
madd x10, x4, x14, x9 // end_cand_b = &b[col_end8*2 + 1]
madd x9, x6, x14, x9 // cand_b = &b[x*2 + 1]
madd x3, x6, x15, x0 // &rp[x]
madd x3, x6, x15, x0 // &rp[x]
2:
ldrb w11, [x9, #10] // cand_b->bs