deblock-a.S 29 KB


  1. /*****************************************************************************
  2. * deblock.S: aarch64 deblocking
  3. *****************************************************************************
  4. * Copyright (C) 2009-2018 x264 project
  5. *
  6. * Authors: Mans Rullgard <mans@mansr.com>
  7. * Janne Grunau <janne-x264@jannau.net>
  8. *
  9. * This program is free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * This program is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU General Public License
  20. * along with this program; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  22. *
  23. * This program is also available under a commercial proprietary license.
  24. * For more information, contact us at licensing@x264.com.
  25. *****************************************************************************/
  26. #include "asm.S"
  // h264_loop_filter_start: common entry check for the non-intra filters.
  //
  // In      : w2 = alpha, w3 = beta, x4 = address of a 32-bit word holding
  //           four clip bytes (presumably tc0[4] - confirm against the caller).
  // Out     : w6 and v24.s[0] = the loaded clip word.
  // Clobbers: w8, condition flags.
  //
  // Executes `ret`, i.e. leaves the function that expanded this macro, when
  //   - alpha == 0 or beta == 0, or
  //   - the top bit of all four clip bytes is set (all four are negative).
  // Otherwise execution continues after local label 2.
  .macro h264_loop_filter_start
      cmp             w2,  #0
      ldr             w6,  [x4]
      // alpha != 0: compare beta with 0.
      // alpha == 0: load nzcv = #4 (Z set) so that b.eq below returns as well;
      // an alpha of 0 can never exceed abs(p0 - q0), so no pixel would change.
      ccmp            w3,  #0,  #4,  ne
      mov             v24.s[0], w6
      and             w8,  w6,  w6,  lsl #16          // bytes 3&1 / 2&0 (flags kept)
      b.eq            1f
      ands            w8,  w8,  w8,  lsl #8           // bit 31 = AND of all 4 top bits
      b.ge            2f                              // N clear: some clip byte >= 0
  1:
      ret
  2:
  .endm
  // h264_loop_filter_luma: luma edge filter shared by deblock_v_luma_neon and
  // deblock_h_luma_neon; works on 16 pixels at once.
  //
  // In      : v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2
  //           w2 = alpha, w3 = beta, v24.s[0] = four clip bytes (tc)
  // Out     : v17 = p1', v16 = p0', v0 = q0', v19 = q1'
  // Clobbers: v4, v20-v24, v28, v30
  //
  // Mask computation, the p1/q1 update and the p0/q0 delta are interleaved;
  // the trailing comments say what each line contributes.
  .macro h264_loop_filter_luma
      dup             v22.16b, w2                     // alpha in every lane
      uxtl            v24.8h,  v24.8b                 // tc: bytes -> halfwords
      uabd            v21.16b, v16.16b, v0.16b        // |p0 - q0|
      uxtl            v24.4s,  v24.4h                 // tc: halfwords -> words
      uabd            v28.16b, v18.16b, v16.16b       // |p1 - p0|
      sli             v24.8h,  v24.8h,  #8            // tc: copy byte into bits 15:8
      uabd            v30.16b, v2.16b,  v0.16b        // |q1 - q0|
      sli             v24.4s,  v24.4s,  #16           // tc: each byte now fills 4 lanes
      cmhi            v21.16b, v22.16b, v21.16b       // alpha > |p0 - q0|
      dup             v22.16b, w3                     // beta in every lane
      cmlt            v23.16b, v24.16b, #0            // lanes whose tc is negative
      cmhi            v28.16b, v22.16b, v28.16b       // beta > |p1 - p0|
      cmhi            v30.16b, v22.16b, v30.16b       // beta > |q1 - q0|
      bic             v21.16b, v21.16b, v23.16b       // mask: drop lanes with tc < 0
      uabd            v17.16b, v20.16b, v16.16b       // |p2 - p0|
      and             v21.16b, v21.16b, v28.16b
      uabd            v19.16b, v4.16b,  v0.16b        // |q2 - q0|
      cmhi            v17.16b, v22.16b, v17.16b       // beta > |p2 - p0|
      and             v21.16b, v21.16b, v30.16b       // v21 = filter mask
      cmhi            v19.16b, v22.16b, v19.16b       // beta > |q2 - q0|
      and             v17.16b, v17.16b, v21.16b       // v17 = lanes where p1 is updated
      and             v19.16b, v19.16b, v21.16b       // v19 = lanes where q1 is updated
      and             v24.16b, v24.16b, v21.16b       // tc = 0 outside the filter mask
      urhadd          v28.16b, v16.16b, v0.16b        // avg = (p0 + q0 + 1) >> 1
      sub             v21.16b, v24.16b, v17.16b       // bound = tc + 1 where p1 updated
      uqadd           v23.16b, v18.16b, v24.16b       // p1 + tc (saturating)
      uhadd           v20.16b, v20.16b, v28.16b       // (p2 + avg) >> 1
      sub             v21.16b, v21.16b, v19.16b       // bound += 1 where q1 updated
      uhadd           v28.16b, v4.16b,  v28.16b       // (q2 + avg) >> 1
      umin            v23.16b, v23.16b, v20.16b
      uqsub           v22.16b, v18.16b, v24.16b       // p1 - tc (saturating)
      uqadd           v4.16b,  v2.16b,  v24.16b       // q1 + tc (saturating)
      umax            v23.16b, v23.16b, v22.16b       // p1 candidate, within p1 +/- tc
      uqsub           v22.16b, v2.16b,  v24.16b       // q1 - tc (saturating)
      umin            v28.16b, v4.16b,  v28.16b
      uxtl            v4.8h,   v0.8b                  // delta: q0 widened to 16 bits
      umax            v28.16b, v28.16b, v22.16b       // q1 candidate, within q1 +/- tc
      uxtl2           v20.8h,  v0.16b
      usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
      usubw2          v20.8h,  v20.8h,  v16.16b
      shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
      shl             v20.8h,  v20.8h,  #2
      uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
      uaddw2          v20.8h,  v20.8h,  v18.16b
      usubw           v4.8h,   v4.8h,   v2.8b         // - q1
      usubw2          v20.8h,  v20.8h,  v2.16b
      rshrn           v4.8b,   v4.8h,   #3            // delta = (sum + 4) >> 3, as bytes
      rshrn2          v4.16b,  v20.8h,  #3
      bsl             v17.16b, v23.16b, v18.16b       // p1' = mask ? candidate : p1
      bsl             v19.16b, v28.16b, v2.16b        // q1' = mask ? candidate : q1
      neg             v23.16b, v21.16b                // -bound
      uxtl            v28.8h,  v16.8b                 // p0 widened to 16 bits
      smin            v4.16b,  v4.16b,  v21.16b       // delta = min(delta, bound)
      uxtl2           v21.8h,  v16.16b
      smax            v4.16b,  v4.16b,  v23.16b       // delta = max(delta, -bound)
      uxtl            v22.8h,  v0.8b                  // q0 widened to 16 bits
      uxtl2           v24.8h,  v0.16b
      saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
      saddw2          v21.8h,  v21.8h,  v4.16b
      ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
      ssubw2          v24.8h,  v24.8h,  v4.16b
      sqxtun          v16.8b,  v28.8h                 // p0' saturated to unsigned bytes
      sqxtun2         v16.16b, v21.8h
      sqxtun          v0.8b,   v22.8h                 // q0' saturated to unsigned bytes
      sqxtun2         v0.16b,  v24.8h
  .endm
  // deblock_v_luma_neon: luma filter across the edge between two pixel rows,
  // 16 pixels wide.
  //
  // x0 = address of the q0 row (first row at/after the edge), x1 = row stride,
  // w2 = alpha, w3 = beta, x4 = clip-byte pointer (see h264_loop_filter_start).
  // Rewrites the rows p1, p0, q0 and q1.
  function deblock_v_luma_neon, export=1
      h264_loop_filter_start                          // may return early

      // q0, q1, q2: the three rows starting at x0
      ld1             {v0.16b},  [x0], x1
      ld1             {v2.16b},  [x0], x1
      ld1             {v4.16b},  [x0], x1
      // step back 4 + 2 = 6 rows to reach p2, then read p2, p1, p0
      sub             x0,  x0,  x1, lsl #2
      sub             x0,  x0,  x1, lsl #1
      ld1             {v20.16b}, [x0], x1
      ld1             {v18.16b}, [x0], x1
      ld1             {v16.16b}, [x0], x1

      h264_loop_filter_luma

      // x0 is on the q0 row again; go up two rows and write p1' p0' q0' q1'
      sub             x0,  x0,  x1, lsl #1
      st1             {v17.16b}, [x0], x1
      st1             {v16.16b}, [x0], x1
      st1             {v0.16b},  [x0], x1
      st1             {v19.16b}, [x0]

      ret
  endfunc
  // deblock_h_luma_neon: luma filter across a vertical edge, 16 rows tall.
  //
  // x0 = address of the first pixel right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta, x4 = clip-byte pointer.
  // Reads 8 bytes per row starting 4 bytes left of x0 and writes back the
  // 4 bytes starting 2 bytes left of x0 (p1 p0 q0 q1).
  function deblock_h_luma_neon, export=1
      h264_loop_filter_start                          // may return early

      sub             x0,  x0,  #4

      // rows 0-7 fill the low halves, rows 8-15 the high halves
  .irp reg, v6, v20, v18, v16, v0, v2, v4, v26
      ld1             {\reg\().8b}, [x0], x1
  .endr
  .irp reg, v6, v20, v18, v16, v0, v2, v4, v26
      ld1             {\reg\().d}[1], [x0], x1
  .endr

      // rows -> columns; afterwards v20/v18/v16 = p2/p1/p0, v0/v2/v4 = q0/q1/q2
      // as h264_loop_filter_luma expects (v21, v23 are scratch)
      transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23

      h264_loop_filter_luma

      // columns p1' p0' q0' q1' back to row order (v21, v23, v25, v27 scratch)
      transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27

      sub             x0,  x0,  x1, lsl #4            // back to row 0
      add             x0,  x0,  #2                    // 2 bytes left of the edge

      // one 32-bit store per row: lane n of v17/v16/v0/v19 = rows 4n .. 4n+3
  .irp lane, 0, 1, 2, 3
      st1             {v17.s}[\lane], [x0], x1
      st1             {v16.s}[\lane], [x0], x1
      st1             {v0.s}[\lane],  [x0], x1
      st1             {v19.s}[\lane], [x0], x1
  .endr

      ret
  endfunc
  // h264_loop_filter_start_intra: entry check for the intra filters.
  //
  // In      : w2 = alpha, w3 = beta
  // Out     : v30 = alpha, v31 = beta, replicated into all 16 byte lanes
  // Clobbers: w4, condition flags
  //
  // Executes `ret` (leaving the expanding function) when alpha and beta are
  // both zero.
  .macro h264_loop_filter_start_intra
      orr             w4,  w2,  w3                    // 0 only if alpha == beta == 0
      cmp             w4,  #0
      b.ne            1f
      ret
  1:
      dup             v30.16b, w2                     // alpha
      dup             v31.16b, w3                     // beta
  .endm
  // h264_loop_filter_luma_intra: intra luma edge filter on 16 pixels.
  //
  // In      : v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2,
  //           v3 = q3, v30 = alpha, v31 = beta (all lanes)
  // Out     : v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in place
  // Clobbers: x4, v16-v31
  //
  // Branches to local label 9, which the expanding function must provide,
  // when no lane passes the alpha/beta test.
  //
  // Conditions, named as in the comments below:
  //   if_1 = |p0-q0| < alpha  &&  |p1-p0| < beta  &&  |q1-q0| < beta
  //   if_2 = |p0-q0| < (alpha >> 2) + 2
  //   if_3 = |p2-p0| < beta,   if_4 = |q2-q0| < beta
  // Results tagged _1 are used where only if_1 holds, results tagged _2
  // where if_2 and if_3 (p side) or if_2 and if_4 (q side) hold as well.
  .macro h264_loop_filter_luma_intra
      uabd            v16.16b, v7.16b,  v0.16b        // |p0 - q0|
      uabd            v17.16b, v6.16b,  v7.16b        // |p1 - p0|
      uabd            v18.16b, v1.16b,  v0.16b        // |q1 - q0|
      cmhi            v19.16b, v30.16b, v16.16b       // alpha > |p0 - q0|
      cmhi            v17.16b, v31.16b, v17.16b       // beta  > |p1 - p0|
      cmhi            v18.16b, v31.16b, v18.16b       // beta  > |q1 - q0|

      movi            v29.16b, #2
      ushr            v30.16b, v30.16b, #2            // alpha >> 2
      add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
      cmhi            v16.16b, v30.16b, v16.16b       // if_2

      and             v19.16b, v19.16b, v17.16b
      and             v19.16b, v19.16b, v18.16b       // v19 = if_1
      shrn            v20.8b,  v19.8h,  #4            // squeeze the mask into 64 bits
      mov             x4,  v20.d[0]
      cbz             x4,  9f                         // no lane to filter

      // v20:v21 = 2*p1 + p0 + q1,  v22:v23 = 2*q1 + q0 + p1 (16-bit sums)
      ushll           v20.8h,  v6.8b,   #1
      ushll           v22.8h,  v1.8b,   #1
      ushll2          v21.8h,  v6.16b,  #1
      ushll2          v23.8h,  v1.16b,  #1
      uaddw           v20.8h,  v20.8h,  v7.8b
      uaddw           v22.8h,  v22.8h,  v0.8b
      uaddw2          v21.8h,  v21.8h,  v7.16b
      uaddw2          v23.8h,  v23.8h,  v0.16b
      uaddw           v20.8h,  v20.8h,  v1.8b
      uaddw           v22.8h,  v22.8h,  v6.8b
      uaddw2          v21.8h,  v21.8h,  v1.16b
      uaddw2          v23.8h,  v23.8h,  v6.16b

      rshrn           v24.8b,  v20.8h,  #2            // p0'_1 = (sum + 2) >> 2
      rshrn           v25.8b,  v22.8h,  #2            // q0'_1
      rshrn2          v24.16b, v21.8h,  #2            // p0'_1, upper half
      rshrn2          v25.16b, v23.8h,  #2            // q0'_1, upper half

      uabd            v17.16b, v5.16b,  v7.16b        // |p2 - p0|
      uabd            v18.16b, v2.16b,  v0.16b        // |q2 - q0|
      cmhi            v17.16b, v31.16b, v17.16b       // if_3
      cmhi            v18.16b, v31.16b, v18.16b       // if_4

      and             v17.16b, v16.16b, v17.16b       // if_2 && if_3
      and             v18.16b, v16.16b, v18.16b       // if_2 && if_4

      not             v30.16b, v17.16b
      not             v31.16b, v18.16b

      and             v30.16b, v30.16b, v19.16b       // if_1 && !(if_2 && if_3)
      and             v31.16b, v31.16b, v19.16b       // if_1 && !(if_2 && if_4)

      and             v17.16b, v19.16b, v17.16b       // if_1 && if_2 && if_3
      and             v18.16b, v19.16b, v18.16b       // if_1 && if_2 && if_4

      // p side, _2 results
      uaddl           v26.8h,  v5.8b,   v7.8b         // p2 + p0
      uaddl2          v27.8h,  v5.16b,  v7.16b
      uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
      uaddw2          v27.8h,  v27.8h,  v0.16b
      add             v20.8h,  v20.8h,  v26.8h
      add             v21.8h,  v21.8h,  v27.8h
      uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
      uaddw2          v21.8h,  v21.8h,  v0.16b
      rshrn           v20.8b,  v20.8h,  #3            // p0'_2
      rshrn2          v20.16b, v21.8h,  #3            // p0'_2, upper half
      uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
      uaddw2          v27.8h,  v27.8h,  v6.16b
      rshrn           v21.8b,  v26.8h,  #2            // p1'_2
      rshrn2          v21.16b, v27.8h,  #2            // p1'_2, upper half
      uaddl           v28.8h,  v4.8b,   v5.8b         // p3 + p2
      uaddl2          v29.8h,  v4.16b,  v5.16b
      shl             v28.8h,  v28.8h,  #1
      shl             v29.8h,  v29.8h,  #1
      add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
      add             v29.8h,  v29.8h,  v27.8h
      rshrn           v19.8b,  v28.8h,  #3            // p2'_2 (if_1 in v19 no longer needed)
      rshrn2          v19.16b, v29.8h,  #3            // p2'_2, upper half

      // q side, _2 results (mirror image of the p side)
      uaddl           v26.8h,  v2.8b,   v0.8b         // q2 + q0
      uaddl2          v27.8h,  v2.16b,  v0.16b
      uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
      uaddw2          v27.8h,  v27.8h,  v7.16b
      add             v22.8h,  v22.8h,  v26.8h
      add             v23.8h,  v23.8h,  v27.8h
      uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
      uaddw2          v23.8h,  v23.8h,  v7.16b
      rshrn           v22.8b,  v22.8h,  #3            // q0'_2
      rshrn2          v22.16b, v23.8h,  #3            // q0'_2, upper half
      uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
      uaddw2          v27.8h,  v27.8h,  v1.16b
      rshrn           v23.8b,  v26.8h,  #2            // q1'_2
      rshrn2          v23.16b, v27.8h,  #2            // q1'_2, upper half
      uaddl           v28.8h,  v2.8b,   v3.8b         // q2 + q3
      uaddl2          v29.8h,  v2.16b,  v3.16b
      shl             v28.8h,  v28.8h,  #1
      shl             v29.8h,  v29.8h,  #1
      add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
      add             v29.8h,  v29.8h,  v27.8h
      rshrn           v26.8b,  v28.8h,  #3            // q2'_2
      rshrn2          v26.16b, v29.8h,  #3            // q2'_2, upper half

      // insert each result only in the lanes selected by its mask
      bit             v7.16b,  v24.16b, v30.16b       // p0'_1
      bit             v0.16b,  v25.16b, v31.16b       // q0'_1
      bit             v7.16b,  v20.16b, v17.16b       // p0'_2
      bit             v6.16b,  v21.16b, v17.16b       // p1'_2
      bit             v5.16b,  v19.16b, v17.16b       // p2'_2
      bit             v0.16b,  v22.16b, v18.16b       // q0'_2
      bit             v1.16b,  v23.16b, v18.16b       // q1'_2
      bit             v2.16b,  v26.16b, v18.16b       // q2'_2
  .endm
  // deblock_v_luma_intra_neon: intra luma filter across the edge between two
  // pixel rows, 16 pixels wide.
  //
  // x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
  // Rewrites the rows p2, p1, p0, q0, q1, q2 unless the filter macro finds
  // nothing to do and jumps straight to 9:.
  function deblock_v_luma_intra_neon, export=1
      h264_loop_filter_start_intra                    // may return early

      ld1             {v0.16b},  [x0], x1             // q0
      ld1             {v1.16b},  [x0], x1             // q1
      ld1             {v2.16b},  [x0], x1             // q2
      ld1             {v3.16b},  [x0], x1             // q3
      sub             x0,  x0,  x1, lsl #3            // 8 rows up, to p3
      ld1             {v4.16b},  [x0], x1             // p3
      ld1             {v5.16b},  [x0], x1             // p2
      ld1             {v6.16b},  [x0], x1             // p1
      ld1             {v7.16b},  [x0]                 // p0 (x0 stays on this row)

      h264_loop_filter_luma_intra

      sub             x0,  x0,  x1, lsl #1            // from the p0 row to the p2 row
      st1             {v5.16b},  [x0], x1             // p2
      st1             {v6.16b},  [x0], x1             // p1
      st1             {v7.16b},  [x0], x1             // p0
      st1             {v0.16b},  [x0], x1             // q0
      st1             {v1.16b},  [x0], x1             // q1
      st1             {v2.16b},  [x0]                 // q2
  9:
      ret
  endfunc
  // deblock_h_luma_intra_neon: intra luma filter across a vertical edge,
  // 16 rows tall.
  //
  // x0 = address of the first pixel right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta.
  // Reads and (unless the filter macro jumps to 9:) rewrites 8 bytes per row,
  // starting 4 bytes left of x0.
  function deblock_h_luma_intra_neon, export=1
      h264_loop_filter_start_intra                    // may return early

      sub             x0,  x0,  #4

      // rows 0-7 fill the low halves, rows 8-15 the high halves
  .irp reg, v4, v5, v6, v7, v0, v1, v2, v3
      ld1             {\reg\().8b}, [x0], x1
  .endr
  .irp reg, v4, v5, v6, v7, v0, v1, v2, v3
      ld1             {\reg\().d}[1], [x0], x1
  .endr

      // rows -> columns p3 p2 p1 p0 q0 q1 q2 q3 (v21, v23 are scratch)
      transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

      h264_loop_filter_luma_intra

      // columns -> rows again
      transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23

      sub             x0,  x0,  x1, lsl #4            // back to row 0
  .irp reg, v4, v5, v6, v7, v0, v1, v2, v3
      st1             {\reg\().8b}, [x0], x1
  .endr
  .irp reg, v4, v5, v6, v7, v0, v1, v2, v3
      st1             {\reg\().d}[1], [x0], x1
  .endr
  9:
      ret
  endfunc
  // h264_loop_filter_chroma: non-intra chroma edge filter on 16 bytes.
  //
  // In      : v18 = p1, v16 = p0, v0 = q0, v2 = q1,
  //           w2 = alpha, w3 = beta, v24.s[0] = four clip bytes (tc)
  // Out     : v16 = p0', v0 = q0'
  // Clobbers: v4, v5, v22-v26, v28-v30
  //
  // delta = clamp(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc), forced to 0
  // in lanes that fail the alpha/beta test; p0' = p0 + delta, q0' = q0 - delta.
  .macro h264_loop_filter_chroma
      dup             v22.16b, w2                     // alpha in every lane
      uxtl            v24.8h,  v24.8b                 // tc: bytes -> halfwords
      uabd            v26.16b, v16.16b, v0.16b        // |p0 - q0|
      uxtl            v4.8h,   v0.8b                  // q0 widened to 16 bits
      uxtl2           v5.8h,   v0.16b
      uabd            v28.16b, v18.16b, v16.16b       // |p1 - p0|
      usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
      usubw2          v5.8h,   v5.8h,   v16.16b
      sli             v24.8h,  v24.8h,  #8            // tc: copy byte into bits 15:8
      shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
      shl             v5.8h,   v5.8h,   #2
      uabd            v30.16b, v2.16b,  v0.16b        // |q1 - q0|
      uxtl            v24.4s,  v24.4h                 // tc: halfwords -> words
      uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
      uaddw2          v5.8h,   v5.8h,   v18.16b
      cmhi            v26.16b, v22.16b, v26.16b       // alpha > |p0 - q0|
      usubw           v4.8h,   v4.8h,   v2.8b         // - q1
      usubw2          v5.8h,   v5.8h,   v2.16b
      sli             v24.4s,  v24.4s,  #16           // tc: each byte now fills 4 lanes
      dup             v22.16b, w3                     // beta in every lane
      rshrn           v4.8b,   v4.8h,   #3            // delta = (sum + 4) >> 3, as bytes
      rshrn2          v4.16b,  v5.8h,   #3
      cmhi            v28.16b, v22.16b, v28.16b       // beta > |p1 - p0|
      cmhi            v30.16b, v22.16b, v30.16b       // beta > |q1 - q0|
      smin            v4.16b,  v4.16b,  v24.16b       // delta = min(delta, tc)
      neg             v25.16b, v24.16b                // -tc
      and             v26.16b, v26.16b, v28.16b
      smax            v4.16b,  v4.16b,  v25.16b       // delta = max(delta, -tc)
      and             v26.16b, v26.16b, v30.16b       // v26 = filter mask
      uxtl            v22.8h,  v0.8b                  // q0 widened to 16 bits
      uxtl2           v23.8h,  v0.16b
      and             v4.16b,  v4.16b,  v26.16b       // delta = 0 outside the mask
      uxtl            v28.8h,  v16.8b                 // p0 widened to 16 bits
      uxtl2           v29.8h,  v16.16b
      saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
      saddw2          v29.8h,  v29.8h,  v4.16b
      ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
      ssubw2          v23.8h,  v23.8h,  v4.16b
      sqxtun          v16.8b,  v28.8h                 // p0' saturated to unsigned bytes
      sqxtun          v0.8b,   v22.8h                 // q0' saturated to unsigned bytes
      sqxtun2         v16.16b, v29.8h
      sqxtun2         v0.16b,  v23.8h
  .endm
  // deblock_v_chroma_neon: chroma filter across the edge between two rows,
  // 16 bytes wide.
  //
  // x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta,
  // x4 = clip-byte pointer (see h264_loop_filter_start).
  // Rewrites the p0 and q0 rows.
  function deblock_v_chroma_neon, export=1
      h264_loop_filter_start                          // may return early

      sub             x0,  x0,  x1, lsl #1            // two rows up, to p1
      ld1             {v18.16b}, [x0], x1             // p1
      ld1             {v16.16b}, [x0], x1             // p0
      ld1             {v0.16b},  [x0], x1             // q0
      ld1             {v2.16b},  [x0]                 // q1 (x0 stays on this row)

      h264_loop_filter_chroma

      sub             x0,  x0,  x1, lsl #1            // from the q1 row to the p0 row
      st1             {v16.16b}, [x0], x1             // p0'
      st1             {v0.16b},  [x0], x1             // q0'

      ret
  endfunc
  // deblock_h_chroma_neon: chroma filter across a vertical edge, 8 rows tall.
  //
  // x0 = address of the first sample right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta, x4 = clip-byte pointer.
  // Each row contributes the 8 bytes starting 4 bytes left of x0, treated as
  // four 16-bit columns p1 p0 q0 q1 (presumably interleaved U/V pairs -
  // confirm against the caller).  All 8 bytes of every row are written back.
  //
  // deblock_h_chroma is a second entry point used by
  // deblock_h_chroma_422_neon; it expects x0 to be already moved 4 bytes left
  // and v24.s[0] to hold the clip bytes.
  function deblock_h_chroma_neon, export=1
      h264_loop_filter_start                          // may return early

      sub             x0,  x0,  #4
  deblock_h_chroma:
      // rows 0-3 fill the low halves, rows 4-7 the high halves
  .irp half, 0, 1
      ld1             {v18.d}[\half], [x0], x1
      ld1             {v16.d}[\half], [x0], x1
      ld1             {v0.d}[\half],  [x0], x1
      ld1             {v2.d}[\half],  [x0], x1
  .endr

      // rows -> columns: v18 = p1, v16 = p0, v0 = q0, v2 = q1 (v28-v31 scratch)
      transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31

      h264_loop_filter_chroma

      // columns -> rows again
      transpose4x8.h  v18, v16, v0, v2, v28, v29, v30, v31

      sub             x0,  x0,  x1, lsl #3            // back to row 0
  .irp half, 0, 1
      st1             {v18.d}[\half], [x0], x1
      st1             {v16.d}[\half], [x0], x1
      st1             {v0.d}[\half],  [x0], x1
      st1             {v2.d}[\half],  [x0], x1
  .endr

      ret
  endfunc
  // deblock_h_chroma_422_neon: vertical-edge chroma filter over 16 rows,
  // done as two passes through deblock_h_chroma with the stride doubled:
  // first the even rows, then the odd rows.
  //
  // x0 = address of the first sample right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta, x4 = clip-byte pointer.
  // Uses x5 (row 1 address) and x7 (saved link register) as scratch.
  function deblock_h_chroma_422_neon, export=1
      add             x5,  x0,  x1                    // x5 = row 1
      sub             x0,  x0,  #4                    // deblock_h_chroma wants x0 - 4
      add             x1,  x1,  x1                    // stride * 2
      h264_loop_filter_start                          // may return early; w6 = clip word
      mov             x7,  x30                        // bl below overwrites x30
      bl              deblock_h_chroma                // rows 0, 2, ..., 14
      mov             x30, x7
      sub             x0,  x5,  #4                    // restart from row 1
      mov             v24.s[0], w6                    // the filter macro consumed v24
      b               deblock_h_chroma                // rows 1, 3, ..., 15; returns to caller
  endfunc
  // h264_loop_filter_chroma8: 8-byte variant of the non-intra chroma filter.
  //
  // In      : v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes),
  //           w2 = alpha, w3 = beta, v24.s[0] = four clip bytes (tc)
  // Out     : v16 = p0', v17 = q0' (low 8 bytes)
  // Clobbers: v4, v22, v24-v26, v28, v30
  //
  // Same arithmetic as h264_loop_filter_chroma, but each clip byte covers two
  // lanes instead of four.
  .macro h264_loop_filter_chroma8
      dup             v22.8b,  w2                     // alpha in every lane
      uxtl            v24.8h,  v24.8b                 // tc: bytes -> halfwords
      uabd            v26.8b,  v16.8b,  v17.8b        // |p0 - q0|
      uxtl            v4.8h,   v17.8b                 // q0 widened to 16 bits
      uabd            v28.8b,  v18.8b,  v16.8b        // |p1 - p0|
      usubw           v4.8h,   v4.8h,   v16.8b        // q0 - p0
      sli             v24.8h,  v24.8h,  #8            // tc: each byte now fills 2 lanes
      shl             v4.8h,   v4.8h,   #2            // (q0 - p0) * 4
      uabd            v30.8b,  v19.8b,  v17.8b        // |q1 - q0|
      uaddw           v4.8h,   v4.8h,   v18.8b        // + p1
      cmhi            v26.8b,  v22.8b,  v26.8b        // alpha > |p0 - q0|
      usubw           v4.8h,   v4.8h,   v19.8b        // - q1
      dup             v22.8b,  w3                     // beta in every lane
      rshrn           v4.8b,   v4.8h,   #3            // delta = (sum + 4) >> 3, as bytes
      cmhi            v28.8b,  v22.8b,  v28.8b        // beta > |p1 - p0|
      cmhi            v30.8b,  v22.8b,  v30.8b        // beta > |q1 - q0|
      smin            v4.8b,   v4.8b,   v24.8b        // delta = min(delta, tc)
      neg             v25.8b,  v24.8b                 // -tc
      and             v26.8b,  v26.8b,  v28.8b
      smax            v4.8b,   v4.8b,   v25.8b        // delta = max(delta, -tc)
      and             v26.8b,  v26.8b,  v30.8b        // v26 = filter mask
      uxtl            v22.8h,  v17.8b                 // q0 widened to 16 bits
      and             v4.8b,   v4.8b,   v26.8b        // delta = 0 outside the mask
      uxtl            v28.8h,  v16.8b                 // p0 widened to 16 bits
      saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
      ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
      sqxtun          v16.8b,  v28.8h                 // p0' saturated to unsigned bytes
      sqxtun          v17.8b,  v22.8h                 // q0' saturated to unsigned bytes
  .endm
  // deblock_h_chroma_mbaff_neon: chroma filter across a vertical edge,
  // 4 rows tall.
  //
  // x0 = address of the first sample right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta, x4 = clip-byte pointer.
  // Reads 8 bytes per row from x0 - 4 and writes 4 bytes per row (the p0 and
  // q0 halfwords) at x0 - 2.
  function deblock_h_chroma_mbaff_neon, export=1
      h264_loop_filter_start                          // may return early

      sub             x4,  x0,  #4                    // x4 reused as the load pointer
      sub             x0,  x0,  #2                    // x0 becomes the store pointer

      ld1             {v18.8b}, [x4], x1
      ld1             {v16.8b}, [x4], x1
      ld1             {v17.8b}, [x4], x1
      ld1             {v19.8b}, [x4]

      // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1 (v28-v31 scratch)
      transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31

      h264_loop_filter_chroma8

      // lane n of v16/v17 holds p0'/q0' of row n; st2 writes them as a pair
      st2             {v16.h,v17.h}[0], [x0], x1
      st2             {v16.h,v17.h}[1], [x0], x1
      st2             {v16.h,v17.h}[2], [x0], x1
      st2             {v16.h,v17.h}[3], [x0]

      ret
  endfunc
  // h264_loop_filter_chroma_intra: intra chroma edge filter.
  //
  // In      : v18 = p1, v16 = p0, v17 = q0, v19 = q1,
  //           v30 = alpha, v31 = beta (all lanes)
  // Out     : v16 = p0', v17 = q0' in the lanes that pass the alpha/beta test
  // Clobbers: v4, v6, v20, v22, v24-v28; with width=16 also v5, v7, v21, v23
  //
  //   p0' = (2*p1 + p0 + q1 + 2) >> 2
  //   q0' = (2*q1 + q0 + p1 + 2) >> 2
  //
  // width: 16 (default) computes all 16 lanes.  Any other value skips the
  // upper-half arithmetic, so only the low 8 bytes of the outputs are valid.
  .macro h264_loop_filter_chroma_intra width=16
      uabd            v26.16b, v16.16b, v17.16b       // |p0 - q0|
      uabd            v27.16b, v18.16b, v16.16b       // |p1 - p0|
      uabd            v28.16b, v19.16b, v17.16b       // |q1 - q0|
      cmhi            v26.16b, v30.16b, v26.16b       // alpha > |p0 - q0|
      cmhi            v27.16b, v31.16b, v27.16b       // beta  > |p1 - p0|
      cmhi            v28.16b, v31.16b, v28.16b       // beta  > |q1 - q0|
      and             v26.16b, v26.16b, v27.16b
      and             v26.16b, v26.16b, v28.16b       // v26 = filter mask

      ushll           v4.8h,   v18.8b,  #1            // 2 * p1
      ushll           v6.8h,   v19.8b,  #1            // 2 * q1
  .ifc \width, 16
      ushll2          v5.8h,   v18.16b, #1
      ushll2          v7.8h,   v19.16b, #1
      uaddl2          v21.8h,  v16.16b, v19.16b
      uaddl2          v23.8h,  v17.16b, v18.16b
  .endif
      uaddl           v20.8h,  v16.8b,  v19.8b        // p0 + q1
      uaddl           v22.8h,  v17.8b,  v18.8b        // q0 + p1
      add             v20.8h,  v20.8h,  v4.8h         // 2*p1 + p0 + q1
      add             v22.8h,  v22.8h,  v6.8h         // 2*q1 + q0 + p1
  .ifc \width, 16
      add             v21.8h,  v21.8h,  v5.8h
      add             v23.8h,  v23.8h,  v7.8h
  .endif
      uqrshrn         v24.8b,  v20.8h,  #2            // p0' candidate
      uqrshrn         v25.8b,  v22.8h,  #2            // q0' candidate
  .ifc \width, 16
      uqrshrn2        v24.16b, v21.8h,  #2
      uqrshrn2        v25.16b, v23.8h,  #2
  .endif
      bit             v16.16b, v24.16b, v26.16b       // take p0' where the mask is set
      bit             v17.16b, v25.16b, v26.16b       // take q0' where the mask is set
  .endm
  // deblock_v_chroma_intra_neon: intra chroma filter across the edge between
  // two rows, 16 bytes wide.
  //
  // x0 = address of the q0 row, x1 = row stride, w2 = alpha, w3 = beta.
  // Rewrites the p0 and q0 rows.
  function deblock_v_chroma_intra_neon, export=1
      h264_loop_filter_start_intra                    // may return early

      sub             x0,  x0,  x1, lsl #1            // two rows up, to p1
      ld1             {v18.16b}, [x0], x1             // p1
      ld1             {v16.16b}, [x0], x1             // p0
      ld1             {v17.16b}, [x0], x1             // q0
      ld1             {v19.16b}, [x0]                 // q1 (x0 stays on this row)

      h264_loop_filter_chroma_intra

      sub             x0,  x0,  x1, lsl #1            // from the q1 row to the p0 row
      st1             {v16.16b}, [x0], x1             // p0'
      st1             {v17.16b}, [x0], x1             // q0'

      ret
  endfunc
  // deblock_h_chroma_intra_mbaff_neon: intra chroma filter across a vertical
  // edge, 4 rows tall.
  //
  // x0 = address of the first sample right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta.
  // Reads 8 bytes per row from x0 - 4 and writes 4 bytes per row (the p0 and
  // q0 halfwords) at x0 - 2.
  function deblock_h_chroma_intra_mbaff_neon, export=1
      h264_loop_filter_start_intra                    // may return early; clobbers w4

      sub             x4,  x0,  #4                    // load pointer
      sub             x0,  x0,  #2                    // store pointer
  .irp reg, v18, v16, v17, v19
      ld1             {\reg\().8b}, [x4], x1
  .endr

      // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1 (v26-v29 scratch)
      transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29

      h264_loop_filter_chroma_intra width=8

      // lane n of v16/v17 holds p0'/q0' of row n; st2 writes them as a pair
  .irp lane, 0, 1, 2, 3
      st2             {v16.h,v17.h}[\lane], [x0], x1
  .endr

      ret
  endfunc
  // deblock_h_chroma_intra_neon: intra chroma filter across a vertical edge,
  // 8 rows tall.
  //
  // x0 = address of the first sample right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta.
  // Reads 8 bytes per row from x0 - 4 and writes 4 bytes per row (the p0 and
  // q0 halfwords) at x0 - 2.
  function deblock_h_chroma_intra_neon, export=1
      h264_loop_filter_start_intra                    // may return early; clobbers w4

      sub             x4,  x0,  #4                    // load pointer
      sub             x0,  x0,  #2                    // store pointer

      // rows 0-3 fill the low halves, rows 4-7 the high halves
  .irp half, 0, 1
      ld1             {v18.d}[\half], [x4], x1
      ld1             {v16.d}[\half], [x4], x1
      ld1             {v17.d}[\half], [x4], x1
      ld1             {v19.d}[\half], [x4], x1
  .endr

      // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1 (v26-v29 scratch)
      transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

      h264_loop_filter_chroma_intra

      // lane n of v16/v17 holds p0'/q0' of row n; st2 writes them as a pair
  .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
      st2             {v16.h,v17.h}[\lane], [x0], x1
  .endr

      ret
  endfunc
  // deblock_h_chroma_422_intra_neon: intra chroma filter across a vertical
  // edge, 16 rows tall, processed as two consecutive groups of 8 rows.
  //
  // x0 = address of the first sample right of the edge in row 0, x1 = row
  // stride, w2 = alpha, w3 = beta.
  // Reads 8 bytes per row from x0 - 4 and writes 4 bytes per row (the p0 and
  // q0 halfwords) at x0 - 2.  x4 (load) and x0 (store) simply keep advancing
  // from the first group into the second.
  function deblock_h_chroma_422_intra_neon, export=1
      h264_loop_filter_start_intra                    // may return early; clobbers w4

      sub             x4,  x0,  #4                    // load pointer
      sub             x0,  x0,  #2                    // store pointer

      // ---- rows 0-7 ----
  .irp half, 0, 1
      ld1             {v18.d}[\half], [x4], x1
      ld1             {v16.d}[\half], [x4], x1
      ld1             {v17.d}[\half], [x4], x1
      ld1             {v19.d}[\half], [x4], x1
  .endr

      // rows -> columns: v18 = p1, v16 = p0, v17 = q0, v19 = q1 (v26-v29 scratch)
      transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

      h264_loop_filter_chroma_intra

  .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
      st2             {v16.h,v17.h}[\lane], [x0], x1
  .endr

      // ---- rows 8-15 ----
  .irp half, 0, 1
      ld1             {v18.d}[\half], [x4], x1
      ld1             {v16.d}[\half], [x4], x1
      ld1             {v17.d}[\half], [x4], x1
      ld1             {v19.d}[\half], [x4], x1
  .endr

      transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

      h264_loop_filter_chroma_intra

  .irp lane, 0, 1, 2, 3, 4, 5, 6, 7
      st2             {v16.h,v17.h}[\lane], [x0], x1
  .endr

      ret
  endfunc
  // void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE],
  //                        int8_t ref[2][X264_SCAN8_LUMA_SIZE],
  //                        int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
  //                        uint8_t bs[2][8][4], int mvy_limit,
  //                        int bframe )
  //
  // Registers: x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit,
  //            w5 = bframe.  x6, x7 and v0-v7, v18-v25, v31 are scratch.
  //
  // v4 accumulates the flags that end up in bs[0], v5 those for bs[1].
  // A ref mismatch or an mv difference that survives the threshold
  // subtraction makes a byte non-zero.  At the end every byte becomes
  //   2 where the OR of the two compared nnz bytes is non-zero,
  //   1 where only the ref/mv flag is set, 0 otherwise.
  // Only the first 16 bytes of bs[0] and of bs[1] are written.
  function deblock_strength_neon, export=1
      movi            v4.16b,  #0                     // flags for bs[0]
      lsl             w4,  w4,  #8
      add             x3,  x3,  #32                   // x3 = 32 bytes into bs, i.e. bs[1]
      sub             w4,  w4,  #(1<<8)-3             // ((mvy_limit - 1) << 8) + 3
      movi            v5.16b,  #0                     // flags for bs[1]
      dup             v6.8h,   w4                     // byte lanes: 3, mvy_limit - 1, ...
      mov             x6,  #-32                       // post-index from bs[1] back to bs[0]

      // One pass per reference list: 40 ref bytes and 160 mv bytes are consumed
      // per pass, so a second pass continues with the following list.
  bframe:
      // load bytes ref
      add             x2,  x2,  #16                   // first mv load is at mv + 0x10
      ld1             {v31.d}[1], [x1], #8
      ld1             {v1.16b}, [x1], #16
      movi            v0.16b,  #0
      ld1             {v2.16b}, [x1], #16
      ext             v3.16b,  v0.16b,  v1.16b,  #15  // same data moved up by one byte
      ext             v0.16b,  v0.16b,  v2.16b,  #15
      // NOTE(review): unzip comes from asm.S; the code below relies on v22 being
      // the current 4x4 values and v20 the same values one byte to the left.
      unzip           v21.4s,  v22.4s,  v1.4s,   v2.4s
      unzip           v23.4s,  v20.4s,  v3.4s,   v0.4s
      ext             v21.16b, v31.16b, v22.16b, #12  // current values, one row earlier

      eor             v0.16b,  v20.16b, v22.16b       // non-zero where the refs differ
      eor             v1.16b,  v21.16b, v22.16b
      orr             v4.16b,  v4.16b,  v0.16b
      orr             v5.16b,  v5.16b,  v1.16b

      ld1             {v21.8h}, [x2], #16             // mv + 0x10
      ld1             {v19.8h}, [x2], #16             // mv + 0x20
      ld1             {v22.8h}, [x2], #16             // mv + 0x30
      ld1             {v18.8h}, [x2], #16             // mv + 0x40
      ld1             {v23.8h}, [x2], #16             // mv + 0x50
      // prepend the last mv of the preceding 16 bytes: each mv now lines up
      // with the one stored 4 bytes before it
      ext             v19.16b, v19.16b, v22.16b, #12
      ext             v18.16b, v18.16b, v23.16b, #12
      sabd            v0.8h,   v22.8h,  v19.8h        // |mv - previous mv|, per component
      ld1             {v19.8h}, [x2], #16             // mv + 0x60
      sabd            v1.8h,   v23.8h,  v18.8h
      ld1             {v24.8h}, [x2], #16             // mv + 0x70
      uqxtn           v0.8b,   v0.8h                  // saturate differences to bytes
      ld1             {v18.8h}, [x2], #16             // mv + 0x80
      ld1             {v25.8h}, [x2], #16             // mv + 0x90
      uqxtn2          v0.16b,  v1.8h
      ext             v19.16b, v19.16b, v24.16b, #12
      ext             v18.16b, v18.16b, v25.16b, #12
      sabd            v1.8h,   v24.8h,  v19.8h
      sabd            v2.8h,   v25.8h,  v18.8h
      uqxtn           v1.8b,   v1.8h
      uqxtn2          v1.16b,  v2.8h

      uqsub           v0.16b,  v0.16b,  v6.16b        // non-zero only above the thresholds
      uqsub           v1.16b,  v1.16b,  v6.16b
      uqxtn           v0.8b,   v0.8h                  // merge each component pair into a byte
      uqxtn2          v0.16b,  v1.8h

      sabd            v1.8h,   v22.8h,  v23.8h        // |mv - mv 0x20 bytes earlier|
      orr             v4.16b,  v4.16b,  v0.16b

      sabd            v0.8h,   v21.8h,  v22.8h
      sabd            v2.8h,   v23.8h,  v24.8h
      sabd            v3.8h,   v24.8h,  v25.8h
      uqxtn           v0.8b,   v0.8h
      uqxtn2          v0.16b,  v1.8h
      uqxtn           v1.8b,   v2.8h
      uqxtn2          v1.16b,  v3.8h

      uqsub           v0.16b,  v0.16b,  v6.16b
      uqsub           v1.16b,  v1.16b,  v6.16b
      uqxtn           v0.8b,   v0.8h
      uqxtn2          v0.16b,  v1.8h
      subs            w5,  w5,  #1
      orr             v5.16b,  v5.16b,  v0.16b
      b.eq            bframe                          // taken once, and only if bframe was 1

      movi            v6.16b,  #1
      // load bytes nnz
      ld1             {v31.d}[1], [x0], #8
      ld1             {v1.16b}, [x0], #16
      movi            v0.16b,  #0
      ld1             {v2.16b}, [x0], #16
      ext             v3.16b,  v0.16b,  v1.16b,  #15
      ext             v0.16b,  v0.16b,  v2.16b,  #15
      unzip           v21.4s,  v22.4s,  v1.4s,   v2.4s
      unzip           v23.4s,  v20.4s,  v3.4s,   v0.4s
      ext             v21.16b, v31.16b, v22.16b, #12

      movrel          x7,  transpose_table
      ld1             {v7.16b}, [x7]
      orr             v0.16b,  v20.16b, v22.16b       // non-zero if either nnz is
      orr             v1.16b,  v21.16b, v22.16b

      umin            v0.16b,  v0.16b,  v6.16b
      umin            v1.16b,  v1.16b,  v6.16b
      umin            v4.16b,  v4.16b,  v6.16b        // mv ? 1 : 0
      umin            v5.16b,  v5.16b,  v6.16b
      add             v0.16b,  v0.16b,  v0.16b        // nnz ? 2 : 0
      add             v1.16b,  v1.16b,  v1.16b
      umax            v4.16b,  v4.16b,  v0.16b
      umax            v5.16b,  v5.16b,  v1.16b
      tbl             v6.16b,  {v4.16b}, v7.16b       // reorder v4 through transpose_table
      st1             {v5.16b}, [x3], x6              // bs[1], then x3 -= 32
      st1             {v6.16b}, [x3]                  // bs[0]
      ret
  endfunc
  // Byte indices for the tbl in deblock_strength_neon: output byte 4*r + c
  // takes input byte 4*c + r, i.e. a 4x4 byte-matrix transpose.
  const transpose_table
      .byte            0,  4,  8, 12
      .byte            1,  5,  9, 13
      .byte            2,  6, 10, 14
      .byte            3,  7, 11, 15
  endconst