/*****************************************************************************
 * pixel.S: aarch64 pixel metrics
 *****************************************************************************
 * Copyright (C) 2009-2018 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"

const mask
.rept 16
.byte 0xff
.endr
.rept 16
.byte 0x00
.endr
endconst

const mask_ac_4_8
.short 0, -1, -1, -1, 0, -1, -1, -1
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
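
// Sum of absolute differences, pixel_sad_WxH:
//   x0/x1: first block and its stride, x2/x3: second block and its stride.
// Each macro below consumes two rows; differences are widened to 16 bits and
// accumulated in v16 (plus v17 for the wider sizes), then reduced into w0.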
.macro SAD_START_4
    ld1 {v1.s}[0], [x2], x3
    ld1 {v0.s}[0], [x0], x1
    ld1 {v1.s}[1], [x2], x3
    ld1 {v0.s}[1], [x0], x1
    uabdl v16.8h, v0.8b, v1.8b
.endm

.macro SAD_4
    ld1 {v1.s}[0], [x2], x3
    ld1 {v0.s}[0], [x0], x1
    ld1 {v1.s}[1], [x2], x3
    ld1 {v0.s}[1], [x0], x1
    uabal v16.8h, v0.8b, v1.8b
.endm

.macro SAD_START_8
    ld1 {v1.8b}, [x2], x3
    ld1 {v0.8b}, [x0], x1
    ld1 {v3.8b}, [x2], x3
    ld1 {v2.8b}, [x0], x1
    uabdl v16.8h, v0.8b, v1.8b
    uabdl v17.8h, v2.8b, v3.8b
.endm

.macro SAD_8
    ld1 {v1.8b}, [x2], x3
    ld1 {v0.8b}, [x0], x1
    ld1 {v3.8b}, [x2], x3
    ld1 {v2.8b}, [x0], x1
    uabal v16.8h, v0.8b, v1.8b
    uabal v17.8h, v2.8b, v3.8b
.endm

.macro SAD_START_16
    ld1 {v1.16b}, [x2], x3
    ld1 {v0.16b}, [x0], x1
    ld1 {v3.16b}, [x2], x3
    ld1 {v2.16b}, [x0], x1
    uabdl v16.8h, v0.8b, v1.8b
    uabdl2 v17.8h, v0.16b, v1.16b
    uabal v16.8h, v2.8b, v3.8b
    uabal2 v17.8h, v2.16b, v3.16b
.endm

.macro SAD_16
    ld1 {v1.16b}, [x2], x3
    ld1 {v0.16b}, [x0], x1
    ld1 {v3.16b}, [x2], x3
    ld1 {v2.16b}, [x0], x1
    uabal v16.8h, v0.8b, v1.8b
    uabal2 v17.8h, v0.16b, v1.16b
    uabal v16.8h, v2.8b, v3.8b
    uabal2 v17.8h, v2.16b, v3.16b
.endm

.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    SAD_START_\w
.rept \h / 2 - 1
    SAD_\w
.endr
.if \w > 4
    add v16.8h, v16.8h, v17.8h
.endif
    uaddlv s0, v16.8h
    fmov w0, s0
    ret
endfunc
.endm

SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 4, 16
SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
SAD_FUNC 16, 8
SAD_FUNC 16, 16
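
// Multi-reference SAD, pixel_sad_x3/x4_WxH: one encode block (x0, row stride
// FENC_STRIDE) is compared against 3 or 4 candidate blocks that share a
// single stride; the per-candidate sums are stored as 32-bit values through
// the results pointer. For x3 the stride and results arguments arrive one
// register earlier, hence the register shuffle at the top of SAD_X_FUNC.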
.macro SAD_X_4 x, first=uabal
    ld1 {v0.s}[0], [x0], x7
    ld1 {v1.s}[0], [x1], x5
    ld1 {v0.s}[1], [x0], x7
    ld1 {v1.s}[1], [x1], x5
    \first v16.8h, v1.8b, v0.8b
    ld1 {v2.s}[0], [x2], x5
    ld1 {v2.s}[1], [x2], x5
    \first v17.8h, v2.8b, v0.8b
    ld1 {v3.s}[0], [x3], x5
    ld1 {v3.s}[1], [x3], x5
    \first v18.8h, v3.8b, v0.8b
.if \x == 4
    ld1 {v4.s}[0], [x4], x5
    ld1 {v4.s}[1], [x4], x5
    \first v19.8h, v4.8b, v0.8b
.endif
.endm

.macro SAD_X_8 x, first=uabal
    ld1 {v0.8b}, [x0], x7
    ld1 {v1.8b}, [x1], x5
    \first v16.8h, v1.8b, v0.8b
    ld1 {v2.8b}, [x2], x5
    ld1 {v5.8b}, [x0], x7
    \first v17.8h, v2.8b, v0.8b
    ld1 {v3.8b}, [x3], x5
    ld1 {v1.8b}, [x1], x5
    \first v18.8h, v3.8b, v0.8b
    uabal v16.8h, v1.8b, v5.8b
    ld1 {v2.8b}, [x2], x5
    ld1 {v3.8b}, [x3], x5
    uabal v17.8h, v2.8b, v5.8b
    uabal v18.8h, v3.8b, v5.8b
.if \x == 4
    ld1 {v4.8b}, [x4], x5
    \first v19.8h, v4.8b, v0.8b
    ld1 {v4.8b}, [x4], x5
    uabal v19.8h, v4.8b, v5.8b
.endif
.endm

.macro SAD_X_16 x, first=uabal
    ld1 {v0.16b}, [x0], x7
    ld1 {v1.16b}, [x1], x5
    \first v16.8h, v1.8b, v0.8b
    \first\()2 v20.8h, v1.16b, v0.16b
    ld1 {v2.16b}, [x2], x5
    ld1 {v5.16b}, [x0], x7
    \first v17.8h, v2.8b, v0.8b
    \first\()2 v21.8h, v2.16b, v0.16b
    ld1 {v3.16b}, [x3], x5
    ld1 {v1.16b}, [x1], x5
    \first v18.8h, v3.8b, v0.8b
    \first\()2 v22.8h, v3.16b, v0.16b
    uabal v16.8h, v1.8b, v5.8b
    uabal2 v20.8h, v1.16b, v5.16b
    ld1 {v2.16b}, [x2], x5
    ld1 {v3.16b}, [x3], x5
    uabal v17.8h, v2.8b, v5.8b
    uabal2 v21.8h, v2.16b, v5.16b
    uabal v18.8h, v3.8b, v5.8b
    uabal2 v22.8h, v3.16b, v5.16b
.if \x == 4
    ld1 {v4.16b}, [x4], x5
    \first v19.8h, v4.8b, v0.8b
    \first\()2 v23.8h, v4.16b, v0.16b
    ld1 {v4.16b}, [x4], x5
    uabal v19.8h, v4.8b, v5.8b
    uabal2 v23.8h, v4.16b, v5.16b
.endif
.endm

.macro SAD_X_FUNC x, w, h
function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
.if \x == 3
    mov x6, x5
    mov x5, x4
.endif
    mov x7, #FENC_STRIDE
    SAD_X_\w \x, uabdl
.rept \h / 2 - 1
    SAD_X_\w \x
.endr
.if \w > 8
    add v16.8h, v16.8h, v20.8h
    add v17.8h, v17.8h, v21.8h
    add v18.8h, v18.8h, v22.8h
.if \x == 4
    add v19.8h, v19.8h, v23.8h
.endif
.endif
    // add up the sads
    uaddlv s0, v16.8h
    uaddlv s1, v17.8h
    uaddlv s2, v18.8h
    stp s0, s1, [x6], #8
.if \x == 3
    str s2, [x6]
.else
    uaddlv s3, v19.8h
    stp s2, s3, [x6]
.endif
    ret
endfunc
.endm

SAD_X_FUNC 3, 4, 4
SAD_X_FUNC 3, 4, 8
SAD_X_FUNC 3, 8, 4
SAD_X_FUNC 3, 8, 8
SAD_X_FUNC 3, 8, 16
SAD_X_FUNC 3, 16, 8
SAD_X_FUNC 3, 16, 16
SAD_X_FUNC 4, 4, 4
SAD_X_FUNC 4, 4, 8
SAD_X_FUNC 4, 8, 4
SAD_X_FUNC 4, 8, 8
SAD_X_FUNC 4, 8, 16
SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16
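
// Vertical SAD between adjacent rows of a single 16-wide plane:
//   x0: source, x1: stride, w2: height. Returns the sum in w0.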
function pixel_vsad_neon, export=1
    subs w2, w2, #2
    ld1 {v0.16b}, [x0], x1
    ld1 {v1.16b}, [x0], x1
    uabdl v6.8h, v0.8b, v1.8b
    uabdl2 v7.8h, v0.16b, v1.16b
    b.le 2f
1:
    subs w2, w2, #2
    ld1 {v0.16b}, [x0], x1
    uabal v6.8h, v1.8b, v0.8b
    uabal2 v7.8h, v1.16b, v0.16b
    ld1 {v1.16b}, [x0], x1
    b.lt 2f
    uabal v6.8h, v0.8b, v1.8b
    uabal2 v7.8h, v0.16b, v1.16b
    b.gt 1b
2:
    add v5.8h, v6.8h, v7.8h
    uaddlv s0, v5.8h
    fmov w0, s0
    ret
endfunc
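
// Sum of signed differences over an 8-wide block, with the absolute value
// taken only after summation: x0/x1 and x2/x3 are the two blocks and their
// strides, w4 is the height. Result in w0.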
function pixel_asd8_neon, export=1
    sub w4, w4, #2
    ld1 {v0.8b}, [x0], x1
    ld1 {v1.8b}, [x2], x3
    ld1 {v2.8b}, [x0], x1
    ld1 {v3.8b}, [x2], x3
    usubl v16.8h, v0.8b, v1.8b
1:
    subs w4, w4, #2
    ld1 {v4.8b}, [x0], x1
    ld1 {v5.8b}, [x2], x3
    usubl v17.8h, v2.8b, v3.8b
    usubl v18.8h, v4.8b, v5.8b
    add v16.8h, v16.8h, v17.8h
    ld1 {v2.8b}, [x0], x1
    ld1 {v3.8b}, [x2], x3
    add v16.8h, v16.8h, v18.8h
    b.gt 1b
    usubl v17.8h, v2.8b, v3.8b
    add v16.8h, v16.8h, v17.8h
    saddlv s0, v16.8h
    abs v0.2s, v0.2s
    fmov w0, s0
    ret
endfunc
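
// Sum of squared differences, pixel_ssd_WxH: same argument layout as the
// SAD functions; squared differences are accumulated in 32-bit lanes of
// v0 (and v1 for the 16-wide sizes) before the final reduction.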
.macro SSD_START_4
    ld1 {v16.s}[0], [x0], x1
    ld1 {v17.s}[0], [x2], x3
    usubl v2.8h, v16.8b, v17.8b
    ld1 {v16.s}[0], [x0], x1
    ld1 {v17.s}[0], [x2], x3
    smull v0.4s, v2.4h, v2.4h
.endm

.macro SSD_4
    usubl v2.8h, v16.8b, v17.8b
    ld1 {v16.s}[0], [x0], x1
    ld1 {v17.s}[0], [x2], x3
    smlal v0.4s, v2.4h, v2.4h
.endm

.macro SSD_END_4
    usubl v2.8h, v16.8b, v17.8b
    smlal v0.4s, v2.4h, v2.4h
.endm

.macro SSD_START_8
    ld1 {v16.8b}, [x0], x1
    ld1 {v17.8b}, [x2], x3
    usubl v2.8h, v16.8b, v17.8b
    ld1 {v16.8b}, [x0], x1
    smull v0.4s, v2.4h, v2.4h
    ld1 {v17.8b}, [x2], x3
    smlal2 v0.4s, v2.8h, v2.8h
.endm

.macro SSD_8
    usubl v2.8h, v16.8b, v17.8b
    ld1 {v16.8b}, [x0], x1
    smlal v0.4s, v2.4h, v2.4h
    ld1 {v17.8b}, [x2], x3
    smlal2 v0.4s, v2.8h, v2.8h
.endm

.macro SSD_END_8
    usubl v2.8h, v16.8b, v17.8b
    smlal v0.4s, v2.4h, v2.4h
    smlal2 v0.4s, v2.8h, v2.8h
.endm

.macro SSD_START_16
    ld1 {v16.16b}, [x0], x1
    ld1 {v17.16b}, [x2], x3
    usubl v2.8h, v16.8b, v17.8b
    usubl2 v3.8h, v16.16b, v17.16b
    ld1 {v16.16b}, [x0], x1
    smull v0.4s, v2.4h, v2.4h
    smull2 v1.4s, v2.8h, v2.8h
    ld1 {v17.16b}, [x2], x3
    smlal v0.4s, v3.4h, v3.4h
    smlal2 v1.4s, v3.8h, v3.8h
.endm

.macro SSD_16
    usubl v2.8h, v16.8b, v17.8b
    usubl2 v3.8h, v16.16b, v17.16b
    ld1 {v16.16b}, [x0], x1
    smlal v0.4s, v2.4h, v2.4h
    smlal2 v1.4s, v2.8h, v2.8h
    ld1 {v17.16b}, [x2], x3
    smlal v0.4s, v3.4h, v3.4h
    smlal2 v1.4s, v3.8h, v3.8h
.endm

.macro SSD_END_16
    usubl v2.8h, v16.8b, v17.8b
    usubl2 v3.8h, v16.16b, v17.16b
    smlal v0.4s, v2.4h, v2.4h
    smlal2 v1.4s, v2.8h, v2.8h
    smlal v0.4s, v3.4h, v3.4h
    smlal2 v1.4s, v3.8h, v3.8h
    add v0.4s, v0.4s, v1.4s
.endm

.macro SSD_FUNC w h
function pixel_ssd_\w\()x\h\()_neon, export=1
    SSD_START_\w
.rept \h-2
    SSD_\w
.endr
    SSD_END_\w
    addv s0, v0.4s
    mov w0, v0.s[0]
    ret
endfunc
.endm

SSD_FUNC 4, 4
SSD_FUNC 4, 8
SSD_FUNC 4, 16
SSD_FUNC 8, 4
SSD_FUNC 8, 8
SSD_FUNC 8, 16
SSD_FUNC 16, 8
SSD_FUNC 16, 16
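
// SSD over an interleaved NV12 chroma plane: x0/x1 and x2/x3 are the two
// planes with strides, w4 is the width in pixels and w5 the height; the
// per-channel 64-bit sums are written to [x6] and [x7].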
function pixel_ssd_nv12_core_neon, export=1
    sxtw x8, w4
    add x8, x8, #8
    and x8, x8, #~15
    movi v6.2d, #0
    movi v7.2d, #0
    sub x1, x1, x8, lsl #1
    sub x3, x3, x8, lsl #1
1:
    subs w8, w4, #16
    ld2 {v0.8b,v1.8b}, [x0], #16
    ld2 {v2.8b,v3.8b}, [x2], #16
    ld2 {v24.8b,v25.8b}, [x0], #16
    ld2 {v26.8b,v27.8b}, [x2], #16
    usubl v16.8h, v0.8b, v2.8b
    usubl v17.8h, v1.8b, v3.8b
    smull v20.4s, v16.4h, v16.4h
    smull v21.4s, v17.4h, v17.4h
    usubl v18.8h, v24.8b, v26.8b
    usubl v19.8h, v25.8b, v27.8b
    smlal2 v20.4s, v16.8h, v16.8h
    smlal2 v21.4s, v17.8h, v17.8h
    b.lt 4f
    b.eq 3f
2:
    smlal v20.4s, v18.4h, v18.4h
    smlal v21.4s, v19.4h, v19.4h
    ld2 {v0.8b,v1.8b}, [x0], #16
    ld2 {v2.8b,v3.8b}, [x2], #16
    smlal2 v20.4s, v18.8h, v18.8h
    smlal2 v21.4s, v19.8h, v19.8h
    subs w8, w8, #16
    usubl v16.8h, v0.8b, v2.8b
    usubl v17.8h, v1.8b, v3.8b
    smlal v20.4s, v16.4h, v16.4h
    smlal v21.4s, v17.4h, v17.4h
    ld2 {v24.8b,v25.8b}, [x0], #16
    ld2 {v26.8b,v27.8b}, [x2], #16
    smlal2 v20.4s, v16.8h, v16.8h
    smlal2 v21.4s, v17.8h, v17.8h
    b.lt 4f
    usubl v18.8h, v24.8b, v26.8b
    usubl v19.8h, v25.8b, v27.8b
    b.gt 2b
3:
    smlal v20.4s, v18.4h, v18.4h
    smlal v21.4s, v19.4h, v19.4h
    smlal2 v20.4s, v18.8h, v18.8h
    smlal2 v21.4s, v19.8h, v19.8h
4:
    subs w5, w5, #1
    uaddw v6.2d, v6.2d, v20.2s
    uaddw v7.2d, v7.2d, v21.2s
    add x0, x0, x1
    add x2, x2, x3
    uaddw2 v6.2d, v6.2d, v20.4s
    uaddw2 v7.2d, v7.2d, v21.4s
    b.gt 1b
    addp v6.2d, v6.2d, v7.2d
    st1 {v6.d}[0], [x6]
    st1 {v6.d}[1], [x7]
    ret
endfunc
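
// pixel_var_WxH: sum and sum of squares of one block (x0, stride x1),
// packed into a single return value with the pixel sum in the low 32 bits
// and the sum of squares in the high 32 bits (see var_end).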
.macro pixel_var_8 h
function pixel_var_8x\h\()_neon, export=1
    ld1 {v16.8b}, [x0], x1
    ld1 {v17.8b}, [x0], x1
    mov x2, \h - 4
    umull v1.8h, v16.8b, v16.8b
    uxtl v0.8h, v16.8b
    umull v2.8h, v17.8b, v17.8b
    uaddw v0.8h, v0.8h, v17.8b
    ld1 {v18.8b}, [x0], x1
    uaddlp v1.4s, v1.8h
    uaddlp v2.4s, v2.8h
    ld1 {v19.8b}, [x0], x1
1:  subs x2, x2, #4
    uaddw v0.8h, v0.8h, v18.8b
    umull v24.8h, v18.8b, v18.8b
    ld1 {v20.8b}, [x0], x1
    uaddw v0.8h, v0.8h, v19.8b
    umull v25.8h, v19.8b, v19.8b
    uadalp v1.4s, v24.8h
    ld1 {v21.8b}, [x0], x1
    uaddw v0.8h, v0.8h, v20.8b
    umull v26.8h, v20.8b, v20.8b
    uadalp v2.4s, v25.8h
    ld1 {v18.8b}, [x0], x1
    uaddw v0.8h, v0.8h, v21.8b
    umull v27.8h, v21.8b, v21.8b
    uadalp v1.4s, v26.8h
    ld1 {v19.8b}, [x0], x1
    uadalp v2.4s, v27.8h
    b.gt 1b
    uaddw v0.8h, v0.8h, v18.8b
    umull v28.8h, v18.8b, v18.8b
    uaddw v0.8h, v0.8h, v19.8b
    umull v29.8h, v19.8b, v19.8b
    uadalp v1.4s, v28.8h
    uadalp v2.4s, v29.8h
    b var_end
endfunc
.endm

pixel_var_8 8
pixel_var_8 16

function pixel_var_16x16_neon, export=1
    ld1 {v16.16b}, [x0], x1
    ld1 {v17.16b}, [x0], x1
    mov x2, #14
    umull v1.8h, v16.8b, v16.8b
    umull2 v2.8h, v16.16b, v16.16b
    uxtl v0.8h, v16.8b
    uaddlp v1.4s, v1.8h
    uaddlp v2.4s, v2.8h
    uaddw2 v0.8h, v0.8h, v16.16b
1:  subs x2, x2, #2
    ld1 {v18.16b}, [x0], x1
    uaddw v0.8h, v0.8h, v17.8b
    umull v3.8h, v17.8b, v17.8b
    uaddw2 v0.8h, v0.8h, v17.16b
    umull2 v4.8h, v17.16b, v17.16b
    uadalp v1.4s, v3.8h
    uadalp v2.4s, v4.8h
    ld1 {v17.16b}, [x0], x1
    uaddw v0.8h, v0.8h, v18.8b
    umull v5.8h, v18.8b, v18.8b
    uaddw2 v0.8h, v0.8h, v18.16b
    umull2 v6.8h, v18.16b, v18.16b
    uadalp v1.4s, v5.8h
    uadalp v2.4s, v6.8h
    b.gt 1b
    uaddw v0.8h, v0.8h, v17.8b
    umull v3.8h, v17.8b, v17.8b
    uaddw2 v0.8h, v0.8h, v17.16b
    umull2 v4.8h, v17.16b, v17.16b
    uadalp v1.4s, v3.8h
    uadalp v2.4s, v4.8h
endfunc

function var_end
    add v1.4s, v1.4s, v2.4s
    uaddlv s0, v0.8h
    uaddlv d1, v1.4s
    mov w0, v0.s[0]
    mov x1, v1.d[0]
    orr x0, x0, x1, lsl #32
    ret
endfunc
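
// pixel_var2_8xH: variance of the difference between two blocks that hold
// two 8-pixel-wide sub-blocks side by side (presumably the U and V chroma
// halves: x0 advances 16 bytes per row, x1 advances 32, per the
// post-increments below). The two per-sub-block SSDs are stored to [x2]
// and [x2,#4]; w0 returns the summed variance of both halves.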
.macro pixel_var2_8 h
function pixel_var2_8x\h\()_neon, export=1
    mov x3, #16
    ld1 {v16.8b}, [x0], #8
    ld1 {v18.8b}, [x1], x3
    ld1 {v17.8b}, [x0], #8
    ld1 {v19.8b}, [x1], x3
    mov x5, \h - 2
    usubl v0.8h, v16.8b, v18.8b
    usubl v1.8h, v17.8b, v19.8b
    ld1 {v16.8b}, [x0], #8
    ld1 {v18.8b}, [x1], x3
    smull v2.4s, v0.4h, v0.4h
    smull2 v3.4s, v0.8h, v0.8h
    smull v4.4s, v1.4h, v1.4h
    smull2 v5.4s, v1.8h, v1.8h
    usubl v6.8h, v16.8b, v18.8b
1:  subs x5, x5, #1
    ld1 {v17.8b}, [x0], #8
    ld1 {v19.8b}, [x1], x3
    smlal v2.4s, v6.4h, v6.4h
    smlal2 v3.4s, v6.8h, v6.8h
    usubl v7.8h, v17.8b, v19.8b
    add v0.8h, v0.8h, v6.8h
    ld1 {v16.8b}, [x0], #8
    ld1 {v18.8b}, [x1], x3
    smlal v4.4s, v7.4h, v7.4h
    smlal2 v5.4s, v7.8h, v7.8h
    usubl v6.8h, v16.8b, v18.8b
    add v1.8h, v1.8h, v7.8h
    b.gt 1b
    ld1 {v17.8b}, [x0], #8
    ld1 {v19.8b}, [x1], x3
    smlal v2.4s, v6.4h, v6.4h
    smlal2 v3.4s, v6.8h, v6.8h
    usubl v7.8h, v17.8b, v19.8b
    add v0.8h, v0.8h, v6.8h
    smlal v4.4s, v7.4h, v7.4h
    add v1.8h, v1.8h, v7.8h
    smlal2 v5.4s, v7.8h, v7.8h
    saddlv s0, v0.8h
    saddlv s1, v1.8h
    add v2.4s, v2.4s, v3.4s
    add v4.4s, v4.4s, v5.4s
    mov w0, v0.s[0]
    mov w1, v1.s[0]
    addv s2, v2.4s
    addv s4, v4.4s
    mul w0, w0, w0
    mul w1, w1, w1
    mov w3, v2.s[0]
    mov w4, v4.s[0]
    sub w0, w3, w0, lsr # 6 + (\h >> 4)
    sub w1, w4, w1, lsr # 6 + (\h >> 4)
    str w3, [x2]
    add w0, w0, w1
    str w4, [x2, #4]
    ret
endfunc
.endm

pixel_var2_8 8
pixel_var2_8 16
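
// SATD (sum of absolute transformed differences, 4x4 Hadamard):
//   x0/x1: first block and stride, x2/x3: second block and stride.
// The larger sizes are built from the shared 8x8 and 16x4 helpers further
// down, which leave per-call partial sums in v0-v3 for the caller to reduce.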
function pixel_satd_4x4_neon, export=1
    ld1 {v1.s}[0], [x2], x3
    ld1 {v0.s}[0], [x0], x1
    ld1 {v3.s}[0], [x2], x3
    ld1 {v2.s}[0], [x0], x1
    ld1 {v1.s}[1], [x2], x3
    ld1 {v0.s}[1], [x0], x1
    ld1 {v3.s}[1], [x2], x3
    ld1 {v2.s}[1], [x0], x1
    usubl v0.8h, v0.8b, v1.8b
    usubl v1.8h, v2.8b, v3.8b
    SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
    zip1 v0.2d, v2.2d, v3.2d
    zip2 v1.2d, v2.2d, v3.2d
    SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
    trn1 v0.8h, v2.8h, v3.8h
    trn2 v1.8h, v2.8h, v3.8h
    SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h
    trn1 v0.4s, v2.4s, v3.4s
    trn2 v1.4s, v2.4s, v3.4s
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    umax v0.8h, v0.8h, v1.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret
endfunc

function pixel_satd_4x8_neon, export=1
    ld1 {v1.s}[0], [x2], x3
    ld1 {v0.s}[0], [x0], x1
    ld1 {v3.s}[0], [x2], x3
    ld1 {v2.s}[0], [x0], x1
    ld1 {v5.s}[0], [x2], x3
    ld1 {v4.s}[0], [x0], x1
    ld1 {v7.s}[0], [x2], x3
    ld1 {v6.s}[0], [x0], x1
    ld1 {v1.s}[1], [x2], x3
    ld1 {v0.s}[1], [x0], x1
    ld1 {v3.s}[1], [x2], x3
    ld1 {v2.s}[1], [x0], x1
    ld1 {v5.s}[1], [x2], x3
    ld1 {v4.s}[1], [x0], x1
    ld1 {v7.s}[1], [x2], x3
    ld1 {v6.s}[1], [x0], x1
    b satd_4x8_8x4_end_neon
endfunc

function pixel_satd_8x4_neon, export=1
    ld1 {v1.8b}, [x2], x3
    ld1 {v0.8b}, [x0], x1
    ld1 {v3.8b}, [x2], x3
    ld1 {v2.8b}, [x0], x1
    ld1 {v5.8b}, [x2], x3
    ld1 {v4.8b}, [x0], x1
    ld1 {v7.8b}, [x2], x3
    ld1 {v6.8b}, [x0], x1
endfunc

function satd_4x8_8x4_end_neon
    usubl v0.8h, v0.8b, v1.8b
    usubl v1.8h, v2.8b, v3.8b
    usubl v2.8h, v4.8b, v5.8b
    usubl v3.8h, v6.8b, v7.8b
    SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
    trn1 v0.8h, v4.8h, v5.8h
    trn2 v1.8h, v4.8h, v5.8h
    trn1 v2.8h, v6.8h, v7.8h
    trn2 v3.8h, v6.8h, v7.8h
    SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
    trn1 v0.4s, v16.4s, v18.4s
    trn2 v1.4s, v16.4s, v18.4s
    trn1 v2.4s, v17.4s, v19.4s
    trn2 v3.4s, v17.4s, v19.4s
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    abs v2.8h, v2.8h
    abs v3.8h, v3.8h
    umax v0.8h, v0.8h, v1.8h
    umax v1.8h, v2.8h, v3.8h
    add v0.8h, v0.8h, v1.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret
endfunc

function pixel_satd_8x8_neon, export=1
    mov x4, x30
    bl satd_8x8_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v0.8h, v0.8h, v1.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret x4
endfunc

function pixel_satd_8x16_neon, export=1
    mov x4, x30
    bl satd_8x8_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v30.8h, v0.8h, v1.8h
    bl satd_8x8_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v31.8h, v0.8h, v1.8h
    add v0.8h, v30.8h, v31.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret x4
endfunc

.macro SUMSUBL_AB sum, sub, a, b
    uaddl \sum, \a, \b
    usubl \sub, \a, \b
.endm

.macro load_diff_fly_8x8
    ld1 {v1.8b}, [x2], x3
    ld1 {v0.8b}, [x0], x1
    ld1 {v3.8b}, [x2], x3
    ld1 {v2.8b}, [x0], x1
    usubl v16.8h, v0.8b, v1.8b
    ld1 {v5.8b}, [x2], x3
    ld1 {v4.8b}, [x0], x1
    usubl v17.8h, v2.8b, v3.8b
    ld1 {v7.8b}, [x2], x3
    ld1 {v6.8b}, [x0], x1
    usubl v18.8h, v4.8b, v5.8b
    ld1 {v1.8b}, [x2], x3
    ld1 {v0.8b}, [x0], x1
    usubl v19.8h, v6.8b, v7.8b
    ld1 {v3.8b}, [x2], x3
    ld1 {v2.8b}, [x0], x1
    usubl v20.8h, v0.8b, v1.8b
    ld1 {v5.8b}, [x2], x3
    ld1 {v4.8b}, [x0], x1
    usubl v21.8h, v2.8b, v3.8b
    ld1 {v7.8b}, [x2], x3
    ld1 {v6.8b}, [x0], x1
    SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
    usubl v22.8h, v4.8b, v5.8b
    usubl v23.8h, v6.8b, v7.8b
.endm

.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
    SUMSUB_AB \s1, \d1, \a, \b
    SUMSUB_AB \s2, \d2, \c, \d
.endm

.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm

function satd_8x8_neon
    load_diff_fly_8x8
endfunc

// one vertical hadamard pass and two horizontal
function satd_8x4v_8x8h_neon
    SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
    transpose v0.8h, v1.8h, v16.8h, v17.8h
    transpose v2.8h, v3.8h, v18.8h, v19.8h
    transpose v4.8h, v5.8h, v20.8h, v21.8h
    transpose v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
    transpose v0.4s, v2.4s, v16.4s, v18.4s
    transpose v1.4s, v3.4s, v17.4s, v19.4s
    transpose v4.4s, v6.4s, v20.4s, v22.4s
    transpose v5.4s, v7.4s, v21.4s, v23.4s
    abs v0.8h, v0.8h
    abs v1.8h, v1.8h
    abs v2.8h, v2.8h
    abs v3.8h, v3.8h
    abs v4.8h, v4.8h
    abs v5.8h, v5.8h
    abs v6.8h, v6.8h
    abs v7.8h, v7.8h
    umax v0.8h, v0.8h, v2.8h
    umax v1.8h, v1.8h, v3.8h
    umax v2.8h, v4.8h, v6.8h
    umax v3.8h, v5.8h, v7.8h
    ret
endfunc

function pixel_satd_16x8_neon, export=1
    mov x4, x30
    bl satd_16x4_neon
    add v30.8h, v0.8h, v1.8h
    add v31.8h, v2.8h, v3.8h
    bl satd_16x4_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v30.8h, v30.8h, v0.8h
    add v31.8h, v31.8h, v1.8h
    add v0.8h, v30.8h, v31.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret x4
endfunc

function pixel_satd_16x16_neon, export=1
    mov x4, x30
    bl satd_16x4_neon
    add v30.8h, v0.8h, v1.8h
    add v31.8h, v2.8h, v3.8h
    bl satd_16x4_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v30.8h, v30.8h, v0.8h
    add v31.8h, v31.8h, v1.8h
    bl satd_16x4_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v30.8h, v30.8h, v0.8h
    add v31.8h, v31.8h, v1.8h
    bl satd_16x4_neon
    add v0.8h, v0.8h, v1.8h
    add v1.8h, v2.8h, v3.8h
    add v30.8h, v30.8h, v0.8h
    add v31.8h, v31.8h, v1.8h
    add v0.8h, v30.8h, v31.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret x4
endfunc

function satd_16x4_neon
    ld1 {v1.16b}, [x2], x3
    ld1 {v0.16b}, [x0], x1
    ld1 {v3.16b}, [x2], x3
    ld1 {v2.16b}, [x0], x1
    usubl v16.8h, v0.8b, v1.8b
    usubl2 v20.8h, v0.16b, v1.16b
    ld1 {v5.16b}, [x2], x3
    ld1 {v4.16b}, [x0], x1
    usubl v17.8h, v2.8b, v3.8b
    usubl2 v21.8h, v2.16b, v3.16b
    ld1 {v7.16b}, [x2], x3
    ld1 {v6.16b}, [x0], x1
    usubl v18.8h, v4.8b, v5.8b
    usubl2 v22.8h, v4.16b, v5.16b
    usubl v19.8h, v6.8b, v7.8b
    usubl2 v23.8h, v6.16b, v7.16b
    SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
    b satd_8x4v_8x8h_neon
endfunc

function pixel_satd_4x16_neon, export=1
    mov x4, x30
    ld1 {v1.s}[0], [x2], x3
    ld1 {v0.s}[0], [x0], x1
    ld1 {v3.s}[0], [x2], x3
    ld1 {v2.s}[0], [x0], x1
    ld1 {v5.s}[0], [x2], x3
    ld1 {v4.s}[0], [x0], x1
    ld1 {v7.s}[0], [x2], x3
    ld1 {v6.s}[0], [x0], x1
    ld1 {v1.s}[1], [x2], x3
    ld1 {v0.s}[1], [x0], x1
    ld1 {v3.s}[1], [x2], x3
    ld1 {v2.s}[1], [x0], x1
    ld1 {v5.s}[1], [x2], x3
    ld1 {v4.s}[1], [x0], x1
    ld1 {v7.s}[1], [x2], x3
    ld1 {v6.s}[1], [x0], x1
    usubl v16.8h, v0.8b, v1.8b
    usubl v17.8h, v2.8b, v3.8b
    usubl v18.8h, v4.8b, v5.8b
    usubl v19.8h, v6.8b, v7.8b
    ld1 {v1.s}[0], [x2], x3
    ld1 {v0.s}[0], [x0], x1
    ld1 {v3.s}[0], [x2], x3
    ld1 {v2.s}[0], [x0], x1
    ld1 {v5.s}[0], [x2], x3
    ld1 {v4.s}[0], [x0], x1
    ld1 {v7.s}[0], [x2], x3
    ld1 {v6.s}[0], [x0], x1
    ld1 {v1.s}[1], [x2], x3
    ld1 {v0.s}[1], [x0], x1
    ld1 {v3.s}[1], [x2], x3
    ld1 {v2.s}[1], [x0], x1
    ld1 {v5.s}[1], [x2], x3
    ld1 {v4.s}[1], [x0], x1
    ld1 {v7.s}[1], [x2], x3
    ld1 {v6.s}[1], [x0], x1
    usubl v20.8h, v0.8b, v1.8b
    usubl v21.8h, v2.8b, v3.8b
    usubl v22.8h, v4.8b, v5.8b
    usubl v23.8h, v6.8b, v7.8b
    SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
    SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
    bl satd_8x4v_8x8h_neon
    add v30.8h, v0.8h, v1.8h
    add v31.8h, v2.8h, v3.8h
    add v0.8h, v30.8h, v31.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    ret x4
endfunc
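
// SA8D (8x8 Hadamard): same argument layout as SATD. The exported wrappers
// branch to the non-exported 8x8 kernel generated by the sa8d_satd_8x8
// macro below (the exported symbol name is prefixed by asm.S, so the plain
// label resolves to the local copy) and apply the final (sum + 1) >> 1
// rounding; pixel_sa8d_satd_16x16 packs sa8d into the low 32 bits of x0
// and satd into the high 32 bits.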
function pixel_sa8d_8x8_neon, export=1
    mov x4, x30
    bl pixel_sa8d_8x8_neon
    add v0.8h, v0.8h, v1.8h
    uaddlv s0, v0.8h
    mov w0, v0.s[0]
    add w0, w0, #1
    lsr w0, w0, #1
    ret x4
endfunc

function pixel_sa8d_16x16_neon, export=1
    mov x4, x30
    bl pixel_sa8d_8x8_neon
    uaddlp v30.4s, v0.8h
    uaddlp v31.4s, v1.8h
    bl pixel_sa8d_8x8_neon
    uadalp v30.4s, v0.8h
    uadalp v31.4s, v1.8h
    sub x0, x0, x1, lsl #4
    sub x2, x2, x3, lsl #4
    add x0, x0, #8
    add x2, x2, #8
    bl pixel_sa8d_8x8_neon
    uadalp v30.4s, v0.8h
    uadalp v31.4s, v1.8h
    bl pixel_sa8d_8x8_neon
    uadalp v30.4s, v0.8h
    uadalp v31.4s, v1.8h
    add v0.4s, v30.4s, v31.4s
    addv s0, v0.4s
    mov w0, v0.s[0]
    add w0, w0, #1
    lsr w0, w0, #1
    ret x4
endfunc

.macro sa8d_satd_8x8 satd=
function pixel_sa8d_\satd\()8x8_neon
    load_diff_fly_8x8
    SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
.ifc \satd, satd_
    transpose v0.8h, v1.8h, v16.8h, v17.8h
    transpose v2.8h, v3.8h, v18.8h, v19.8h
    transpose v4.8h, v5.8h, v20.8h, v21.8h
    transpose v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
    SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
    SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
    SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
    transpose v4.4s, v6.4s, v24.4s, v26.4s
    transpose v5.4s, v7.4s, v25.4s, v27.4s
    transpose v24.4s, v26.4s, v0.4s, v2.4s
    transpose v25.4s, v27.4s, v1.4s, v3.4s
    abs v0.8h, v4.8h
    abs v1.8h, v5.8h
    abs v2.8h, v6.8h
    abs v3.8h, v7.8h
    abs v4.8h, v24.8h
    abs v5.8h, v25.8h
    abs v6.8h, v26.8h
    abs v7.8h, v27.8h
    umax v0.8h, v0.8h, v2.8h
    umax v1.8h, v1.8h, v3.8h
    umax v2.8h, v4.8h, v6.8h
    umax v3.8h, v5.8h, v7.8h
    add v26.8h, v0.8h, v1.8h
    add v27.8h, v2.8h, v3.8h
.endif
    SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
    SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
    SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
    SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
    transpose v20.8h, v21.8h, v16.8h, v17.8h
    transpose v4.8h, v5.8h, v0.8h, v1.8h
    transpose v22.8h, v23.8h, v18.8h, v19.8h
    transpose v6.8h, v7.8h, v2.8h, v3.8h
    SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
    SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
    SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
    SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
    transpose v20.4s, v22.4s, v2.4s, v0.4s
    transpose v21.4s, v23.4s, v3.4s, v1.4s
    transpose v16.4s, v18.4s, v24.4s, v4.4s
    transpose v17.4s, v19.4s, v25.4s, v5.4s
    SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
    SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
    SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
    SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
    transpose v16.2d, v20.2d, v0.2d, v4.2d
    transpose v17.2d, v21.2d, v1.2d, v5.2d
    transpose v18.2d, v22.2d, v2.2d, v6.2d
    transpose v19.2d, v23.2d, v3.2d, v7.2d
    abs v16.8h, v16.8h
    abs v20.8h, v20.8h
    abs v17.8h, v17.8h
    abs v21.8h, v21.8h
    abs v18.8h, v18.8h
    abs v22.8h, v22.8h
    abs v19.8h, v19.8h
    abs v23.8h, v23.8h
    umax v16.8h, v16.8h, v20.8h
    umax v17.8h, v17.8h, v21.8h
    umax v18.8h, v18.8h, v22.8h
    umax v19.8h, v19.8h, v23.8h
    add v0.8h, v16.8h, v17.8h
    add v1.8h, v18.8h, v19.8h
    ret
endfunc
.endm

sa8d_satd_8x8
sa8d_satd_8x8 satd_

function pixel_sa8d_satd_16x16_neon, export=1
    mov x4, x30
    bl pixel_sa8d_satd_8x8_neon
    uaddlp v30.4s, v0.8h
    uaddlp v31.4s, v1.8h
    uaddlp v28.4s, v26.8h
    uaddlp v29.4s, v27.8h
    bl pixel_sa8d_satd_8x8_neon
    uadalp v30.4s, v0.8h
    uadalp v31.4s, v1.8h
    uadalp v28.4s, v26.8h
    uadalp v29.4s, v27.8h
    sub x0, x0, x1, lsl #4
    sub x2, x2, x3, lsl #4
    add x0, x0, #8
    add x2, x2, #8
    bl pixel_sa8d_satd_8x8_neon
    uadalp v30.4s, v0.8h
    uadalp v31.4s, v1.8h
    uadalp v28.4s, v26.8h
    uadalp v29.4s, v27.8h
    bl pixel_sa8d_satd_8x8_neon
    uadalp v30.4s, v0.8h
    uadalp v31.4s, v1.8h
    uadalp v28.4s, v26.8h
    uadalp v29.4s, v27.8h
    add v0.4s, v30.4s, v31.4s // sa8d
    add v1.4s, v28.4s, v29.4s // satd
    addv s0, v0.4s
    addv s1, v1.4s
    urshr v0.4s, v0.4s, #1
    fmov w0, s0
    fmov w1, s1
    add x0, x0, x1, lsl #32
    ret x4
endfunc
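
// pixel_hadamard_ac_WxH: AC energy of the 4x4 and 8x8 Hadamard transforms
// of a single block (x0, stride x1). v28 accumulates the 4x4 sums and v29
// the 8x8 sums (see the register note above hadamard_ac_8x8_neon); the two
// scaled totals are packed into the low and high halves of x0.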
.macro HADAMARD_AC w h
function pixel_hadamard_ac_\w\()x\h\()_neon, export=1
    movrel x5, mask_ac_4_8
    mov x4, x30
    ld1 {v30.8h,v31.8h}, [x5]
    movi v28.16b, #0
    movi v29.16b, #0
    bl hadamard_ac_8x8_neon
.if \h > 8
    bl hadamard_ac_8x8_neon
.endif
.if \w > 8
    sub x0, x0, x1, lsl #3
    add x0, x0, #8
    bl hadamard_ac_8x8_neon
.endif
.if \w * \h == 256
    sub x0, x0, x1, lsl #4
    bl hadamard_ac_8x8_neon
.endif
    addv s1, v29.4s
    addv s0, v28.4s
    mov w1, v1.s[0]
    mov w0, v0.s[0]
    lsr w1, w1, #2
    lsr w0, w0, #1
    orr x0, x0, x1, lsl #32
    ret x4
endfunc
.endm

HADAMARD_AC 8, 8
HADAMARD_AC 8, 16
HADAMARD_AC 16, 8
HADAMARD_AC 16, 16

// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
function hadamard_ac_8x8_neon
    ld1 {v16.8b}, [x0], x1
    ld1 {v17.8b}, [x0], x1
    ld1 {v18.8b}, [x0], x1
    ld1 {v19.8b}, [x0], x1
    SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b
    ld1 {v20.8b}, [x0], x1
    ld1 {v21.8b}, [x0], x1
    SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b
    ld1 {v22.8b}, [x0], x1
    ld1 {v23.8b}, [x0], x1
    SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b
    SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b
    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    transpose v0.8h, v1.8h, v16.8h, v17.8h
    transpose v2.8h, v3.8h, v18.8h, v19.8h
    transpose v4.8h, v5.8h, v20.8h, v21.8h
    transpose v6.8h, v7.8h, v22.8h, v23.8h
    SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
    SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
    SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
    SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
    transpose v0.4s, v2.4s, v16.4s, v18.4s
    transpose v1.4s, v3.4s, v17.4s, v19.4s
    transpose v4.4s, v6.4s, v20.4s, v22.4s
    transpose v5.4s, v7.4s, v21.4s, v23.4s
    SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
    SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
    abs v0.8h, v16.8h
    abs v4.8h, v20.8h
    abs v1.8h, v17.8h
    abs v5.8h, v21.8h
    abs v2.8h, v18.8h
    abs v6.8h, v22.8h
    abs v3.8h, v19.8h
    abs v7.8h, v23.8h
    add v0.8h, v0.8h, v4.8h
    add v1.8h, v1.8h, v5.8h
    and v0.16b, v0.16b, v30.16b
    add v2.8h, v2.8h, v6.8h
    add v3.8h, v3.8h, v7.8h
    add v0.8h, v0.8h, v2.8h
    add v1.8h, v1.8h, v3.8h
    uadalp v28.4s, v0.8h
    uadalp v28.4s, v1.8h
    SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
    SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
    SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
    SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
    transpose v16.2d, v17.2d, v6.2d, v7.2d
    transpose v18.2d, v19.2d, v4.2d, v5.2d
    transpose v20.2d, v21.2d, v2.2d, v3.2d
    abs v16.8h, v16.8h
    abs v17.8h, v17.8h
    abs v18.8h, v18.8h
    abs v19.8h, v19.8h
    abs v20.8h, v20.8h
    abs v21.8h, v21.8h
    transpose v7.2d, v6.2d, v1.2d, v0.2d
    umax v3.8h, v16.8h, v17.8h
    umax v2.8h, v18.8h, v19.8h
    umax v1.8h, v20.8h, v21.8h
    SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
    add v2.8h, v2.8h, v3.8h
    add v2.8h, v2.8h, v1.8h
    and v4.16b, v4.16b, v31.16b
    add v2.8h, v2.8h, v2.8h
    abs v5.8h, v5.8h
    abs v4.8h, v4.8h
    add v2.8h, v2.8h, v5.8h
    add v2.8h, v2.8h, v4.8h
    uadalp v29.4s, v2.8h
    ret
endfunc
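
// SSIM 4x4x2 core: x0/x1 and x2/x3 are the two images with strides, x4
// points to the output sums. For the two adjacent 4x4 blocks this stores,
// per block, { sum(a), sum(b), sum(a*a)+sum(b*b), sum(a*b) } via the st4
// at the end.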
function pixel_ssim_4x4x2_core_neon, export=1
    ld1 {v0.8b}, [x0], x1
    ld1 {v2.8b}, [x2], x3
    umull v16.8h, v0.8b, v0.8b
    umull v17.8h, v0.8b, v2.8b
    umull v18.8h, v2.8b, v2.8b
    ld1 {v28.8b}, [x0], x1
    ld1 {v29.8b}, [x2], x3
    umull v20.8h, v28.8b, v28.8b
    umull v21.8h, v28.8b, v29.8b
    umull v22.8h, v29.8b, v29.8b
    uaddlp v16.4s, v16.8h
    uaddlp v17.4s, v17.8h
    uaddl v0.8h, v0.8b, v28.8b
    uadalp v16.4s, v18.8h
    uaddl v1.8h, v2.8b, v29.8b
    ld1 {v26.8b}, [x0], x1
    ld1 {v27.8b}, [x2], x3
    umull v23.8h, v26.8b, v26.8b
    umull v24.8h, v26.8b, v27.8b
    umull v25.8h, v27.8b, v27.8b
    uadalp v16.4s, v20.8h
    uaddw v0.8h, v0.8h, v26.8b
    uadalp v17.4s, v21.8h
    uaddw v1.8h, v1.8h, v27.8b
    uadalp v16.4s, v22.8h
    ld1 {v28.8b}, [x0], x1
    ld1 {v29.8b}, [x2], x3
    umull v20.8h, v28.8b, v28.8b
    umull v21.8h, v28.8b, v29.8b
    umull v22.8h, v29.8b, v29.8b
    uadalp v16.4s, v23.8h
    uaddw v0.8h, v0.8h, v28.8b
    uadalp v17.4s, v24.8h
    uaddw v1.8h, v1.8h, v29.8b
    uadalp v16.4s, v25.8h
    uadalp v16.4s, v20.8h
    uadalp v17.4s, v21.8h
    uadalp v16.4s, v22.8h
    uaddlp v0.4s, v0.8h
    uaddlp v1.4s, v1.8h
    addp v0.4s, v0.4s, v0.4s
    addp v1.4s, v1.4s, v1.4s
    addp v2.4s, v16.4s, v16.4s
    addp v3.4s, v17.4s, v17.4s
    st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
    ret
endfunc
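
// pixel_ssim_end4: combines up to four of the 4x4x2 partial sums from the
// two sum arrays at x0 and x1 into per-block SSIM terms; w2 is the number
// of valid blocks, and the mask constant zeroes the unused lanes before
// the final horizontal add. The summed SSIM value is returned in s0.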
function pixel_ssim_end4_neon, export=1
    mov x5, #4
    ld1 {v16.4s,v17.4s}, [x0], #32
    ld1 {v18.4s,v19.4s}, [x1], #32
    mov w4, #0x99bb
    subs x2, x5, w2, uxtw
    mov w3, #416 // ssim_c1 = .01*.01*255*255*64
    movk w4, #0x03, lsl #16 // ssim_c2 = .03*.03*255*255*64*63
    add v0.4s, v16.4s, v18.4s
    add v1.4s, v17.4s, v19.4s
    add v0.4s, v0.4s, v1.4s
    ld1 {v20.4s,v21.4s}, [x0], #32
    ld1 {v22.4s,v23.4s}, [x1], #32
    add v2.4s, v20.4s, v22.4s
    add v3.4s, v21.4s, v23.4s
    add v1.4s, v1.4s, v2.4s
    ld1 {v16.4s}, [x0], #16
    ld1 {v18.4s}, [x1], #16
    add v16.4s, v16.4s, v18.4s
    add v2.4s, v2.4s, v3.4s
    add v3.4s, v3.4s, v16.4s
    dup v30.4s, w3
    dup v31.4s, w4
    transpose v4.4s, v5.4s, v0.4s, v1.4s
    transpose v6.4s, v7.4s, v2.4s, v3.4s
    transpose v0.2d, v2.2d, v4.2d, v6.2d
    transpose v1.2d, v3.2d, v5.2d, v7.2d
    mul v16.4s, v0.4s, v1.4s // s1*s2
    mul v0.4s, v0.4s, v0.4s
    mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2
    shl v3.4s, v3.4s, #7
    shl v2.4s, v2.4s, #6
    add v1.4s, v16.4s, v16.4s
    sub v2.4s, v2.4s, v0.4s // vars
    sub v3.4s, v3.4s, v1.4s // covar*2
    add v0.4s, v0.4s, v30.4s
    add v2.4s, v2.4s, v31.4s
    add v1.4s, v1.4s, v30.4s
    add v3.4s, v3.4s, v31.4s
    scvtf v0.4s, v0.4s
    scvtf v2.4s, v2.4s
    scvtf v1.4s, v1.4s
    scvtf v3.4s, v3.4s
    fmul v0.4s, v0.4s, v2.4s
    fmul v1.4s, v1.4s, v3.4s
    fdiv v0.4s, v1.4s, v0.4s
    b.eq 1f
    movrel x3, mask
    add x3, x3, x2, lsl #2
    ld1 {v29.4s}, [x3]
    and v0.16b, v0.16b, v29.16b
1:
    faddp v0.4s, v0.4s, v0.4s
    faddp s0, v0.2s
    ret
endfunc