/*****************************************************************************
 * predict.S: aarch64 intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2018 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "asm.S"

const p8weight, align=4
.short 1, 2, 3, 4, 1, 2, 3, 4
endconst
const p16weight, align=4
.short 1, 2, 3, 4, 5, 6, 7, 8
endconst
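
// Helper macros: load a column of bytes from [\xn], advancing by the stride
// in \xm after each byte, into successive lanes of \vd. Used below to gather
// the left-neighbour column of a block.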
.macro ldcol.8 vd, xn, xm, n=8, hi=0
.if \n == 8 || \hi == 0
    ld1         {\vd\().b}[0], [\xn], \xm
    ld1         {\vd\().b}[1], [\xn], \xm
    ld1         {\vd\().b}[2], [\xn], \xm
    ld1         {\vd\().b}[3], [\xn], \xm
.endif
.if \n == 8 || \hi == 1
    ld1         {\vd\().b}[4], [\xn], \xm
    ld1         {\vd\().b}[5], [\xn], \xm
    ld1         {\vd\().b}[6], [\xn], \xm
    ld1         {\vd\().b}[7], [\xn], \xm
.endif
.endm

.macro ldcol.16 vd, xn, xm
    ldcol.8     \vd, \xn, \xm
    ld1         {\vd\().b}[ 8], [\xn], \xm
    ld1         {\vd\().b}[ 9], [\xn], \xm
    ld1         {\vd\().b}[10], [\xn], \xm
    ld1         {\vd\().b}[11], [\xn], \xm
    ld1         {\vd\().b}[12], [\xn], \xm
    ld1         {\vd\().b}[13], [\xn], \xm
    ld1         {\vd\().b}[14], [\xn], \xm
    ld1         {\vd\().b}[15], [\xn], \xm
.endm
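
// 4x4 horizontal prediction: replicate each of the four left-neighbour
// pixels across its row. Plain integer code; each byte is splatted by
// multiplying with 0x01010101 and stored as one 32-bit word per row.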
function predict_4x4_h_aarch64, export=1
    ldurb       w1, [x0, #0*FDEC_STRIDE-1]
    mov         w5, #0x01010101
    ldrb        w2, [x0, #1*FDEC_STRIDE-1]
    ldrb        w3, [x0, #2*FDEC_STRIDE-1]
    mul         w1, w1, w5
    ldrb        w4, [x0, #3*FDEC_STRIDE-1]
    mul         w2, w2, w5
    str         w1, [x0, #0*FDEC_STRIDE]
    mul         w3, w3, w5
    str         w2, [x0, #1*FDEC_STRIDE]
    mul         w4, w4, w5
    str         w3, [x0, #2*FDEC_STRIDE]
    str         w4, [x0, #3*FDEC_STRIDE]
    ret
endfunc

function predict_4x4_v_aarch64, export=1
    ldur        w1, [x0, #0 - 1 * FDEC_STRIDE]
    str         w1, [x0, #0 + 0 * FDEC_STRIDE]
    str         w1, [x0, #0 + 1 * FDEC_STRIDE]
    str         w1, [x0, #0 + 2 * FDEC_STRIDE]
    str         w1, [x0, #0 + 3 * FDEC_STRIDE]
    ret
endfunc
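
// 4x4 DC prediction: sum the four top and four left neighbours, round with
// (sum + 4) >> 3 and broadcast the result over the block. The _top variant
// uses only the four pixels above, rounded with (sum + 2) >> 2.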
function predict_4x4_dc_neon, export=1
    sub         x1, x0, #FDEC_STRIDE
    ldurb       w4, [x0, #-1 + 0 * FDEC_STRIDE]
    ldrb        w5, [x0, #-1 + 1 * FDEC_STRIDE]
    ldrb        w6, [x0, #-1 + 2 * FDEC_STRIDE]
    ldrb        w7, [x0, #-1 + 3 * FDEC_STRIDE]
    add         w4, w4, w5
    ldr         s0, [x1]
    add         w6, w6, w7
    uaddlv      h0, v0.8b
    add         w4, w4, w6
    dup         v0.4h, v0.h[0]
    dup         v1.4h, w4
    add         v0.4h, v0.4h, v1.4h
    rshrn       v0.8b, v0.8h, #3
    str         s0, [x0]
    str         s0, [x0, #1 * FDEC_STRIDE]
    str         s0, [x0, #2 * FDEC_STRIDE]
    str         s0, [x0, #3 * FDEC_STRIDE]
    ret
endfunc

function predict_4x4_dc_top_neon, export=1
    sub         x1, x0, #FDEC_STRIDE
    ldr         s0, [x1]
    uaddlv      h0, v0.8b
    dup         v0.4h, v0.h[0]
    rshrn       v0.8b, v0.8h, #2
    str         s0, [x0]
    str         s0, [x0, #1 * FDEC_STRIDE]
    str         s0, [x0, #2 * FDEC_STRIDE]
    str         s0, [x0, #3 * FDEC_STRIDE]
    ret
endfunc
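
// 4x4 diagonal modes. ddr (down-right) gathers the top-left, top and left
// neighbours into one vector and applies the (a + 2b + c + 2) >> 2 smoothing
// filter; ddl (down-left) does the same using only the row above.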
function predict_4x4_ddr_neon, export=1
    sub         x1, x0, #FDEC_STRIDE+1
    mov         x7, #FDEC_STRIDE
    ld1         {v0.8b}, [x1], x7       // # -FDEC_STRIDE-1
    ld1r        {v1.8b}, [x1], x7       // #0*FDEC_STRIDE-1
    ld1r        {v2.8b}, [x1], x7       // #1*FDEC_STRIDE-1
    ext         v0.8b, v1.8b, v0.8b, #7
    ld1r        {v3.8b}, [x1], x7       // #2*FDEC_STRIDE-1
    ext         v0.8b, v2.8b, v0.8b, #7 // a
    ld1r        {v4.8b}, [x1], x7       // #3*FDEC_STRIDE-1
    ext         v1.8b, v3.8b, v0.8b, #7 // b
    ext         v2.8b, v4.8b, v1.8b, #7 // c
    uaddl       v0.8h, v0.8b, v1.8b
    uaddl       v1.8h, v1.8b, v2.8b
    add         v0.8h, v0.8h, v1.8h
    rshrn       v0.8b, v0.8h, #2

    ext         v3.8b, v0.8b, v0.8b, #3
    ext         v2.8b, v0.8b, v0.8b, #2
    ext         v1.8b, v0.8b, v0.8b, #1

    str         s3, [x0], #FDEC_STRIDE
    str         s2, [x0], #FDEC_STRIDE
    str         s1, [x0], #FDEC_STRIDE
    str         s0, [x0]
    ret
endfunc

function predict_4x4_ddl_neon, export=1
    sub         x0, x0, #FDEC_STRIDE
    mov         x7, #FDEC_STRIDE
    ld1         {v0.8b}, [x0], x7
    dup         v3.8b, v0.b[7]
    ext         v1.8b, v0.8b, v0.8b, #1
    ext         v2.8b, v0.8b, v3.8b, #2
    uhadd       v0.8b, v0.8b, v2.8b
    urhadd      v0.8b, v0.8b, v1.8b
    str         s0, [x0], #FDEC_STRIDE
    ext         v1.8b, v0.8b, v0.8b, #1
    ext         v2.8b, v0.8b, v0.8b, #2
    str         s1, [x0], #FDEC_STRIDE
    ext         v3.8b, v0.8b, v0.8b, #3
    str         s2, [x0], #FDEC_STRIDE
    str         s3, [x0]
    ret
endfunc
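
// 8x8 luma modes. These take the packed neighbour buffer prepared by the
// caller (predict_8x8_filter) in x1: as used here, bytes below offset 16
// hold the left column and top-left sample, bytes from offset 16 on hold
// the top (and top-right) row.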
function predict_8x8_dc_neon, export=1
    mov         x7, #FDEC_STRIDE
    ld1         {v0.16b}, [x1], #16
    ld1         {v1.8b}, [x1]
    ext         v0.16b, v0.16b, v0.16b, #7
    uaddlv      h1, v1.8b
    uaddlv      h0, v0.8b
    add         v0.8h, v0.8h, v1.8h
    dup         v0.8h, v0.h[0]
    rshrn       v0.8b, v0.8h, #4
.rept 8
    st1         {v0.8b}, [x0], x7
.endr
    ret
endfunc

function predict_8x8_h_neon, export=1
    mov         x7, #FDEC_STRIDE
    ld1         {v16.16b}, [x1]
    dup         v0.8b, v16.b[14]
    dup         v1.8b, v16.b[13]
    st1         {v0.8b}, [x0], x7
    dup         v2.8b, v16.b[12]
    st1         {v1.8b}, [x0], x7
    dup         v3.8b, v16.b[11]
    st1         {v2.8b}, [x0], x7
    dup         v4.8b, v16.b[10]
    st1         {v3.8b}, [x0], x7
    dup         v5.8b, v16.b[9]
    st1         {v4.8b}, [x0], x7
    dup         v6.8b, v16.b[8]
    st1         {v5.8b}, [x0], x7
    dup         v7.8b, v16.b[7]
    st1         {v6.8b}, [x0], x7
    st1         {v7.8b}, [x0], x7
    ret
endfunc

function predict_8x8_v_neon, export=1
    add         x1, x1, #16
    mov         x7, #FDEC_STRIDE
    ld1         {v0.8b}, [x1]
.rept 8
    st1         {v0.8b}, [x0], x7
.endr
    ret
endfunc

function predict_8x8_ddl_neon, export=1
    add         x1, x1, #16
    mov         x7, #FDEC_STRIDE
    ld1         {v0.16b}, [x1]
    movi        v3.16b, #0
    dup         v2.16b, v0.b[15]
    ext         v4.16b, v3.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1
    uhadd       v4.16b, v4.16b, v2.16b
    urhadd      v0.16b, v0.16b, v4.16b
    ext         v1.16b, v0.16b, v0.16b, #1
    ext         v2.16b, v0.16b, v0.16b, #2
    st1         {v1.8b}, [x0], x7
    ext         v3.16b, v0.16b, v0.16b, #3
    st1         {v2.8b}, [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #4
    st1         {v3.8b}, [x0], x7
    ext         v5.16b, v0.16b, v0.16b, #5
    st1         {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #6
    st1         {v5.8b}, [x0], x7
    ext         v7.16b, v0.16b, v0.16b, #7
    st1         {v6.8b}, [x0], x7
    ext         v0.16b, v0.16b, v0.16b, #8
    st1         {v7.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ret
endfunc

function predict_8x8_ddr_neon, export=1
    ld1         {v0.16b,v1.16b}, [x1]
    ext         v2.16b, v0.16b, v1.16b, #7
    ext         v4.16b, v0.16b, v1.16b, #9
    ext         v3.16b, v0.16b, v1.16b, #8
    uhadd       v2.16b, v2.16b, v4.16b
    urhadd      v7.16b, v3.16b, v2.16b
    add         x0, x0, #7*FDEC_STRIDE
    mov         x7, #-1*FDEC_STRIDE
    ext         v6.16b, v7.16b, v7.16b, #1
    st1         {v7.8b}, [x0], x7
    ext         v5.16b, v7.16b, v7.16b, #2
    st1         {v6.8b}, [x0], x7
    ext         v4.16b, v7.16b, v7.16b, #3
    st1         {v5.8b}, [x0], x7
    ext         v3.16b, v7.16b, v7.16b, #4
    st1         {v4.8b}, [x0], x7
    ext         v2.16b, v7.16b, v7.16b, #5
    st1         {v3.8b}, [x0], x7
    ext         v1.16b, v7.16b, v7.16b, #6
    st1         {v2.8b}, [x0], x7
    ext         v0.16b, v7.16b, v7.16b, #7
    st1         {v1.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ret
endfunc

function predict_8x8_vl_neon, export=1
    add         x1, x1, #16
    mov         x7, #FDEC_STRIDE
    ld1         {v0.16b}, [x1]
    ext         v1.16b, v1.16b, v0.16b, #15
    ext         v2.16b, v0.16b, v2.16b, #1
    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v3.16b, v0.16b, v2.16b
    urhadd      v0.16b, v0.16b, v1.16b
    ext         v4.16b, v0.16b, v0.16b, #1
    st1         {v3.8b}, [x0], x7
    ext         v5.16b, v3.16b, v3.16b, #1
    st1         {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #2
    st1         {v5.8b}, [x0], x7
    ext         v7.16b, v3.16b, v3.16b, #2
    st1         {v6.8b}, [x0], x7
    ext         v4.16b, v0.16b, v0.16b, #3
    st1         {v7.8b}, [x0], x7
    ext         v5.16b, v3.16b, v3.16b, #3
    st1         {v4.8b}, [x0], x7
    ext         v6.16b, v0.16b, v0.16b, #4
    st1         {v5.8b}, [x0], x7
    st1         {v6.8b}, [x0], x7
    ret
endfunc

function predict_8x8_vr_neon, export=1
    add         x1, x1, #8
    mov         x7, #FDEC_STRIDE
    ld1         {v2.16b}, [x1]
    ext         v1.16b, v2.16b, v2.16b, #14
    ext         v0.16b, v2.16b, v2.16b, #15
    uhadd       v3.16b, v2.16b, v1.16b
    urhadd      v2.16b, v2.16b, v0.16b
    urhadd      v0.16b, v0.16b, v3.16b
    ext         v1.16b, v2.16b, v2.16b, #8
    uzp1        v2.8b, v0.8b, v0.8b
    uzp2        v3.8b, v0.8b, v0.8b
    ext         v0.16b, v0.16b, v0.16b, #8
    st1         {v1.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ext         v4.8b, v3.8b, v1.8b, #7
    ext         v5.8b, v2.8b, v0.8b, #7
    st1         {v4.8b}, [x0], x7
    st1         {v5.8b}, [x0], x7
    ext         v6.8b, v3.8b, v1.8b, #6
    ext         v7.8b, v2.8b, v0.8b, #6
    st1         {v6.8b}, [x0], x7
    st1         {v7.8b}, [x0], x7
    ext         v1.8b, v3.8b, v1.8b, #5
    ext         v0.8b, v2.8b, v0.8b, #5
    st1         {v1.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    ret
endfunc

function predict_8x8_hd_neon, export=1
    add         x1, x1, #7
    mov         x7, #FDEC_STRIDE
    ld1         {v1.16b}, [x1]
    ext         v3.16b, v1.16b, v1.16b, #1
    ext         v2.16b, v1.16b, v1.16b, #2
    urhadd      v4.16b, v1.16b, v3.16b
    uhadd       v1.16b, v1.16b, v2.16b
    urhadd      v0.16b, v1.16b, v3.16b
    zip1        v16.8b, v4.8b, v0.8b
    zip2        v17.8b, v4.8b, v0.8b
    ext         v7.16b, v0.16b, v0.16b, #8
    ext         v0.8b, v17.8b, v7.8b, #6
    ext         v1.8b, v17.8b, v7.8b, #4
    st1         {v0.8b}, [x0], x7
    ext         v2.8b, v17.8b, v7.8b, #2
    st1         {v1.8b}, [x0], x7
    st1         {v2.8b}, [x0], x7
    ext         v3.8b, v16.8b, v17.8b, #6
    st1         {v17.8b}, [x0], x7
    ext         v4.8b, v16.8b, v17.8b, #4
    st1         {v3.8b}, [x0], x7
    ext         v5.8b, v16.8b, v17.8b, #2
    st1         {v4.8b}, [x0], x7
    st1         {v5.8b}, [x0], x7
    st1         {v16.8b}, [x0], x7
    ret
endfunc

function predict_8x8_hu_neon, export=1
    add         x1, x1, #7
    mov         x7, #FDEC_STRIDE
    ld1         {v7.8b}, [x1]
    dup         v6.8b, v7.b[0]
    rev64       v7.8b, v7.8b
    ext         v4.8b, v7.8b, v6.8b, #2
    ext         v2.8b, v7.8b, v6.8b, #1
    uhadd       v5.8b, v7.8b, v4.8b
    urhadd      v0.8b, v2.8b, v7.8b
    urhadd      v1.8b, v5.8b, v2.8b
    zip1        v16.8b, v0.8b, v1.8b
    zip2        v17.8b, v0.8b, v1.8b
    dup         v18.4h, v17.h[3]
    ext         v0.8b, v16.8b, v17.8b, #2
    ext         v1.8b, v16.8b, v17.8b, #4
    ext         v2.8b, v16.8b, v17.8b, #6
    st1         {v16.8b}, [x0], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x0], x7
    st1         {v2.8b}, [x0], x7
    ext         v4.8b, v17.8b, v18.8b, #2
    ext         v5.8b, v17.8b, v18.8b, #4
    ext         v6.8b, v17.8b, v18.8b, #6
    st1         {v17.8b}, [x0], x7
    st1         {v4.8b}, [x0], x7
    st1         {v5.8b}, [x0], x7
    st1         {v6.8b}, [x0]
    ret
endfunc
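
// 8x8 chroma DC modes: each 4x4 quadrant gets its own DC value. The _top and
// _left variants average only the available edge; the full version combines
// the top and left sums per quadrant. All variants join at pred8x8c_dc_end,
// which stores the top half from v0 and the bottom half from v1.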
function predict_8x8c_dc_top_neon, export=1
    sub         x2, x0, #FDEC_STRIDE
    mov         x1, #FDEC_STRIDE
    ld1         {v0.8b}, [x2]
    uaddlp      v0.4h, v0.8b
    addp        v0.4h, v0.4h, v0.4h
    rshrn       v0.8b, v0.8h, #2
    dup         v3.8b, v0.b[1]
    dup         v2.8b, v0.b[0]
    transpose   v0.2s, v1.2s, v2.2s, v3.2s
    b           pred8x8c_dc_end
endfunc

function predict_8x8c_dc_left_neon, export=1
    ldurb       w2, [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w3, [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w4, [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w5, [x0, #3 * FDEC_STRIDE - 1]
    mov         x1, #FDEC_STRIDE
    add         w2, w2, w3
    add         w3, w4, w5
    ldrb        w6, [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w7, [x0, #5 * FDEC_STRIDE - 1]
    ldrb        w8, [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w9, [x0, #7 * FDEC_STRIDE - 1]
    add         w6, w6, w7
    add         w7, w8, w9
    add         w2, w2, w3
    add         w6, w6, w7
    dup         v0.8h, w2
    dup         v1.8h, w6
    rshrn       v0.8b, v0.8h, #2
    rshrn       v1.8b, v1.8h, #2
    b           pred8x8c_dc_end
endfunc

function predict_8x8c_dc_neon, export=1
    mov         x1, #FDEC_STRIDE
    sub         x2, x0, #FDEC_STRIDE
    ldurb       w10, [x0, #0 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
    add         w10, w10, w11
    ldrb        w4, [x0, #4 * FDEC_STRIDE - 1]
    ldrb        w5, [x0, #5 * FDEC_STRIDE - 1]
    add         w12, w12, w13
    ldrb        w6, [x0, #6 * FDEC_STRIDE - 1]
    ldrb        w7, [x0, #7 * FDEC_STRIDE - 1]
    add         w4, w4, w5
    add         w6, w6, w7
    add         w10, w10, w12, lsl #16
    add         w4, w4, w6, lsl #16
    ld1         {v0.8b}, [x2]
    add         x10, x10, x4, lsl #32
    uaddlp      v0.4h, v0.8b            // s0, s1
    mov         v1.d[0], x10            // s2, s3
    add         v3.4h, v0.4h, v1.4h
    addp        v0.4h, v0.4h, v1.4h     // s0, s1, s2, s3
    addp        v1.4h, v3.4h, v3.4h     // s0+s2, s1+s3, s0+s2, s1+s3
    uzp2        v0.4h, v0.4h, v0.4h     // s1, s3, s1, s3
    uzp1        v1.2d, v1.2d, v1.2d
    uzp1        v0.2d, v0.2d, v0.2d
    rshrn       v3.8b, v1.8h, #3
    rshrn       v2.8b, v0.8h, #2
    uzp1        v0.8b, v3.8b, v2.8b
    uzp2        v1.8b, v2.8b, v3.8b
pred8x8c_dc_end:
    add         x2, x0, #2 * FDEC_STRIDE
    add         x4, x0, #4 * FDEC_STRIDE
    add         x5, x0, #6 * FDEC_STRIDE
    st1         {v0.8b}, [x0], x1
    st1         {v0.8b}, [x2], x1
    st1         {v0.8b}, [x0]
    st1         {v0.8b}, [x2]
    st1         {v1.8b}, [x4], x1
    st1         {v1.8b}, [x5], x1
    st1         {v1.8b}, [x4]
    st1         {v1.8b}, [x5]
    ret
endfunc

function predict_8x8c_h_neon, export=1
    sub         x1, x0, #1
    mov         x7, #FDEC_STRIDE
.rept 4
    ld1r        {v0.8b}, [x1], x7
    ld1r        {v1.8b}, [x1], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x0], x7
.endr
    ret
endfunc

function predict_8x8c_v_aarch64, export=1
    ldur        x1, [x0, #-FDEC_STRIDE]
.irp c, 0,1,2,3,4,5,6,7
    str         x1, [x0, #\c * FDEC_STRIDE]
.endr
    ret
endfunc
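
// 8x8 chroma plane prediction: the top row and left column are weighted with
// p8weight/p16weight to form the H and V gradients, from which the row seed
// and the per-column (b) and per-row (c) increments are derived; the loop
// then emits one row per iteration.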
function predict_8x8c_p_neon, export=1
    sub         x3, x0, #FDEC_STRIDE
    mov         x1, #FDEC_STRIDE
    add         x2, x3, #4
    sub         x3, x3, #1
    ld1         {v0.s}[0], [x3]
    ld1         {v2.s}[0], [x2], x1
    ldcol.8     v0, x3, x1, 4, hi=1
    add         x3, x3, x1
    ldcol.8     v3, x3, x1, 4
    movrel      x4, p8weight
    movrel      x5, p16weight
    uaddl       v4.8h, v2.8b, v3.8b
    rev32       v0.8b, v0.8b
    trn1        v2.2s, v2.2s, v3.2s
    ld1         {v7.8h}, [x4]
    usubl       v2.8h, v2.8b, v0.8b
    mul         v2.8h, v2.8h, v7.8h
    ld1         {v0.8h}, [x5]
    saddlp      v2.4s, v2.8h
    addp        v2.4s, v2.4s, v2.4s
    shl         v3.2s, v2.2s, #4
    add         v2.2s, v2.2s, v3.2s
    rshrn       v5.4h, v2.4s, #5        // b, c, x, x
    addp        v2.4h, v5.4h, v5.4h
    shl         v3.4h, v2.4h, #2
    sub         v3.4h, v3.4h, v2.4h     // 3 * (b + c)
    rev64       v4.4h, v4.4h
    add         v4.4h, v4.4h, v0.4h
    shl         v2.4h, v4.4h, #4        // a
    sub         v2.4h, v2.4h, v3.4h     // a - 3 * (b + c) + 16
    ext         v0.16b, v0.16b, v0.16b, #14
    sub         v6.4h, v5.4h, v3.4h
    mov         v0.h[0], wzr
    mul         v0.8h, v0.8h, v5.h[0]   // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h, v2.h[0]          // pix
    dup         v2.8h, v5.h[1]          // c
    add         v1.8h, v1.8h, v0.8h     // pix + x*b
    mov         x3, #8
1:
    subs        x3, x3, #1
    sqshrun     v0.8b, v1.8h, #5
    add         v1.8h, v1.8h, v2.8h
    st1         {v0.8b}, [x0], x1
    b.ne        1b
    ret
endfunc
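
// Sum four consecutive left-neighbour pixels starting at row \idx into \wd
// (\t1-\t3 are scratch). Row 0 uses ldurb because its offset (-1) needs the
// unscaled addressing form.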
.macro loadsum4 wd, t1, t2, t3, x, idx
.if \idx == 0
    ldurb       \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.else
    ldrb        \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
.endif
    ldrb        \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
    ldrb        \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
    ldrb        \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
    add         \wd, \wd, \t1
    add         \t1, \t2, \t3
    add         \wd, \wd, \t1
.endm

function predict_8x16c_h_neon, export=1
    sub         x2, x0, #1
    add         x3, x0, #FDEC_STRIDE - 1
    mov         x7, #2 * FDEC_STRIDE
    add         x1, x0, #FDEC_STRIDE
.rept 4
    ld1r        {v0.8b}, [x2], x7
    ld1r        {v1.8b}, [x3], x7
    ld1r        {v2.8b}, [x2], x7
    ld1r        {v3.8b}, [x3], x7
    st1         {v0.8b}, [x0], x7
    st1         {v1.8b}, [x1], x7
    st1         {v2.8b}, [x0], x7
    st1         {v3.8b}, [x1], x7
.endr
    ret
endfunc

function predict_8x16c_v_neon, export=1
    sub         x1, x0, #FDEC_STRIDE
    mov         x2, #2 * FDEC_STRIDE
    ld1         {v0.8b}, [x1], x2
.rept 8
    st1         {v0.8b}, [x0], x2
    st1         {v0.8b}, [x1], x2
.endr
    ret
endfunc
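
// 8x16 chroma plane prediction: same scheme as the 8x8 version, but the H
// (width 8) and V (height 16) gradients use different scale factors for b
// and c, and the loop produces two rows per iteration.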
function predict_8x16c_p_neon, export=1
    movrel      x4, p16weight
    ld1         {v17.8h}, [x4]
    sub         x3, x0, #FDEC_STRIDE
    mov         x1, #FDEC_STRIDE
    add         x2, x3, #4
    sub         x3, x3, #1
    ld1         {v0.8b}, [x3]
    ld1         {v2.8b}, [x2], x1
    ldcol.8     v1, x3, x1
    add         x3, x3, x1
    ldcol.8     v3, x3, x1
    ext         v4.8b, v2.8b, v2.8b, #3
    ext         v5.8b, v3.8b, v3.8b, #7
    rev32       v0.8b, v0.8b
    rev64       v1.8b, v1.8b
    uaddl       v4.8h, v5.8b, v4.8b     // a * 1/16
    usubl       v2.8h, v2.8b, v0.8b
    mul         v2.8h, v2.8h, v17.8h
    saddlp      v2.4s, v2.8h
    addp        v2.4s, v2.4s, v2.4s     // H
    usubl       v3.8h, v3.8b, v1.8b
    mul         v3.8h, v3.8h, v17.8h
    saddlp      v3.4s, v3.8h
    addp        v3.4s, v3.4s, v3.4s
    addp        v3.4s, v3.4s, v3.4s     // V
    ext         v17.16b, v17.16b, v17.16b, #14
    shl         v4.4h, v4.4h, #4        // a
    shl         v6.2s, v2.2s, #4        // 16 * H
    shl         v7.2s, v3.2s, #2        // 4 * V
    add         v2.2s, v2.2s, v6.2s     // 17 * H
    add         v3.2s, v3.2s, v7.2s     // 5 * V
    rshrn       v2.4h, v2.4s, #5        // b
    rshrn       v3.4h, v3.4s, #6        // c
    mov         v17.h[0], wzr
    sub         v4.4h, v4.4h, v2.4h     // a - b
    shl         v6.4h, v2.4h, #1        // 2 * b
    add         v4.4h, v4.4h, v3.4h     // a - b + c
    shl         v7.4h, v3.4h, #3        // 8 * c
    sub         v4.4h, v4.4h, v6.4h     // a - 3b + c
    sub         v4.4h, v4.4h, v7.4h     // a - 3b - 7c
    mul         v0.8h, v17.8h, v2.h[0]  // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h, v4.h[0]          // i00
    dup         v2.8h, v3.h[0]          // c
    add         v1.8h, v1.8h, v0.8h     // pix + {0..7}*b
    mov         x3, #16
1:
    subs        x3, x3, #2
    sqrshrun    v4.8b, v1.8h, #5
    add         v1.8h, v1.8h, v2.8h
    sqrshrun    v5.8b, v1.8h, #5
    st1         {v4.8b}, [x0], x1
    add         v1.8h, v1.8h, v2.8h
    st1         {v5.8b}, [x0], x1
    b.ne        1b
    ret
endfunc

function predict_8x16c_dc_neon, export=1
    mov         x1, #FDEC_STRIDE
    sub         x10, x0, #FDEC_STRIDE
    loadsum4    w2, w3, w4, w5, x0, 0
    ld1         {v6.8b}, [x10]
    loadsum4    w6, w7, w8, w9, x0, 4
    uaddlp      v6.4h, v6.8b
    dup         v22.8h, w2              // s2
    dup         v23.8h, w6              // s3
    loadsum4    w2, w3, w4, w5, x0, 8
    addp        v6.4h, v6.4h, v6.4h     // s0, s1
    loadsum4    w6, w7, w8, w9, x0, 12
    dup         v20.8h, v6.h[0]         // s0
    dup         v21.8h, v6.h[1]         // s1
    dup         v24.8h, w2              // s4
    dup         v25.8h, w6              // s5
    ext         v16.16b, v20.16b, v21.16b, #8
    ext         v17.16b, v22.16b, v21.16b, #8
    ext         v1.16b, v23.16b, v21.16b, #8
    ext         v2.16b, v24.16b, v21.16b, #8
    ext         v3.16b, v25.16b, v21.16b, #8
    add         v0.8h, v16.8h, v17.8h
    add         v1.8h, v1.8h, v23.8h
    add         v2.8h, v2.8h, v24.8h
    add         v3.8h, v3.8h, v25.8h
    rshrn       v0.8b, v0.8h, #3
    rshrn       v1.8b, v1.8h, #3
    rshrn       v2.8b, v2.8h, #3
    rshrn       v3.8b, v3.8h, #3
    add         x11, x0, #4 * FDEC_STRIDE
    add         x12, x0, #8 * FDEC_STRIDE
    add         x13, x0, #12 * FDEC_STRIDE
.rept 4
    st1         {v0.8b}, [x0], x1
    st1         {v1.8b}, [x11], x1
    st1         {v2.8b}, [x12], x1
    st1         {v3.8b}, [x13], x1
.endr
    ret
endfunc

function predict_8x16c_dc_left_neon, export=1
    mov         x1, #FDEC_STRIDE
    ldurb       w2, [x0, # 0 * FDEC_STRIDE - 1]
    ldrb        w3, [x0, # 1 * FDEC_STRIDE - 1]
    ldrb        w4, [x0, # 2 * FDEC_STRIDE - 1]
    ldrb        w5, [x0, # 3 * FDEC_STRIDE - 1]
    add         w2, w2, w3
    ldrb        w6, [x0, # 4 * FDEC_STRIDE - 1]
    add         w4, w4, w5
    ldrb        w7, [x0, # 5 * FDEC_STRIDE - 1]
    add         w2, w2, w4
    ldrb        w8, [x0, # 6 * FDEC_STRIDE - 1]
    ldrb        w9, [x0, # 7 * FDEC_STRIDE - 1]
    dup         v0.8h, w2
    add         w6, w6, w7
    rshrn       v0.8b, v0.8h, #2
    add         w8, w8, w9
    ldrb        w10, [x0, # 8 * FDEC_STRIDE - 1]
    ldrb        w11, [x0, # 9 * FDEC_STRIDE - 1]
    add         w6, w6, w8
    ldrb        w12, [x0, #10 * FDEC_STRIDE - 1]
    ldrb        w13, [x0, #11 * FDEC_STRIDE - 1]
    dup         v1.8h, w6
    add         w10, w10, w11
    rshrn       v1.8b, v1.8h, #2
    add         w12, w12, w13
    ldrb        w2, [x0, #12 * FDEC_STRIDE - 1]
    ldrb        w3, [x0, #13 * FDEC_STRIDE - 1]
    add         w10, w10, w12
    ldrb        w4, [x0, #14 * FDEC_STRIDE - 1]
    ldrb        w5, [x0, #15 * FDEC_STRIDE - 1]
    dup         v2.8h, w10
    add         w2, w2, w3
    rshrn       v2.8b, v2.8h, #2
    add         w4, w4, w5
    st1         {v0.8b}, [x0], x1
    st1         {v0.8b}, [x0], x1
    add         w2, w2, w4
    st1         {v0.8b}, [x0], x1
    dup         v3.8h, w2
    st1         {v0.8b}, [x0], x1
    rshrn       v3.8b, v3.8h, #2
.irp idx, 1, 2, 3
.rept 4
    st1         {v\idx\().8b}, [x0], x1
.endr
.endr
    ret
endfunc

function predict_8x16c_dc_top_neon, export=1
    sub         x2, x0, #FDEC_STRIDE
    mov         x1, #FDEC_STRIDE
    ld1         {v0.8b}, [x2]
    uaddlp      v0.4h, v0.8b
    addp        v0.4h, v0.4h, v0.4h
    rshrn       v4.8b, v0.8h, #2
    dup         v0.8b, v4.b[0]
    dup         v1.8b, v4.b[1]
    ext         v0.8b, v0.8b, v1.8b, #4
.rept 16
    st1         {v0.8b}, [x0], x1
.endr
    ret
endfunc
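
// 16x16 luma DC modes: sum the 16 top and/or 16 left neighbours with uaddlv,
// round ((sum + 16) >> 5 for the full version, (sum + 8) >> 4 for _top and
// _left) and broadcast the byte over the whole block at pred16x16_dc_end.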
function predict_16x16_dc_top_neon, export=1
    sub         x2, x0, #FDEC_STRIDE
    mov         x1, #FDEC_STRIDE
    ld1         {v0.16b}, [x2]
    uaddlv      h0, v0.16b
    rshrn       v0.8b, v0.8h, #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc

function predict_16x16_dc_left_neon, export=1
    sub         x2, x0, #1
    mov         x1, #FDEC_STRIDE
    ldcol.16    v0, x2, x1
    uaddlv      h0, v0.16b
    rshrn       v0.8b, v0.8h, #4
    dup         v0.16b, v0.b[0]
    b           pred16x16_dc_end
endfunc

function predict_16x16_dc_neon, export=1
    sub         x3, x0, #FDEC_STRIDE
    sub         x2, x0, #1
    mov         x1, #FDEC_STRIDE
    ld1         {v0.16b}, [x3]
    ldcol.16    v1, x2, x1
    uaddlv      h0, v0.16b
    uaddlv      h1, v1.16b
    add         v0.4h, v0.4h, v1.4h
    rshrn       v0.8b, v0.8h, #5
    dup         v0.16b, v0.b[0]
pred16x16_dc_end:
.rept 16
    st1         {v0.16b}, [x0], x1
.endr
    ret
endfunc

function predict_16x16_h_neon, export=1
    sub         x1, x0, #1
    mov         x7, #FDEC_STRIDE
.rept 8
    ld1r        {v0.16b}, [x1], x7
    ld1r        {v1.16b}, [x1], x7
    st1         {v0.16b}, [x0], x7
    st1         {v1.16b}, [x0], x7
.endr
    ret
endfunc

function predict_16x16_v_neon, export=1
    sub         x0, x0, #FDEC_STRIDE
    mov         x7, #FDEC_STRIDE
    ld1         {v0.16b}, [x0], x7
.rept 16
    st1         {v0.16b}, [x0], x7
.endr
    ret
endfunc
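
// 16x16 luma plane prediction: H and V are built from the mirrored top row
// and left column weighted by p16weight, b and c are rounded (5*H + 32) >> 6
// and (5*V + 32) >> 6, and each of the 16 rows adds c to the running row
// vector before narrowing with saturation.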
function predict_16x16_p_neon, export=1
    sub         x3, x0, #FDEC_STRIDE
    mov         x1, #FDEC_STRIDE
    add         x2, x3, #8
    sub         x3, x3, #1
    ld1         {v0.8b}, [x3]
    ld1         {v2.8b}, [x2], x1
    ldcol.8     v1, x3, x1
    add         x3, x3, x1
    ldcol.8     v3, x3, x1
    rev64       v0.8b, v0.8b
    rev64       v1.8b, v1.8b
    movrel      x4, p16weight
    uaddl       v4.8h, v2.8b, v3.8b
    ld1         {v7.8h}, [x4]
    usubl       v2.8h, v2.8b, v0.8b
    usubl       v3.8h, v3.8b, v1.8b
    mul         v2.8h, v2.8h, v7.8h
    mul         v3.8h, v3.8h, v7.8h
    saddlp      v2.4s, v2.8h
    saddlp      v3.4s, v3.8h
    addp        v2.4s, v2.4s, v3.4s
    addp        v2.4s, v2.4s, v2.4s
    shl         v3.2s, v2.2s, #2
    add         v2.2s, v2.2s, v3.2s
    rshrn       v5.4h, v2.4s, #6        // b, c, x, x
    addp        v2.4h, v5.4h, v5.4h
    shl         v3.4h, v2.4h, #3
    sub         v3.4h, v3.4h, v2.4h     // 7 * (b + c)
    ext         v4.16b, v4.16b, v4.16b, #14
    add         v4.4h, v4.4h, v7.4h
    shl         v2.4h, v4.4h, #4        // a
    sub         v2.4h, v2.4h, v3.4h     // a - 7 * (b + c) + 16
    ext         v7.16b, v7.16b, v7.16b, #14
    mov         v7.h[0], wzr
    dup         v3.8h, v5.h[0]
    mul         v0.8h, v7.8h, v5.h[0]   // 0,1,2,3,4,5,6,7 * b
    dup         v1.8h, v2.h[0]          // pix
    dup         v2.8h, v5.h[1]          // c
    shl         v3.8h, v3.8h, #3
    add         v1.8h, v1.8h, v0.8h     // pix + x*b
    add         v3.8h, v3.8h, v1.8h     // pix + x{8-15}*b
    mov         x3, #16
1:
    subs        x3, x3, #1
    sqshrun     v0.8b, v1.8h, #5
    add         v1.8h, v1.8h, v2.8h
    sqshrun2    v0.16b, v3.8h, #5
    add         v3.8h, v3.8h, v2.8h
    st1         {v0.16b}, [x0], x1
    b.ne        1b
    ret
endfunc