  1. /*****************************************************************************
  2. * mc.S: aarch64 motion compensation
  3. *****************************************************************************
  4. * Copyright (C) 2009-2018 x264 project
  5. *
  6. * Authors: David Conrad <lessen42@gmail.com>
  7. * Janne Grunau <janne-x264@jannau.net>
  8. * Mans Rullgard <mans@mansr.com>
  9. * Stefan Groenroos <stefan.gronroos@gmail.com>
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation; either version 2 of the License, or
  14. * (at your option) any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  24. *
  25. * This program is also available under a commercial proprietary license.
  26. * For more information, contact us at licensing@x264.com.
  27. *****************************************************************************/
  28. #include "asm.S"
  29. // note: prefetch stuff assumes 64-byte cacheline
  30. // void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
  31. function prefetch_ref_aarch64, export=1
  32. cmp w2, #1
  33. csel x2, xzr, x1, eq
  34. add x0, x0, #64
  35. add x0, x0, x2, lsl #3
  36. lsl x2, x1, #1
  37. add x3, x1, x1, lsl #1
  38. add x4, x0, x1, lsl #2
  39. prfm pldl1strm, [x0]
  40. prfm pldl1strm, [x0, x1]
  41. prfm pldl1strm, [x0, x2]
  42. prfm pldl1strm, [x0, x3]
  43. prfm pldl1strm, [x4]
  44. prfm pldl1strm, [x4, x1]
  45. prfm pldl1strm, [x4, x2]
  46. prfm pldl1strm, [x4, x3]
  47. ret
  48. endfunc
  49. // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
  50. // uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
  51. .macro prefetch_fenc sub
  52. function prefetch_fenc_\sub\()_aarch64, export=1
  53. and w6, w5, #3
  54. and w7, w5, #3
  55. mul x6, x6, x1
  56. mul x7, x7, x3
  57. add x0, x0, #64
  58. add x2, x2, #64
  59. add x0, x0, x6, lsl #2
  60. add x6, x0, x1, lsl #1
  61. prfm pldl1strm, [x0]
  62. prfm pldl1strm, [x0, x1]
  63. prfm pldl1strm, [x6]
  64. prfm pldl1strm, [x6, x1]
  65. add x2, x2, x7, lsl #1
  66. prfm pldl1strm, [x2]
  67. prfm pldl1strm, [x2, x3]
  68. .ifc \sub, 422
  69. add x7, x2, x3, lsl #1
  70. prfm pldl1strm, [x7]
  71. prfm pldl1strm, [x7, x3]
  72. .endif
  73. ret
  74. endfunc
  75. .endm
  76. prefetch_fenc 420
  77. prefetch_fenc 422
  78. // void pixel_avg( uint8_t *dst, intptr_t dst_stride,
  79. // uint8_t *src1, intptr_t src1_stride,
  80. // uint8_t *src2, intptr_t src2_stride, int weight );
  81. .macro AVGH w h
  82. function pixel_avg_\w\()x\h\()_neon, export=1
  83. mov w10, #64
  84. cmp w6, #32
  85. mov w9, #\h
  86. b.eq pixel_avg_w\w\()_neon
  87. subs w7, w10, w6
  88. b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
  89. cmp w6, #0
  90. b.ge pixel_avg_weight_w\w\()_add_add_neon
  91. b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
  92. endfunc
  93. .endm
  94. AVGH 4, 2
  95. AVGH 4, 4
  96. AVGH 4, 8
  97. AVGH 4, 16
  98. AVGH 8, 4
  99. AVGH 8, 8
  100. AVGH 8, 16
  101. AVGH 16, 8
  102. AVGH 16, 16
  103. // 0 < weight < 64
  104. .macro load_weights_add_add
  105. mov w6, w6
  106. .endm
  107. .macro weight_add_add dst, s1, s2, h=
  108. .ifc \h, 2
  109. umull2 \dst, \s1, v30.16b
  110. umlal2 \dst, \s2, v31.16b
  111. .else
  112. umull \dst, \s1, v30.8b
  113. umlal \dst, \s2, v31.8b
  114. .endif
  115. .endm
  116. // weight > 64
  117. .macro load_weights_add_sub
  118. neg w7, w7
  119. .endm
  120. .macro weight_add_sub dst, s1, s2, h=
  121. .ifc \h, 2
  122. umull2 \dst, \s1, v30.16b
  123. umlsl2 \dst, \s2, v31.16b
  124. .else
  125. umull \dst, \s1, v30.8b
  126. umlsl \dst, \s2, v31.8b
  127. .endif
  128. .endm
  129. // weight < 0
  130. .macro load_weights_sub_add
  131. neg w6, w6
  132. .endm
  133. .macro weight_sub_add dst, s1, s2, h=
  134. .ifc \h, 2
  135. umull2 \dst, \s2, v31.16b
  136. umlsl2 \dst, \s1, v30.16b
  137. .else
  138. umull \dst, \s2, v31.8b
  139. umlsl \dst, \s1, v30.8b
  140. .endif
  141. .endm
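// pixel_avg_weight_wN: dst = ( src1*w1 + src2*w2 + 32 ) >> 6, with the two
// weights summing to 64; the add_add/add_sub/sub_add variants cover the
// sign combinations set up by AVGH and the load_weights_* macros above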
  142. .macro AVG_WEIGHT ext
  143. function pixel_avg_weight_w4_\ext\()_neon
  144. load_weights_\ext
  145. dup v30.8b, w6
  146. dup v31.8b, w7
  147. 1: // height loop
  148. subs w9, w9, #2
  149. ld1 {v0.s}[0], [x2], x3
  150. ld1 {v1.s}[0], [x4], x5
  151. weight_\ext v4.8h, v0.8b, v1.8b
  152. ld1 {v2.s}[0], [x2], x3
  153. ld1 {v3.s}[0], [x4], x5
  154. sqrshrun v0.8b, v4.8h, #6
  155. weight_\ext v5.8h, v2.8b, v3.8b
  156. st1 {v0.s}[0], [x0], x1
  157. sqrshrun v1.8b, v5.8h, #6
  158. st1 {v1.s}[0], [x0], x1
  159. b.gt 1b
  160. ret
  161. endfunc
  162. function pixel_avg_weight_w8_\ext\()_neon
  163. load_weights_\ext
  164. dup v30.8b, w6
  165. dup v31.8b, w7
  166. 1: // height loop
  167. subs w9, w9, #4
  168. ld1 {v0.8b}, [x2], x3
  169. ld1 {v1.8b}, [x4], x5
  170. weight_\ext v16.8h, v0.8b, v1.8b
  171. ld1 {v2.8b}, [x2], x3
  172. ld1 {v3.8b}, [x4], x5
  173. weight_\ext v17.8h, v2.8b, v3.8b
  174. ld1 {v4.8b}, [x2], x3
  175. ld1 {v5.8b}, [x4], x5
  176. weight_\ext v18.8h, v4.8b, v5.8b
  177. ld1 {v6.8b}, [x2], x3
  178. ld1 {v7.8b}, [x4], x5
  179. weight_\ext v19.8h, v6.8b, v7.8b
  180. sqrshrun v0.8b, v16.8h, #6
  181. sqrshrun v1.8b, v17.8h, #6
  182. sqrshrun v2.8b, v18.8h, #6
  183. sqrshrun v3.8b, v19.8h, #6
  184. st1 {v0.8b}, [x0], x1
  185. st1 {v1.8b}, [x0], x1
  186. st1 {v2.8b}, [x0], x1
  187. st1 {v3.8b}, [x0], x1
  188. b.gt 1b
  189. ret
  190. endfunc
  191. function pixel_avg_weight_w16_\ext\()_neon
  192. load_weights_\ext
  193. dup v30.16b, w6
  194. dup v31.16b, w7
  195. 1: // height loop
  196. subs w9, w9, #2
  197. ld1 {v0.16b}, [x2], x3
  198. ld1 {v1.16b}, [x4], x5
  199. weight_\ext v16.8h, v0.8b, v1.8b
  200. weight_\ext v17.8h, v0.16b, v1.16b, 2
  201. ld1 {v2.16b}, [x2], x3
  202. ld1 {v3.16b}, [x4], x5
  203. weight_\ext v18.8h, v2.8b, v3.8b
  204. weight_\ext v19.8h, v2.16b, v3.16b, 2
  205. sqrshrun v0.8b, v16.8h, #6
  206. sqrshrun v1.8b, v18.8h, #6
  207. sqrshrun2 v0.16b, v17.8h, #6
  208. sqrshrun2 v1.16b, v19.8h, #6
  209. st1 {v0.16b}, [x0], x1
  210. st1 {v1.16b}, [x0], x1
  211. b.gt 1b
  212. ret
  213. endfunc
  214. .endm
  215. AVG_WEIGHT add_add
  216. AVG_WEIGHT add_sub
  217. AVG_WEIGHT sub_add
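// pixel_avg_wN: plain rounding average (a+b+1)>>1, reached from AVGH when weight == 32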
  218. function pixel_avg_w4_neon
  219. 1: subs w9, w9, #2
  220. ld1 {v0.s}[0], [x2], x3
  221. ld1 {v2.s}[0], [x4], x5
  222. urhadd v0.8b, v0.8b, v2.8b
  223. ld1 {v1.s}[0], [x2], x3
  224. ld1 {v3.s}[0], [x4], x5
  225. urhadd v1.8b, v1.8b, v3.8b
  226. st1 {v0.s}[0], [x0], x1
  227. st1 {v1.s}[0], [x0], x1
  228. b.gt 1b
  229. ret
  230. endfunc
  231. function pixel_avg_w8_neon
  232. 1: subs w9, w9, #4
  233. ld1 {v0.8b}, [x2], x3
  234. ld1 {v1.8b}, [x4], x5
  235. ld1 {v2.8b}, [x2], x3
  236. urhadd v0.8b, v0.8b, v1.8b
  237. ld1 {v3.8b}, [x4], x5
  238. st1 {v0.8b}, [x0], x1
  239. ld1 {v4.8b}, [x2], x3
  240. urhadd v1.8b, v2.8b, v3.8b
  241. ld1 {v5.8b}, [x4], x5
  242. st1 {v1.8b}, [x0], x1
  243. ld1 {v6.8b}, [x2], x3
  244. ld1 {v7.8b}, [x4], x5
  245. urhadd v2.8b, v4.8b, v5.8b
  246. urhadd v3.8b, v6.8b, v7.8b
  247. st1 {v2.8b}, [x0], x1
  248. st1 {v3.8b}, [x0], x1
  249. b.gt 1b
  250. ret
  251. endfunc
  252. function pixel_avg_w16_neon
  253. 1: subs w9, w9, #4
  254. ld1 {v0.16b}, [x2], x3
  255. ld1 {v1.16b}, [x4], x5
  256. ld1 {v2.16b}, [x2], x3
  257. urhadd v0.16b, v0.16b, v1.16b
  258. ld1 {v3.16b}, [x4], x5
  259. st1 {v0.16b}, [x0], x1
  260. ld1 {v4.16b}, [x2], x3
  261. urhadd v1.16b, v2.16b, v3.16b
  262. ld1 {v5.16b}, [x4], x5
  263. st1 {v1.16b}, [x0], x1
  264. ld1 {v6.16b}, [x2], x3
  265. ld1 {v7.16b}, [x4], x5
  266. urhadd v2.16b, v4.16b, v5.16b
  267. urhadd v3.16b, v6.16b, v7.16b
  268. st1 {v2.16b}, [x0], x1
  269. st1 {v3.16b}, [x0], x1
  270. b.gt 1b
  271. ret
  272. endfunc
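// void pixel_avg2_wN( uint8_t *dst, intptr_t dst_stride,
//                     uint8_t *src1, intptr_t src_stride,
//                     uint8_t *src2, int height )
// both sources advance by the same stride (x3)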
  273. function pixel_avg2_w4_neon, export=1
  274. 1:
  275. subs w5, w5, #2
  276. ld1 {v0.s}[0], [x2], x3
  277. ld1 {v2.s}[0], [x4], x3
  278. urhadd v0.8b, v0.8b, v2.8b
  279. ld1 {v1.s}[0], [x2], x3
  280. ld1 {v3.s}[0], [x4], x3
  281. urhadd v1.8b, v1.8b, v3.8b
  282. st1 {v0.s}[0], [x0], x1
  283. st1 {v1.s}[0], [x0], x1
  284. b.gt 1b
  285. ret
  286. endfunc
  287. function pixel_avg2_w8_neon, export=1
  288. 1:
  289. subs w5, w5, #2
  290. ld1 {v0.8b}, [x2], x3
  291. ld1 {v2.8b}, [x4], x3
  292. urhadd v0.8b, v0.8b, v2.8b
  293. ld1 {v1.8b}, [x2], x3
  294. ld1 {v3.8b}, [x4], x3
  295. urhadd v1.8b, v1.8b, v3.8b
  296. st1 {v0.8b}, [x0], x1
  297. st1 {v1.8b}, [x0], x1
  298. b.gt 1b
  299. ret
  300. endfunc
  301. function pixel_avg2_w16_neon, export=1
  302. 1:
  303. subs w5, w5, #2
  304. ld1 {v0.16b}, [x2], x3
  305. ld1 {v2.16b}, [x4], x3
  306. urhadd v0.16b, v0.16b, v2.16b
  307. ld1 {v1.16b}, [x2], x3
  308. ld1 {v3.16b}, [x4], x3
  309. urhadd v1.16b, v1.16b, v3.16b
  310. st1 {v0.16b}, [x0], x1
  311. st1 {v1.16b}, [x0], x1
  312. b.gt 1b
  313. ret
  314. endfunc
  315. function pixel_avg2_w20_neon, export=1
  316. sub x1, x1, #16
  317. 1:
  318. subs w5, w5, #2
  319. ld1 {v0.16b,v1.16b}, [x2], x3
  320. ld1 {v2.16b,v3.16b}, [x4], x3
  321. urhadd v0.16b, v0.16b, v2.16b
  322. urhadd v1.8b, v1.8b, v3.8b
  323. ld1 {v4.16b,v5.16b}, [x2], x3
  324. ld1 {v6.16b,v7.16b}, [x4], x3
  325. urhadd v4.16b, v4.16b, v6.16b
  326. urhadd v5.8b, v5.8b, v7.8b
  327. st1 {v0.16b}, [x0], #16
  328. st1 {v1.s}[0], [x0], x1
  329. st1 {v4.16b}, [x0], #16
  330. st1 {v5.s}[0], [x0], x1
  331. b.gt 1b
  332. ret
  333. endfunc
  334. .macro weight_prologue type
  335. mov w9, w5 // height
  336. .ifc \type, full
  337. ldr w12, [x4, #32] // denom
  338. .endif
  339. ldp w4, w5, [x4, #32+4] // scale, offset
  340. dup v0.16b, w4
  341. dup v1.8h, w5
  342. .ifc \type, full
  343. neg w12, w12
  344. dup v2.8h, w12
  345. .endif
  346. .endm
  347. // void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
  348. // intptr_t dst_stride, const x264_weight_t *weight, int h )
  349. function mc_weight_w20_neon, export=1
  350. weight_prologue full
  351. sub x1, x1, #16
  352. 1:
  353. subs w9, w9, #2
  354. ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
  355. ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
  356. umull v22.8h, v16.8b, v0.8b
  357. umull v23.8h, v17.8b, v0.8b
  358. zip1 v18.2s, v18.2s, v21.2s
  359. umull v25.8h, v19.8b, v0.8b
  360. umull v26.8h, v20.8b, v0.8b
  361. umull v24.8h, v18.8b, v0.8b
  362. srshl v22.8h, v22.8h, v2.8h
  363. srshl v23.8h, v23.8h, v2.8h
  364. srshl v24.8h, v24.8h, v2.8h
  365. srshl v25.8h, v25.8h, v2.8h
  366. srshl v26.8h, v26.8h, v2.8h
  367. add v22.8h, v22.8h, v1.8h
  368. add v23.8h, v23.8h, v1.8h
  369. add v24.8h, v24.8h, v1.8h
  370. add v25.8h, v25.8h, v1.8h
  371. add v26.8h, v26.8h, v1.8h
  372. sqxtun v4.8b, v22.8h
  373. sqxtun2 v4.16b, v23.8h
  374. sqxtun v6.8b, v24.8h
  375. sqxtun v5.8b, v25.8h
  376. sqxtun2 v5.16b, v26.8h
  377. st1 {v4.16b}, [x0], #16
  378. st1 {v6.s}[0], [x0], x1
  379. st1 {v5.16b}, [x0], #16
  380. st1 {v6.s}[1], [x0], x1
  381. b.gt 1b
  382. ret
  383. endfunc
  384. function mc_weight_w16_neon, export=1
  385. weight_prologue full
  386. weight16_loop:
  387. 1:
  388. subs w9, w9, #2
  389. ld1 {v4.16b}, [x2], x3
  390. ld1 {v5.16b}, [x2], x3
  391. umull v22.8h, v4.8b, v0.8b
  392. umull2 v23.8h, v4.16b, v0.16b
  393. umull v24.8h, v5.8b, v0.8b
  394. umull2 v25.8h, v5.16b, v0.16b
  395. srshl v22.8h, v22.8h, v2.8h
  396. srshl v23.8h, v23.8h, v2.8h
  397. srshl v24.8h, v24.8h, v2.8h
  398. srshl v25.8h, v25.8h, v2.8h
  399. add v22.8h, v22.8h, v1.8h
  400. add v23.8h, v23.8h, v1.8h
  401. add v24.8h, v24.8h, v1.8h
  402. add v25.8h, v25.8h, v1.8h
  403. sqxtun v4.8b, v22.8h
  404. sqxtun2 v4.16b, v23.8h
  405. sqxtun v5.8b, v24.8h
  406. sqxtun2 v5.16b, v25.8h
  407. st1 {v4.16b}, [x0], x1
  408. st1 {v5.16b}, [x0], x1
  409. b.gt 1b
  410. ret
  411. endfunc
  412. function mc_weight_w8_neon, export=1
  413. weight_prologue full
  414. 1:
  415. subs w9, w9, #2
  416. ld1 {v16.8b}, [x2], x3
  417. ld1 {v17.8b}, [x2], x3
  418. umull v4.8h, v16.8b, v0.8b
  419. umull v5.8h, v17.8b, v0.8b
  420. srshl v4.8h, v4.8h, v2.8h
  421. srshl v5.8h, v5.8h, v2.8h
  422. add v4.8h, v4.8h, v1.8h
  423. add v5.8h, v5.8h, v1.8h
  424. sqxtun v16.8b, v4.8h
  425. sqxtun v17.8b, v5.8h
  426. st1 {v16.8b}, [x0], x1
  427. st1 {v17.8b}, [x0], x1
  428. b.gt 1b
  429. ret
  430. endfunc
  431. function mc_weight_w4_neon, export=1
  432. weight_prologue full
  433. 1:
  434. subs w9, w9, #2
  435. ld1 {v16.s}[0], [x2], x3
  436. ld1 {v16.s}[1], [x2], x3
  437. umull v4.8h, v16.8b, v0.8b
  438. srshl v4.8h, v4.8h, v2.8h
  439. add v4.8h, v4.8h, v1.8h
  440. sqxtun v16.8b, v4.8h
  441. st1 {v16.s}[0], [x0], x1
  442. st1 {v16.s}[1], [x0], x1
  443. b.gt 1b
  444. ret
  445. endfunc
  446. function mc_weight_w20_nodenom_neon, export=1
  447. weight_prologue nodenom
  448. sub x1, x1, #16
  449. 1:
  450. subs w9, w9, #2
  451. ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
  452. mov v27.16b, v1.16b
  453. mov v28.16b, v1.16b
  454. ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
  455. mov v31.16b, v1.16b
  456. mov v29.16b, v1.16b
  457. mov v30.16b, v1.16b
  458. zip1 v18.2s, v18.2s, v21.2s
  459. umlal v27.8h, v16.8b, v0.8b
  460. umlal v28.8h, v17.8b, v0.8b
  461. umlal v31.8h, v18.8b, v0.8b
  462. umlal v29.8h, v19.8b, v0.8b
  463. umlal v30.8h, v20.8b, v0.8b
  464. sqxtun v4.8b, v27.8h
  465. sqxtun2 v4.16b, v28.8h
  466. sqxtun v5.8b, v29.8h
  467. sqxtun2 v5.16b, v30.8h
  468. sqxtun v6.8b, v31.8h
  469. st1 {v4.16b}, [x0], #16
  470. st1 {v6.s}[0], [x0], x1
  471. st1 {v5.16b}, [x0], #16
  472. st1 {v6.s}[1], [x0], x1
  473. b.gt 1b
  474. ret
  475. endfunc
  476. function mc_weight_w16_nodenom_neon, export=1
  477. weight_prologue nodenom
  478. 1:
  479. subs w9, w9, #2
  480. ld1 {v6.16b}, [x2], x3
  481. mov v27.16b, v1.16b
  482. mov v28.16b, v1.16b
  483. ld1 {v7.16b}, [x2], x3
  484. mov v29.16b, v1.16b
  485. mov v30.16b, v1.16b
  486. umlal v27.8h, v6.8b, v0.8b
  487. umlal2 v28.8h, v6.16b, v0.16b
  488. umlal v29.8h, v7.8b, v0.8b
  489. umlal2 v30.8h, v7.16b, v0.16b
  490. sqxtun v4.8b, v27.8h
  491. sqxtun2 v4.16b, v28.8h
  492. sqxtun v5.8b, v29.8h
  493. sqxtun2 v5.16b, v30.8h
  494. st1 {v4.16b}, [x0], x1
  495. st1 {v5.16b}, [x0], x1
  496. b.gt 1b
  497. ret
  498. endfunc
  499. function mc_weight_w8_nodenom_neon, export=1
  500. weight_prologue nodenom
  501. 1:
  502. subs w9, w9, #2
  503. ld1 {v16.8b}, [x2], x3
  504. mov v27.16b, v1.16b
  505. ld1 {v17.8b}, [x2], x3
  506. mov v29.16b, v1.16b
  507. umlal v27.8h, v16.8b, v0.8b
  508. umlal v29.8h, v17.8b, v0.8b
  509. sqxtun v4.8b, v27.8h
  510. sqxtun v5.8b, v29.8h
  511. st1 {v4.8b}, [x0], x1
  512. st1 {v5.8b}, [x0], x1
  513. b.gt 1b
  514. ret
  515. endfunc
  516. function mc_weight_w4_nodenom_neon, export=1
  517. weight_prologue nodenom
  518. 1:
  519. subs w9, w9, #2
  520. ld1 {v16.s}[0], [x2], x3
  521. ld1 {v16.s}[1], [x2], x3
  522. mov v27.16b, v1.16b
  523. umlal v27.8h, v16.8b, v0.8b
  524. sqxtun v4.8b, v27.8h
  525. st1 {v4.s}[0], [x0], x1
  526. st1 {v4.s}[1], [x0], x1
  527. b.gt 1b
  528. ret
  529. endfunc
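// offsetadd/offsetsub variants: the weight reduces to a saturating per-pixel
// offset, so only uqadd/uqsub is needed (no multiply or shift)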
  530. .macro weight_simple_prologue
  531. ldr w6, [x4] // offset
  532. dup v1.16b, w6
  533. .endm
  534. .macro weight_simple name op
  535. function mc_weight_w20_\name\()_neon, export=1
  536. weight_simple_prologue
  537. 1:
  538. subs w5, w5, #2
  539. ldr s18, [x2, #16]
  540. ld1 {v16.16b}, [x2], x3
  541. ldr s19, [x2, #16]
  542. ld1 {v17.16b}, [x2], x3
  543. \op v18.8b, v18.8b, v1.8b
  544. \op v16.16b, v16.16b, v1.16b
  545. \op v19.8b, v19.8b, v1.8b
  546. \op v17.16b, v17.16b, v1.16b
  547. str s18, [x0, #16]
  548. st1 {v16.16b}, [x0], x1
  549. str s19, [x0, #16]
  550. st1 {v17.16b}, [x0], x1
  551. b.gt 1b
  552. ret
  553. endfunc
  554. function mc_weight_w16_\name\()_neon, export=1
  555. weight_simple_prologue
  556. 1:
  557. subs w5, w5, #2
  558. ld1 {v16.16b}, [x2], x3
  559. ld1 {v17.16b}, [x2], x3
  560. \op v16.16b, v16.16b, v1.16b
  561. \op v17.16b, v17.16b, v1.16b
  562. st1 {v16.16b}, [x0], x1
  563. st1 {v17.16b}, [x0], x1
  564. b.gt 1b
  565. ret
  566. endfunc
  567. function mc_weight_w8_\name\()_neon, export=1
  568. weight_simple_prologue
  569. 1:
  570. subs w5, w5, #2
  571. ld1 {v16.8b}, [x2], x3
  572. ld1 {v17.8b}, [x2], x3
  573. \op v16.8b, v16.8b, v1.8b
  574. \op v17.8b, v17.8b, v1.8b
  575. st1 {v16.8b}, [x0], x1
  576. st1 {v17.8b}, [x0], x1
  577. b.gt 1b
  578. ret
  579. endfunc
  580. function mc_weight_w4_\name\()_neon, export=1
  581. weight_simple_prologue
  582. 1:
  583. subs w5, w5, #2
  584. ld1 {v16.s}[0], [x2], x3
  585. ld1 {v16.s}[1], [x2], x3
  586. \op v16.8b, v16.8b, v1.8b
  587. st1 {v16.s}[0], [x0], x1
  588. st1 {v16.s}[1], [x0], x1
  589. b.gt 1b
  590. ret
  591. endfunc
  592. .endm
  593. weight_simple offsetadd, uqadd
  594. weight_simple offsetsub, uqsub
  595. // void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
  596. function mc_copy_w4_neon, export=1
  597. 1:
  598. subs w4, w4, #4
  599. ld1 {v0.s}[0], [x2], x3
  600. ld1 {v1.s}[0], [x2], x3
  601. ld1 {v2.s}[0], [x2], x3
  602. ld1 {v3.s}[0], [x2], x3
  603. st1 {v0.s}[0], [x0], x1
  604. st1 {v1.s}[0], [x0], x1
  605. st1 {v2.s}[0], [x0], x1
  606. st1 {v3.s}[0], [x0], x1
  607. b.gt 1b
  608. ret
  609. endfunc
  610. function mc_copy_w8_neon, export=1
  611. 1: subs w4, w4, #4
  612. ld1 {v0.8b}, [x2], x3
  613. ld1 {v1.8b}, [x2], x3
  614. ld1 {v2.8b}, [x2], x3
  615. ld1 {v3.8b}, [x2], x3
  616. st1 {v0.8b}, [x0], x1
  617. st1 {v1.8b}, [x0], x1
  618. st1 {v2.8b}, [x0], x1
  619. st1 {v3.8b}, [x0], x1
  620. b.gt 1b
  621. ret
  622. endfunc
  623. function mc_copy_w16_neon, export=1
  624. 1: subs w4, w4, #4
  625. ld1 {v0.16b}, [x2], x3
  626. ld1 {v1.16b}, [x2], x3
  627. ld1 {v2.16b}, [x2], x3
  628. ld1 {v3.16b}, [x2], x3
  629. st1 {v0.16b}, [x0], x1
  630. st1 {v1.16b}, [x0], x1
  631. st1 {v2.16b}, [x0], x1
  632. st1 {v3.16b}, [x0], x1
  633. b.gt 1b
  634. ret
  635. endfunc
  636. // void mc_chroma( uint8_t *dst_u, uint8_t *dst_v,
  637. // intptr_t i_dst_stride,
  638. // uint8_t *src, intptr_t i_src_stride,
  639. // int dx, int dy, int i_width, int i_height );
  640. function mc_chroma_neon, export=1
  641. ldr w15, [sp] // height
  642. sbfx x12, x6, #3, #29 // asr(3) and sign extend
  643. sbfx x11, x5, #3, #29 // asr(3) and sign extend
  644. cmp w7, #4
  645. mul x12, x12, x4
  646. add x3, x3, x11, lsl #1
  647. and w5, w5, #7
  648. and w6, w6, #7
  649. add x3, x3, x12
  650. //pld [x3]
  651. //pld [x3, x4]
  652. b.gt mc_chroma_w8_neon
  653. b.eq mc_chroma_w4_neon
  654. endfunc
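// CHROMA_MC_START: bilinear weights from the 1/8-pel fractions in w5/w6:
// cA = (8-d8x)*(8-d8y), cB = d8x*(8-d8y), cC = (8-d8x)*d8y, cD = d8x*d8y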
  655. .macro CHROMA_MC_START r00, r01, r10, r11
  656. mul w12, w5, w6 // cD = d8x *d8y
  657. lsl w13, w5, #3
  658. add w9, w12, #64
  659. lsl w14, w6, #3
  660. tst w12, w12
  661. sub w9, w9, w13
  662. sub w10, w13, w12 // cB = d8x *(8-d8y);
  663. sub w11, w14, w12 // cC = (8-d8x)*d8y
  664. sub w9, w9, w14 // cA = (8-d8x)*(8-d8y);
  665. .endm
  666. .macro CHROMA_MC width, vsize
  667. function mc_chroma_w\width\()_neon
  668. // since the element size varies, there's a different index for the 2nd store
  669. .if \width == 4
  670. .set idx2, 1
  671. .else
  672. .set idx2, 2
  673. .endif
  674. CHROMA_MC_START
  675. b.eq 2f
  676. ld2 {v28.8b,v29.8b}, [x3], x4
  677. dup v0.8b, w9 // cA
  678. dup v1.8b, w10 // cB
  679. ext v6.8b, v28.8b, v6.8b, #1
  680. ext v7.8b, v29.8b, v7.8b, #1
  681. ld2 {v30.8b,v31.8b}, [x3], x4
  682. dup v2.8b, w11 // cC
  683. dup v3.8b, w12 // cD
  684. ext v22.8b, v30.8b, v22.8b, #1
  685. ext v23.8b, v31.8b, v23.8b, #1
  686. trn1 v0.2s, v0.2s, v1.2s
  687. trn1 v2.2s, v2.2s, v3.2s
  688. trn1 v4.2s, v28.2s, v6.2s
  689. trn1 v5.2s, v29.2s, v7.2s
  690. trn1 v20.2s, v30.2s, v22.2s
  691. trn1 v21.2s, v31.2s, v23.2s
  692. 1: // height loop, interpolate xy
  693. subs w15, w15, #2
  694. umull v16.8h, v4.8b, v0.8b
  695. umlal v16.8h, v20.8b, v2.8b
  696. umull v17.8h, v5.8b, v0.8b
  697. umlal v17.8h, v21.8b, v2.8b
  698. ld2 {v28.8b,v29.8b}, [x3], x4
  699. transpose v24.2d, v25.2d, v16.2d, v17.2d
  700. ext v6.8b, v28.8b, v6.8b, #1
  701. ext v7.8b, v29.8b, v7.8b, #1
  702. trn1 v4.2s, v28.2s, v6.2s
  703. trn1 v5.2s, v29.2s, v7.2s
  704. add v16.8h, v24.8h, v25.8h
  705. umull v18.8h, v20.8b, v0.8b
  706. umlal v18.8h, v4.8b, v2.8b
  707. umull v19.8h, v21.8b, v0.8b
  708. umlal v19.8h, v5.8b, v2.8b
  709. ld2 {v30.8b,v31.8b}, [x3], x4
  710. transpose v26.2d, v27.2d, v18.2d, v19.2d
  711. ext v22.8b, v30.8b, v22.8b, #1
  712. ext v23.8b, v31.8b, v23.8b, #1
  713. trn1 v20.2s, v30.2s, v22.2s
  714. trn1 v21.2s, v31.2s, v23.2s
  715. add v17.8h, v26.8h, v27.8h
  716. rshrn v16.8b, v16.8h, #6
  717. rshrn v17.8b, v17.8h, #6
  718. //pld [x3]
  719. //pld [x3, x4]
  720. st1 {v16.\vsize}[0], [x0], x2
  721. st1 {v16.\vsize}[idx2], [x1], x2
  722. st1 {v17.\vsize}[0], [x0], x2
  723. st1 {v17.\vsize}[idx2], [x1], x2
  724. b.gt 1b
  725. ret
  726. 2: // dx or dy is 0
  727. tst w11, w11
  728. add w10, w10, w11
  729. dup v0.8b, w9
  730. dup v1.8b, w10
  731. b.eq 4f
  732. ld1 {v4.8b}, [x3], x4
  733. ld1 {v6.8b}, [x3], x4
  734. 3: // vertical interpolation loop
  735. subs w15, w15, #2
  736. umull v16.8h, v4.8b, v0.8b
  737. ld1 {v4.8b}, [x3], x4
  738. umlal v16.8h, v6.8b, v1.8b
  739. umull v17.8h, v6.8b, v0.8b
  740. ld1 {v6.8b}, [x3], x4
  741. umlal v17.8h, v4.8b, v1.8b
  742. rshrn v20.8b, v16.8h, #6 // uvuvuvuv
  743. rshrn v21.8b, v17.8h, #6 // uvuvuvuv
  744. uzp1 v16.8b, v20.8b, v21.8b // v16=uuuu|uuuu, v17=vvvv|vvvv
  745. uzp2 v17.8b, v20.8b, v21.8b // v16=uuuu|uuuu, v17=vvvv|vvvv
  746. //pld [x3]
  747. //pld [x3, x4]
  748. st1 {v16.\vsize}[0], [x0], x2
  749. st1 {v16.\vsize}[idx2], [x0], x2
  750. st1 {v17.\vsize}[0], [x1], x2
  751. st1 {v17.\vsize}[idx2], [x1], x2
  752. b.gt 3b
  753. ret
  754. 4: // dy is 0
  755. ld1 {v4.8b,v5.8b}, [x3], x4
  756. ld1 {v6.8b,v7.8b}, [x3], x4
  757. ext v5.8b, v4.8b, v5.8b, #2
  758. ext v7.8b, v6.8b, v7.8b, #2
  759. 5: // horizontal interpolation loop
  760. subs w15, w15, #2
  761. umull v16.8h, v4.8b, v0.8b
  762. umlal v16.8h, v5.8b, v1.8b
  763. umull v17.8h, v6.8b, v0.8b
  764. umlal v17.8h, v7.8b, v1.8b
  765. ld1 {v4.8b,v5.8b}, [x3], x4
  766. ld1 {v6.8b,v7.8b}, [x3], x4
  767. rshrn v20.8b, v16.8h, #6
  768. rshrn v21.8b, v17.8h, #6
  769. ext v5.8b, v4.8b, v5.8b, #2
  770. ext v7.8b, v6.8b, v7.8b, #2
  771. uzp1 v16.8b, v20.8b, v21.8b // v16=uuuu|uuuu, v17=vvvv|vvvv
  772. uzp2 v17.8b, v20.8b, v21.8b // v16=uuuu|uuuu, v17=vvvv|vvvv
  773. //pld [x3]
  774. //pld [x3, x4]
  775. st1 {v16.\vsize}[0], [x0], x2
  776. st1 {v16.\vsize}[idx2], [x0], x2
  777. st1 {v17.\vsize}[0], [x1], x2
  778. st1 {v17.\vsize}[idx2], [x1], x2
  779. b.gt 5b
  780. ret
  781. endfunc
  782. .endm
  783. CHROMA_MC 2, h
  784. CHROMA_MC 4, s
  785. function mc_chroma_w8_neon
  786. CHROMA_MC_START
  787. b.eq 2f
  788. ld2 {v4.16b,v5.16b}, [x3], x4
  789. ld2 {v20.16b,v21.16b}, [x3], x4
  790. dup v0.8b, w9 // cA
  791. dup v1.8b, w10 // cB
  792. ext v6.16b, v4.16b, v4.16b, #1
  793. ext v7.16b, v5.16b, v5.16b, #1
  794. dup v2.8b, w11 // cC
  795. dup v3.8b, w12 // cD
  796. ext v22.16b, v20.16b, v20.16b, #1
  797. ext v23.16b, v21.16b, v21.16b, #1
  798. 1: // height loop, interpolate xy
  799. subs w15, w15, #2
  800. umull v16.8h, v4.8b, v0.8b
  801. umlal v16.8h, v6.8b, v1.8b
  802. umlal v16.8h, v20.8b, v2.8b
  803. umlal v16.8h, v22.8b, v3.8b
  804. umull v17.8h, v5.8b, v0.8b
  805. umlal v17.8h, v7.8b, v1.8b
  806. umlal v17.8h, v21.8b, v2.8b
  807. umlal v17.8h, v23.8b, v3.8b
  808. ld2 {v4.16b,v5.16b}, [x3], x4
  809. ext v6.16b, v4.16b, v4.16b, #1
  810. ext v7.16b, v5.16b, v5.16b, #1
  811. umull v18.8h, v20.8b, v0.8b
  812. umlal v18.8h, v22.8b, v1.8b
  813. umlal v18.8h, v4.8b, v2.8b
  814. umlal v18.8h, v6.8b, v3.8b
  815. umull v19.8h, v21.8b, v0.8b
  816. umlal v19.8h, v23.8b, v1.8b
  817. umlal v19.8h, v5.8b, v2.8b
  818. umlal v19.8h, v7.8b, v3.8b
  819. ld2 {v20.16b,v21.16b}, [x3], x4
  820. rshrn v16.8b, v16.8h, #6
  821. rshrn v17.8b, v17.8h, #6
  822. rshrn v18.8b, v18.8h, #6
  823. rshrn v19.8b, v19.8h, #6
  824. ext v22.16b, v20.16b, v20.16b, #1
  825. ext v23.16b, v21.16b, v21.16b, #1
  826. //pld [x3]
  827. //pld [x3, x4]
  828. st1 {v16.8b}, [x0], x2
  829. st1 {v17.8b}, [x1], x2
  830. st1 {v18.8b}, [x0], x2
  831. st1 {v19.8b}, [x1], x2
  832. b.gt 1b
  833. ret
  834. 2: // dx or dy is 0
  835. tst w11, w11
  836. add w10, w10, w11
  837. dup v0.8b, w9
  838. dup v1.8b, w10
  839. b.eq 4f
  840. ld2 {v4.8b,v5.8b}, [x3], x4
  841. ld2 {v6.8b,v7.8b}, [x3], x4
  842. 3: // vertical interpolation loop
  843. subs w15, w15, #2
  844. umull v16.8h, v4.8b, v0.8b //U
  845. umlal v16.8h, v6.8b, v1.8b
  846. umull v17.8h, v5.8b, v0.8b //V
  847. umlal v17.8h, v7.8b, v1.8b
  848. ld2 {v4.8b,v5.8b}, [x3], x4
  849. umull v18.8h, v6.8b, v0.8b
  850. umlal v18.8h, v4.8b, v1.8b
  851. umull v19.8h, v7.8b, v0.8b
  852. umlal v19.8h, v5.8b, v1.8b
  853. ld2 {v6.8b,v7.8b}, [x3], x4
  854. rshrn v16.8b, v16.8h, #6
  855. rshrn v17.8b, v17.8h, #6
  856. rshrn v18.8b, v18.8h, #6
  857. rshrn v19.8b, v19.8h, #6
  858. //pld [x3]
  859. //pld [x3, x4]
  860. st1 {v16.8b}, [x0], x2
  861. st1 {v17.8b}, [x1], x2
  862. st1 {v18.8b}, [x0], x2
  863. st1 {v19.8b}, [x1], x2
  864. b.gt 3b
  865. ret
  866. 4: // dy is 0
  867. ld2 {v4.16b,v5.16b}, [x3], x4
  868. ext v6.16b, v4.16b, v4.16b, #1
  869. ext v7.16b, v5.16b, v5.16b, #1
  870. ld2 {v20.16b,v21.16b}, [x3], x4
  871. ext v22.16b, v20.16b, v20.16b, #1
  872. ext v23.16b, v21.16b, v21.16b, #1
  873. 5: // horizontal interpolation loop
  874. subs w15, w15, #2
  875. umull v16.8h, v4.8b, v0.8b //U
  876. umlal v16.8h, v6.8b, v1.8b
  877. umull v17.8h, v5.8b, v0.8b //V
  878. umlal v17.8h, v7.8b, v1.8b
  879. ld2 {v4.16b,v5.16b}, [x3], x4
  880. umull v18.8h, v20.8b, v0.8b
  881. umlal v18.8h, v22.8b, v1.8b
  882. umull v19.8h, v21.8b, v0.8b
  883. umlal v19.8h, v23.8b, v1.8b
  884. ld2 {v20.16b,v21.16b}, [x3], x4
  885. rshrn v16.8b, v16.8h, #6
  886. rshrn v17.8b, v17.8h, #6
  887. rshrn v18.8b, v18.8h, #6
  888. rshrn v19.8b, v19.8h, #6
  889. ext v6.16b, v4.16b, v4.16b, #1
  890. ext v7.16b, v5.16b, v5.16b, #1
  891. ext v22.16b, v20.16b, v20.16b, #1
  892. ext v23.16b, v21.16b, v21.16b, #1
  893. //pld [x3]
  894. //pld [x3, x4]
  895. st1 {v16.8b}, [x0], x2
  896. st1 {v17.8b}, [x1], x2
  897. st1 {v18.8b}, [x0], x2
  898. st1 {v19.8b}, [x1], x2
  899. b.gt 5b
  900. ret
  901. endfunc
  902. // void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
  903. // intptr_t stride, int width, int height, int16_t *buf )
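// 6-tap filter [1,-5,20,20,-5,1]: x0 receives the horizontal half-pel plane,
// x1 the vertical one, and x2 the centre plane (horizontal pass applied to
// the 16-bit vertical intermediates)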
  904. function hpel_filter_neon, export=1
  905. ubfm x9, x3, #0, #3
  906. add w15, w5, w9
  907. sub x13, x3, x9 // align src
  908. sub x10, x0, x9
  909. sub x11, x1, x9
  910. sub x12, x2, x9
  911. movi v30.16b, #5
  912. movi v31.16b, #20
  913. 1: // line start
  914. mov x3, x13
  915. mov x2, x12
  916. mov x1, x11
  917. mov x0, x10
  918. add x7, x3, #16 // src pointer next 16b for horiz filter
  919. mov x5, x15 // restore width
  920. sub x3, x3, x4, lsl #1 // src - 2*stride
  921. ld1 {v28.16b}, [x7], #16 // src[16:31]
  922. add x9, x3, x5 // holds src - 2*stride + width
  923. ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15]
  924. ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15]
  925. ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15]
  926. ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15]
  927. ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15]
  928. ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15]
  929. ext v22.16b, v7.16b, v18.16b, #14
  930. uaddl v1.8h, v16.8b, v21.8b
  931. ext v26.16b, v18.16b, v28.16b, #3
  932. umlsl v1.8h, v17.8b, v30.8b
  933. ext v23.16b, v7.16b, v18.16b, #15
  934. umlal v1.8h, v18.8b, v31.8b
  935. ext v24.16b, v18.16b, v28.16b, #1
  936. umlal v1.8h, v19.8b, v31.8b
  937. ext v25.16b, v18.16b, v28.16b, #2
  938. umlsl v1.8h, v20.8b, v30.8b
  939. 2: // next 16 pixels of the line
  940. subs x5, x5, #16
  941. sub x3, x9, x5 // src - 2*stride += 16
  942. uaddl v4.8h, v22.8b, v26.8b
  943. uaddl2 v5.8h, v22.16b, v26.16b
  944. sqrshrun v6.8b, v1.8h, #5
  945. umlsl v4.8h, v23.8b, v30.8b
  946. umlsl2 v5.8h, v23.16b, v30.16b
  947. umlal v4.8h, v18.8b, v31.8b
  948. umlal2 v5.8h, v18.16b, v31.16b
  949. umlal v4.8h, v24.8b, v31.8b
  950. umlal2 v5.8h, v24.16b, v31.16b
  951. umlsl v4.8h, v25.8b, v30.8b
  952. umlsl2 v5.8h, v25.16b, v30.16b
  953. uaddl2 v2.8h, v16.16b, v21.16b
  954. sqrshrun v4.8b, v4.8h, #5
  955. mov v7.16b, v18.16b
  956. sqrshrun2 v4.16b, v5.8h, #5
  957. umlsl2 v2.8h, v17.16b, v30.16b
  958. ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15]
  959. umlal2 v2.8h, v18.16b, v31.16b
  960. ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15]
  961. umlal2 v2.8h, v19.16b, v31.16b
  962. ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15]
  963. umlsl2 v2.8h, v20.16b, v30.16b
  964. ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15]
  965. st1 {v4.16b}, [x0], #16
  966. sqrshrun2 v6.16b, v2.8h, #5
  967. ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15]
  968. ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15]
  969. ext v22.16b, v0.16b, v1.16b, #12
  970. ext v26.16b, v1.16b, v2.16b, #6
  971. ext v23.16b, v0.16b, v1.16b, #14
  972. st1 {v6.16b}, [x1], #16
  973. uaddl v3.8h, v16.8b, v21.8b
  974. ext v25.16b, v1.16b, v2.16b, #4
  975. umlsl v3.8h, v17.8b, v30.8b
  976. ext v24.16b, v1.16b, v2.16b, #2
  977. umlal v3.8h, v18.8b, v31.8b
  978. add v4.8h, v22.8h, v26.8h
  979. umlal v3.8h, v19.8b, v31.8b
  980. add v5.8h, v23.8h, v25.8h
  981. umlsl v3.8h, v20.8b, v30.8b
  982. add v6.8h, v24.8h, v1.8h
  983. ext v22.16b, v1.16b, v2.16b, #12
  984. ext v26.16b, v2.16b, v3.16b, #6
  985. ext v23.16b, v1.16b, v2.16b, #14
  986. ext v25.16b, v2.16b, v3.16b, #4
  987. ext v24.16b, v2.16b, v3.16b, #2
  988. add v22.8h, v22.8h, v26.8h
  989. add v23.8h, v23.8h, v25.8h
  990. add v24.8h, v24.8h, v2.8h
  991. sub v4.8h, v4.8h, v5.8h // a-b
  992. sub v5.8h, v5.8h, v6.8h // b-c
  993. sub v22.8h, v22.8h, v23.8h // a-b
  994. sub v23.8h, v23.8h, v24.8h // b-c
  995. sshr v4.8h, v4.8h, #2 // (a-b)/4
  996. sshr v22.8h, v22.8h, #2 // (a-b)/4
  997. sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c
  998. sub v22.8h, v22.8h, v23.8h // (a-b)/4-b+c
  999. sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4
  1000. sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4
  1001. add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1002. add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1003. sqrshrun v4.8b, v4.8h, #6
  1004. ld1 {v28.16b}, [x7], #16 // src[16:31]
  1005. mov v0.16b, v2.16b
  1006. ext v23.16b, v7.16b, v18.16b, #15
  1007. sqrshrun2 v4.16b, v22.8h, #6
  1008. mov v1.16b, v3.16b
  1009. ext v22.16b, v7.16b, v18.16b, #14
  1010. ext v24.16b, v18.16b, v28.16b, #1
  1011. ext v25.16b, v18.16b, v28.16b, #2
  1012. ext v26.16b, v18.16b, v28.16b, #3
  1013. st1 {v4.16b}, [x2], #16
  1014. b.gt 2b
  1015. subs w6, w6, #1
  1016. add x10, x10, x4
  1017. add x11, x11, x4
  1018. add x12, x12, x4
  1019. add x13, x13, x4
  1020. b.gt 1b
  1021. ret
  1022. endfunc
  1023. // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
  1024. // uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
  1025. // intptr_t dst_stride, int width, int height )
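// produces four half-resolution planes: dst0 from the even phase, and
// dsth/dstv/dstc shifted by half a pel in x, y and both directions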
  1026. function frame_init_lowres_core_neon, export=1
  1027. ldr w8, [sp]
  1028. sub x10, x6, w7, uxtw // dst_stride - width
  1029. and x10, x10, #~15
  1030. 1:
  1031. mov w9, w7 // width
  1032. mov x11, x0 // src0
  1033. add x12, x0, x5 // src1 = src0 + src_stride
  1034. add x13, x0, x5, lsl #1 // src2 = src1 + src_stride
  1035. ld2 {v0.16b,v1.16b}, [x11], #32
  1036. ld2 {v2.16b,v3.16b}, [x12], #32
  1037. ld2 {v4.16b,v5.16b}, [x13], #32
  1038. urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
  1039. urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
  1040. 2:
  1041. subs w9, w9, #16
  1042. urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
  1043. urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
  1044. ld2 {v0.16b,v1.16b}, [x11], #32
  1045. ld2 {v2.16b,v3.16b}, [x12], #32
  1046. ld2 {v4.16b,v5.16b}, [x13], #32
  1047. urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
  1048. urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
  1049. ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2]
  1050. ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2]
  1051. urhadd v16.16b, v20.16b, v21.16b
  1052. urhadd v18.16b, v22.16b, v23.16b
  1053. urhadd v17.16b, v21.16b, v24.16b
  1054. urhadd v19.16b, v23.16b, v25.16b
  1055. st1 {v16.16b}, [x1], #16
  1056. st1 {v18.16b}, [x3], #16
  1057. st1 {v17.16b}, [x2], #16
  1058. st1 {v19.16b}, [x4], #16
  1059. b.le 3f
  1060. subs w9, w9, #16
  1061. urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
  1062. urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
  1063. ld2 {v0.16b,v1.16b}, [x11], #32
  1064. ld2 {v2.16b,v3.16b}, [x12], #32
  1065. ld2 {v4.16b,v5.16b}, [x13], #32
  1066. urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
  1067. urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
  1068. ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2]
  1069. ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2]
  1070. urhadd v16.16b, v30.16b, v21.16b
  1071. urhadd v18.16b, v31.16b, v23.16b
  1072. urhadd v17.16b, v21.16b, v24.16b
  1073. urhadd v19.16b, v23.16b, v25.16b
  1074. st1 {v16.16b}, [x1], #16
  1075. st1 {v18.16b}, [x3], #16
  1076. st1 {v17.16b}, [x2], #16
  1077. st1 {v19.16b}, [x4], #16
  1078. b.gt 2b
  1079. 3:
  1080. subs w8, w8, #1
  1081. add x0, x0, x5, lsl #1
  1082. add x1, x1, x10
  1083. add x2, x2, x10
  1084. add x3, x3, x10
  1085. add x4, x4, x10
  1086. b.gt 1b
  1087. ret
  1088. endfunc
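// void load_deinterleave_chroma_fenc/fdec( uint8_t *dst, uint8_t *src,
//                                          intptr_t src_stride, int height )
// splits interleaved UV rows into planar U and V rows FENC/FDEC_STRIDE/2 apart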
  1089. function load_deinterleave_chroma_fenc_neon, export=1
  1090. mov x4, #FENC_STRIDE/2
  1091. b load_deinterleave_chroma
  1092. endfunc
  1093. function load_deinterleave_chroma_fdec_neon, export=1
  1094. mov x4, #FDEC_STRIDE/2
  1095. load_deinterleave_chroma:
  1096. ld2 {v0.8b,v1.8b}, [x1], x2
  1097. ld2 {v2.8b,v3.8b}, [x1], x2
  1098. subs w3, w3, #2
  1099. st1 {v0.8b}, [x0], x4
  1100. st1 {v1.8b}, [x0], x4
  1101. st1 {v2.8b}, [x0], x4
  1102. st1 {v3.8b}, [x0], x4
  1103. b.gt load_deinterleave_chroma
  1104. ret
  1105. endfunc
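// void plane_copy_core( uint8_t *dst, intptr_t dst_stride,
//                       uint8_t *src, intptr_t src_stride, int w, int h )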
  1106. function plane_copy_core_neon, export=1
  1107. add w8, w4, #15 // a 32-bit write clears the upper 32 bits of the register
  1108. and w4, w8, #~15
  1109. // safe use of the full reg since negative width makes no sense
  1110. sub x1, x1, x4
  1111. sub x3, x3, x4
  1112. 1:
  1113. mov w8, w4
  1114. 16:
  1115. tst w8, #16
  1116. b.eq 32f
  1117. subs w8, w8, #16
  1118. ldr q0, [x2], #16
  1119. str q0, [x0], #16
  1120. b.eq 0f
  1121. 32:
  1122. subs w8, w8, #32
  1123. ldp q0, q1, [x2], #32
  1124. stp q0, q1, [x0], #32
  1125. b.gt 32b
  1126. 0:
  1127. subs w5, w5, #1
  1128. add x2, x2, x3
  1129. add x0, x0, x1
  1130. b.gt 1b
  1131. ret
  1132. endfunc
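// plane_copy_swap_core: like plane_copy_core but swaps each byte pair with
// rev16 (e.g. to exchange U/V in an interleaved chroma plane); w4 counts pairs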
  1133. function plane_copy_swap_core_neon, export=1
  1134. lsl w4, w4, #1
  1135. sub x1, x1, x4
  1136. sub x3, x3, x4
  1137. 1:
  1138. mov w8, w4
  1139. tbz w4, #4, 32f
  1140. subs w8, w8, #16
  1141. ld1 {v0.16b}, [x2], #16
  1142. rev16 v0.16b, v0.16b
  1143. st1 {v0.16b}, [x0], #16
  1144. b.eq 0f
  1145. 32:
  1146. subs w8, w8, #32
  1147. ld1 {v0.16b,v1.16b}, [x2], #32
  1148. rev16 v0.16b, v0.16b
  1149. rev16 v1.16b, v1.16b
  1150. st1 {v0.16b,v1.16b}, [x0], #32
  1151. b.gt 32b
  1152. 0:
  1153. subs w5, w5, #1
  1154. add x2, x2, x3
  1155. add x0, x0, x1
  1156. b.gt 1b
  1157. ret
  1158. endfunc
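// void plane_copy_deinterleave( uint8_t *dstu, intptr_t dstu_stride,
//                               uint8_t *dstv, intptr_t dstv_stride,
//                               uint8_t *src, intptr_t src_stride, int w, int h )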
  1159. function plane_copy_deinterleave_neon, export=1
  1160. add w9, w6, #15
  1161. and w9, w9, #0xfffffff0
  1162. sub x1, x1, x9
  1163. sub x3, x3, x9
  1164. sub x5, x5, x9, lsl #1
  1165. 1:
  1166. ld2 {v0.16b,v1.16b}, [x4], #32
  1167. subs w9, w9, #16
  1168. st1 {v0.16b}, [x0], #16
  1169. st1 {v1.16b}, [x2], #16
  1170. b.gt 1b
  1171. add x4, x4, x5
  1172. subs w7, w7, #1
  1173. add x0, x0, x1
  1174. add x2, x2, x3
  1175. mov w9, w6
  1176. b.gt 1b
  1177. ret
  1178. endfunc
  1179. .macro deinterleave_rgb
  1180. subs x11, x11, #8
  1181. st1 {v0.8b}, [x0], #8
  1182. st1 {v1.8b}, [x2], #8
  1183. st1 {v2.8b}, [x4], #8
  1184. b.gt 1b
  1185. subs w10, w10, #1
  1186. add x0, x0, x1
  1187. add x2, x2, x3
  1188. add x4, x4, x5
  1189. add x6, x6, x7
  1190. mov x11, x9
  1191. b.gt 1b
  1192. .endm
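// plane_copy_deinterleave_rgb: splits packed 3-byte (RGB) or 4-byte (RGBA,
// selected by the pixel-size stack argument in w8) pixels into three planes;
// the fourth byte of RGBA input is discarded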
  1193. function plane_copy_deinterleave_rgb_neon, export=1
  1194. #if SYS_MACOSX
  1195. ldr w8, [sp]
  1196. ldp w9, w10, [sp, #4]
  1197. #else
  1198. ldr x8, [sp]
  1199. ldp x9, x10, [sp, #8]
  1200. #endif
  1201. cmp w8, #3
  1202. uxtw x9, w9
  1203. add x11, x9, #7
  1204. and x11, x11, #~7
  1205. sub x1, x1, x11
  1206. sub x3, x3, x11
  1207. sub x5, x5, x11
  1208. b.ne 4f
  1209. sub x7, x7, x11, lsl #1
  1210. sub x7, x7, x11
  1211. 1:
  1212. ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24
  1213. deinterleave_rgb
  1214. ret
  1215. 4:
  1216. sub x7, x7, x11, lsl #2
  1217. 1:
  1218. ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
  1219. deinterleave_rgb
  1220. ret
  1221. endfunc
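// void plane_copy_interleave_core( uint8_t *dst, intptr_t dst_stride,
//                                  uint8_t *srcu, intptr_t srcu_stride,
//                                  uint8_t *srcv, intptr_t srcv_stride, int w, int h )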
  1222. function plane_copy_interleave_core_neon, export=1
  1223. add w9, w6, #15
  1224. and w9, w9, #0xfffffff0
  1225. sub x1, x1, x9, lsl #1
  1226. sub x3, x3, x9
  1227. sub x5, x5, x9
  1228. 1:
  1229. ld1 {v0.16b}, [x2], #16
  1230. ld1 {v1.16b}, [x4], #16
  1231. subs w9, w9, #16
  1232. st2 {v0.16b,v1.16b}, [x0], #32
  1233. b.gt 1b
  1234. subs w7, w7, #1
  1235. add x0, x0, x1
  1236. add x2, x2, x3
  1237. add x4, x4, x5
  1238. mov w9, w6
  1239. b.gt 1b
  1240. ret
  1241. endfunc
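// void store_interleave_chroma( uint8_t *dst, intptr_t dst_stride,
//                               uint8_t *srcu, uint8_t *srcv, int height )
// re-interleaves planar U/V rows (FDEC_STRIDE apart) into UVUV... rows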
  1242. function store_interleave_chroma_neon, export=1
  1243. mov x5, #FDEC_STRIDE
  1244. 1:
  1245. ld1 {v0.8b}, [x2], x5
  1246. ld1 {v1.8b}, [x3], x5
  1247. ld1 {v2.8b}, [x2], x5
  1248. ld1 {v3.8b}, [x3], x5
  1249. subs w4, w4, #2
  1250. zip1 v4.16b, v0.16b, v1.16b
  1251. zip1 v5.16b, v2.16b, v3.16b
  1252. st1 {v4.16b}, [x0], x1
  1253. st1 {v5.16b}, [x0], x1
  1254. b.gt 1b
  1255. ret
  1256. endfunc
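// integral_init4h/8h( uint16_t *sum, uint8_t *pix, intptr_t stride )
// horizontal pass: each output is the sum of the next 4 (or 8) source pixels
// added to the corresponding entry of the previous sum row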
  1257. .macro integral4h p1, p2
  1258. ext v1.8b, \p1\().8b, \p2\().8b, #1
  1259. ext v2.8b, \p1\().8b, \p2\().8b, #2
  1260. ext v3.8b, \p1\().8b, \p2\().8b, #3
  1261. uaddl v0.8h, \p1\().8b, v1.8b
  1262. uaddl v4.8h, v2.8b, v3.8b
  1263. add v0.8h, v0.8h, v4.8h
  1264. add v0.8h, v0.8h, v5.8h
  1265. .endm
  1266. function integral_init4h_neon, export=1
  1267. sub x3, x0, x2, lsl #1
  1268. ld1 {v6.8b,v7.8b}, [x1], #16
  1269. 1:
  1270. subs x2, x2, #16
  1271. ld1 {v5.8h}, [x3], #16
  1272. integral4h v6, v7
  1273. ld1 {v6.8b}, [x1], #8
  1274. ld1 {v5.8h}, [x3], #16
  1275. st1 {v0.8h}, [x0], #16
  1276. integral4h v7, v6
  1277. ld1 {v7.8b}, [x1], #8
  1278. st1 {v0.8h}, [x0], #16
  1279. b.gt 1b
  1280. ret
  1281. endfunc
  1282. .macro integral8h p1, p2, s
  1283. ext v1.8b, \p1\().8b, \p2\().8b, #1
  1284. ext v2.8b, \p1\().8b, \p2\().8b, #2
  1285. ext v3.8b, \p1\().8b, \p2\().8b, #3
  1286. ext v4.8b, \p1\().8b, \p2\().8b, #4
  1287. ext v5.8b, \p1\().8b, \p2\().8b, #5
  1288. ext v6.8b, \p1\().8b, \p2\().8b, #6
  1289. ext v7.8b, \p1\().8b, \p2\().8b, #7
  1290. uaddl v0.8h, \p1\().8b, v1.8b
  1291. uaddl v2.8h, v2.8b, v3.8b
  1292. uaddl v4.8h, v4.8b, v5.8b
  1293. uaddl v6.8h, v6.8b, v7.8b
  1294. add v0.8h, v0.8h, v2.8h
  1295. add v4.8h, v4.8h, v6.8h
  1296. add v0.8h, v0.8h, v4.8h
  1297. add v0.8h, v0.8h, \s\().8h
  1298. .endm
  1299. function integral_init8h_neon, export=1
  1300. sub x3, x0, x2, lsl #1
  1301. ld1 {v16.8b,v17.8b}, [x1], #16
  1302. 1:
  1303. subs x2, x2, #16
  1304. ld1 {v18.8h}, [x3], #16
  1305. integral8h v16, v17, v18
  1306. ld1 {v16.8b}, [x1], #8
  1307. ld1 {v18.8h}, [x3], #16
  1308. st1 {v0.8h}, [x0], #16
  1309. integral8h v17, v16, v18
  1310. ld1 {v17.8b}, [x1], #8
  1311. st1 {v0.8h}, [x0], #16
  1312. b.gt 1b
  1313. ret
  1314. endfunc
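// integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) and
// integral_init8v( uint16_t *sum8, intptr_t stride ): vertical passes that
// difference rows 4 or 8 lines apart to finish the 4x4/8x8 block sums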
  1315. function integral_init4v_neon, export=1
  1316. mov x3, x0
  1317. add x4, x0, x2, lsl #3
  1318. add x8, x0, x2, lsl #4
  1319. sub x2, x2, #8
  1320. ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48
  1321. ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48
  1322. 1:
  1323. subs x2, x2, #16
  1324. ld1 {v24.8h,v25.8h}, [x4], #32
  1325. ext v0.16b, v20.16b, v21.16b, #8
  1326. ext v1.16b, v21.16b, v22.16b, #8
  1327. ext v2.16b, v16.16b, v17.16b, #8
  1328. ext v3.16b, v17.16b, v18.16b, #8
  1329. sub v24.8h, v24.8h, v20.8h
  1330. sub v25.8h, v25.8h, v21.8h
  1331. add v0.8h, v0.8h, v20.8h
  1332. add v1.8h, v1.8h, v21.8h
  1333. add v2.8h, v2.8h, v16.8h
  1334. add v3.8h, v3.8h, v17.8h
  1335. st1 {v24.8h}, [x1], #16
  1336. st1 {v25.8h}, [x1], #16
  1337. mov v20.16b, v22.16b
  1338. mov v16.16b, v18.16b
  1339. sub v0.8h, v2.8h, v0.8h
  1340. sub v1.8h, v3.8h, v1.8h
  1341. ld1 {v21.8h,v22.8h}, [x3], #32
  1342. ld1 {v17.8h,v18.8h}, [x8], #32
  1343. st1 {v0.8h}, [x0], #16
  1344. st1 {v1.8h}, [x0], #16
  1345. b.gt 1b
  1346. 2:
  1347. ret
  1348. endfunc
  1349. function integral_init8v_neon, export=1
  1350. add x2, x0, x1, lsl #4
  1351. sub x1, x1, #8
  1352. ands x3, x1, #16 - 1
  1353. b.eq 1f
  1354. subs x1, x1, #8
  1355. ld1 {v0.8h}, [x0]
  1356. ld1 {v2.8h}, [x2], #16
  1357. sub v4.8h, v2.8h, v0.8h
  1358. st1 {v4.8h}, [x0], #16
  1359. b.le 2f
  1360. 1:
  1361. subs x1, x1, #16
  1362. ld1 {v0.8h,v1.8h}, [x0]
  1363. ld1 {v2.8h,v3.8h}, [x2], #32
  1364. sub v4.8h, v2.8h, v0.8h
  1365. sub v5.8h, v3.8h, v1.8h
  1366. st1 {v4.8h}, [x0], #16
  1367. st1 {v5.8h}, [x0], #16
  1368. b.gt 1b
  1369. 2:
  1370. ret
  1371. endfunc
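// void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
//                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )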
  1372. function mbtree_propagate_cost_neon, export=1
  1373. ld1r {v5.4s}, [x5]
  1374. 8:
  1375. subs w6, w6, #8
  1376. ld1 {v1.8h}, [x1], #16
  1377. ld1 {v2.8h}, [x2], #16
  1378. ld1 {v3.8h}, [x3], #16
  1379. ld1 {v4.8h}, [x4], #16
  1380. bic v3.8h, #0xc0, lsl #8
  1381. umin v3.8h, v2.8h, v3.8h
  1382. umull v20.4s, v2.4h, v4.4h // propagate_intra
  1383. umull2 v21.4s, v2.8h, v4.8h // propagate_intra
  1384. usubl v22.4s, v2.4h, v3.4h // propagate_num
  1385. usubl2 v23.4s, v2.8h, v3.8h // propagate_num
  1386. uxtl v26.4s, v2.4h // propagate_denom
  1387. uxtl2 v27.4s, v2.8h // propagate_denom
  1388. uxtl v24.4s, v1.4h
  1389. uxtl2 v25.4s, v1.8h
  1390. ucvtf v20.4s, v20.4s
  1391. ucvtf v21.4s, v21.4s
  1392. ucvtf v26.4s, v26.4s
  1393. ucvtf v27.4s, v27.4s
  1394. ucvtf v22.4s, v22.4s
  1395. ucvtf v23.4s, v23.4s
  1396. frecpe v28.4s, v26.4s
  1397. frecpe v29.4s, v27.4s
  1398. ucvtf v24.4s, v24.4s
  1399. ucvtf v25.4s, v25.4s
  1400. frecps v30.4s, v28.4s, v26.4s
  1401. frecps v31.4s, v29.4s, v27.4s
  1402. fmla v24.4s, v20.4s, v5.4s // propagate_amount
  1403. fmla v25.4s, v21.4s, v5.4s // propagate_amount
  1404. fmul v28.4s, v28.4s, v30.4s
  1405. fmul v29.4s, v29.4s, v31.4s
  1406. fmul v16.4s, v24.4s, v22.4s
  1407. fmul v17.4s, v25.4s, v23.4s
  1408. fmul v18.4s, v16.4s, v28.4s
  1409. fmul v19.4s, v17.4s, v29.4s
  1410. fcvtns v20.4s, v18.4s
  1411. fcvtns v21.4s, v19.4s
  1412. sqxtn v0.4h, v20.4s
  1413. sqxtn2 v0.8h, v21.4s
  1414. st1 {v0.8h}, [x0], #16
  1415. b.gt 8b
  1416. ret
  1417. endfunc
  1418. const pw_0to15, align=5
  1419. .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  1420. endconst
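// mbtree_propagate_list_internal: x0 = lowres mvs (int16 pairs), x1 = propagate_amount,
// x2 = lowres_costs, x3 = output (mb indices + bilinear weights), w4 = bipred_weight,
// w5 = mb_y, w6 = len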
  1421. function mbtree_propagate_list_internal_neon, export=1
  1422. movrel x11, pw_0to15
  1423. dup v31.8h, w4 // bipred_weight
  1424. movi v30.8h, #0xc0, lsl #8
  1425. ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
  1426. movi v28.4s, #4
  1427. movi v27.8h, #31
  1428. movi v26.8h, #32
  1429. dup v24.8h, w5 // mb_y
  1430. zip1 v29.8h, v29.8h, v24.8h
  1431. 8:
  1432. subs w6, w6, #8
  1433. ld1 {v1.8h}, [x1], #16 // propagate_amount
  1434. ld1 {v2.8h}, [x2], #16 // lowres_cost
  1435. and v2.16b, v2.16b, v30.16b
  1436. cmeq v25.8h, v2.8h, v30.8h
  1437. umull v16.4s, v1.4h, v31.4h
  1438. umull2 v17.4s, v1.8h, v31.8h
  1439. rshrn v16.4h, v16.4s, #6
  1440. rshrn2 v16.8h, v17.4s, #6
  1441. bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
  1442. // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
  1443. ld1 {v4.8h,v5.8h}, [x0], #32
  1444. sshr v6.8h, v4.8h, #5
  1445. sshr v7.8h, v5.8h, #5
  1446. add v6.8h, v6.8h, v29.8h
  1447. add v29.8h, v29.8h, v28.8h
  1448. add v7.8h, v7.8h, v29.8h
  1449. add v29.8h, v29.8h, v28.8h
  1450. st1 {v6.8h,v7.8h}, [x3], #32
  1451. and v4.16b, v4.16b, v27.16b
  1452. and v5.16b, v5.16b, v27.16b
  1453. uzp1 v6.8h, v4.8h, v5.8h // x & 31
  1454. uzp2 v7.8h, v4.8h, v5.8h // y & 31
  1455. sub v4.8h, v26.8h, v6.8h // 32 - (x & 31)
  1456. sub v5.8h, v26.8h, v7.8h // 32 - (y & 31)
  1457. mul v19.8h, v6.8h, v7.8h // idx3weight = y*x;
  1458. mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x);
  1459. mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x;
  1460. mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ;
  1461. umull v6.4s, v19.4h, v25.4h
  1462. umull2 v7.4s, v19.8h, v25.8h
  1463. umull v4.4s, v18.4h, v25.4h
  1464. umull2 v5.4s, v18.8h, v25.8h
  1465. umull v2.4s, v17.4h, v25.4h
  1466. umull2 v3.4s, v17.8h, v25.8h
  1467. umull v0.4s, v16.4h, v25.4h
  1468. umull2 v1.4s, v16.8h, v25.8h
  1469. rshrn v19.4h, v6.4s, #10
  1470. rshrn2 v19.8h, v7.4s, #10
  1471. rshrn v18.4h, v4.4s, #10
  1472. rshrn2 v18.8h, v5.4s, #10
  1473. rshrn v17.4h, v2.4s, #10
  1474. rshrn2 v17.8h, v3.4s, #10
  1475. rshrn v16.4h, v0.4s, #10
  1476. rshrn2 v16.8h, v1.4s, #10
  1477. zip1 v0.8h, v16.8h, v17.8h
  1478. zip2 v1.8h, v16.8h, v17.8h
  1479. zip1 v2.8h, v18.8h, v19.8h
  1480. zip2 v3.8h, v18.8h, v19.8h
  1481. st1 {v0.8h,v1.8h}, [x3], #32
  1482. st1 {v2.8h,v3.8h}, [x3], #32
  1483. b.ge 8b
  1484. ret
  1485. endfunc
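// memcpy_aligned: copy x2 bytes from [x1] to [x0]; x2 must be a multiple of 16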
  1486. function memcpy_aligned_neon, export=1
  1487. tst x2, #16
  1488. b.eq 32f
  1489. sub x2, x2, #16
  1490. ldr q0, [x1], #16
  1491. str q0, [x0], #16
  1492. 32:
  1493. tst x2, #32
  1494. b.eq 640f
  1495. sub x2, x2, #32
  1496. ldp q0, q1, [x1], #32
  1497. stp q0, q1, [x0], #32
  1498. 640:
  1499. cbz x2, 1f
  1500. 64:
  1501. subs x2, x2, #64
  1502. ldp q0, q1, [x1, #32]
  1503. ldp q2, q3, [x1], #64
  1504. stp q0, q1, [x0, #32]
  1505. stp q2, q3, [x0], #64
  1506. b.gt 64b
  1507. 1:
  1508. ret
  1509. endfunc
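// memzero_aligned: zero x1 bytes at [x0]; x1 must be a multiple of 128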
  1510. function memzero_aligned_neon, export=1
  1511. movi v0.16b, #0
  1512. movi v1.16b, #0
  1513. 1:
  1514. subs x1, x1, #128
  1515. stp q0, q1, [x0, #96]
  1516. stp q0, q1, [x0, #64]
  1517. stp q0, q1, [x0, #32]
  1518. stp q0, q1, [x0], 128
  1519. b.gt 1b
  1520. ret
  1521. endfunc
  1522. // void mbtree_fix8_pack( int16_t *dst, float *src, int count )
  1523. function mbtree_fix8_pack_neon, export=1
  1524. subs w3, w2, #8
  1525. b.lt 2f
  1526. 1:
  1527. subs w3, w3, #8
  1528. ld1 {v0.4s,v1.4s}, [x1], #32
  1529. fcvtzs v0.4s, v0.4s, #8
  1530. fcvtzs v1.4s, v1.4s, #8
  1531. sqxtn v2.4h, v0.4s
  1532. sqxtn2 v2.8h, v1.4s
  1533. rev16 v3.16b, v2.16b
  1534. st1 {v3.8h}, [x0], #16
  1535. b.ge 1b
  1536. 2:
  1537. adds w3, w3, #8
  1538. b.eq 4f
  1539. 3:
  1540. subs w3, w3, #1
  1541. ldr s0, [x1], #4
  1542. fcvtzs w4, s0, #8
  1543. rev16 w5, w4
  1544. strh w5, [x0], #2
  1545. b.gt 3b
  1546. 4:
  1547. ret
  1548. endfunc
  1549. // void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
  1550. function mbtree_fix8_unpack_neon, export=1
  1551. subs w3, w2, #8
  1552. b.lt 2f
  1553. 1:
  1554. subs w3, w3, #8
  1555. ld1 {v0.8h}, [x1], #16
  1556. rev16 v1.16b, v0.16b
  1557. sxtl v2.4s, v1.4h
  1558. sxtl2 v3.4s, v1.8h
  1559. scvtf v4.4s, v2.4s, #8
  1560. scvtf v5.4s, v3.4s, #8
  1561. st1 {v4.4s,v5.4s}, [x0], #32
  1562. b.ge 1b
  1563. 2:
  1564. adds w3, w3, #8
  1565. b.eq 4f
  1566. 3:
  1567. subs w3, w3, #1
  1568. ldrh w4, [x1], #2
  1569. rev16 w5, w4
  1570. sxth w6, w5
  1571. scvtf s0, w6, #8
  1572. str s0, [x0], #4
  1573. b.gt 3b
  1574. 4:
  1575. ret
  1576. endfunc