mc-a.S
  1. /*****************************************************************************
  2. * mc.S: arm motion compensation
  3. *****************************************************************************
  4. * Copyright (C) 2009-2018 x264 project
  5. *
  6. * Authors: David Conrad <lessen42@gmail.com>
  7. * Mans Rullgard <mans@mansr.com>
  8. * Stefan Groenroos <stefan.gronroos@gmail.com>
  9. * Janne Grunau <janne-x264@jannau.net>
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation; either version 2 of the License, or
  14. * (at your option) any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  24. *
  25. * This program is also available under a commercial proprietary license.
  26. * For more information, contact us at licensing@x264.com.
  27. *****************************************************************************/
  28. #include "asm.S"
  29. const pw_0to15, align=4
  30. .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  31. endconst
  32. .text
  33. // note: the prefetch functions assume a 64-byte cache line, which is true for the Cortex-A8.
  34. // They also use nothing above ARMv5TE, but we don't care about pre-ARMv6.
  35. // void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
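// Issues eight PLDs over eight consecutive rows of the reference picture, starting
// 64 bytes into each row, with an extra row offset of ((parity-1) & stride) * 8
// bytes; this is a reading of the code below rather than a documented contract.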
  36. function prefetch_ref_arm
  37. sub r2, r2, #1
  38. add r0, r0, #64
  39. and r2, r2, r1
  40. add r0, r0, r2, lsl #3
  41. add r2, r1, r1, lsl #1
  42. pld [r0]
  43. pld [r0, r1]
  44. pld [r0, r1, lsl #1]
  45. add r3, r0, r1, lsl #2
  46. pld [r0, r2]
  47. pld [r3]
  48. pld [r3, r1]
  49. pld [r3, r1, lsl #1]
  50. pld [r3, r2]
  51. bx lr
  52. endfunc
  53. // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
  54. // uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
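// Prefetches four rows of the encoded luma plane and four rows of the chroma
// plane, 64 bytes into each row, at row offsets derived from mb_x: roughly
// pix_y += 4*(mb_x & 3)*stride_y and pix_uv += 4*(mb_x & 6)*stride_uv.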
  55. function prefetch_fenc_arm
  56. ldr ip, [sp]
  57. push {lr}
  58. and lr, ip, #3
  59. smulbb lr, lr, r1 // note: this assumes stride_y is <= 16 bits signed
  60. and ip, ip, #6
  61. smulbb ip, ip, r3
  62. add r0, r0, #64
  63. add r2, r2, #64
  64. add r0, r0, lr, lsl #2
  65. pld [r0]
  66. add lr, r0, r1, lsl #1
  67. pld [r0, r1]
  68. pld [lr]
  69. add r2, r2, ip, lsl #2
  70. pld [lr, r1]
  71. pld [r2]
  72. add ip, r2, r3, lsl #1
  73. pld [r2, r3]
  74. pld [ip]
  75. pld [ip, r3]
  76. pop {pc}
  77. endfunc
  78. // void *memcpy_aligned( void *dst, const void *src, size_t n )
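// Jump-table dispatch on pointer alignment: index = (dst & 8) | ((src & 8) >> 1),
// selecting one of the four copy loops below, each specialized for a 16- or
// 8-byte-aligned dst/src combination (both pointers assumed at least 8-byte aligned).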
  79. function memcpy_aligned_neon
  80. orr r3, r0, r1, lsr #1
  81. movrel ip, memcpy_table
  82. and r3, r3, #0xc
  83. ldr pc, [ip, r3]
  84. endfunc
  85. .macro MEMCPY_ALIGNED srcalign dstalign
  86. function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0
  87. mov r3, r0
  88. .if \srcalign == 8 && \dstalign == 8
  89. sub r2, #16
  90. vld1.64 {d0}, [r1,:64]!
  91. vst1.64 {d0}, [r3,:64]!
  92. .set r1align, 128
  93. .set r3align, 128
  94. .else
  95. .set r1align, \srcalign * 8
  96. .set r3align, \dstalign * 8
  97. .endif
  98. tst r2, #16
  99. beq 32f
  100. sub r2, #16
  101. vld1.64 {d0-d1}, [r1,:r1align]!
  102. vst1.64 {d0-d1}, [r3,:r3align]!
  103. 32: // n is a multiple of 32
  104. tst r2, #32
  105. beq 640f
  106. sub r2, #32
  107. vld1.64 {d0-d3}, [r1,:r1align]!
  108. vst1.64 {d0-d3}, [r3,:r3align]!
  109. 640: // n is a multiple of 64
  110. cmp r2, #0
  111. beq 1f
  112. 64:
  113. subs r2, #64
  114. vld1.64 {d0-d3}, [r1,:r1align]!
  115. vld1.64 {d4-d7}, [r1,:r1align]!
  116. vst1.64 {d0-d3}, [r3,:r3align]!
  117. vst1.64 {d4-d7}, [r3,:r3align]!
  118. bgt 64b
  119. 1: // end
  120. .if \srcalign == 8 && \dstalign == 8
  121. vld1.64 {d0}, [r1,:64]!
  122. vst1.64 {d0}, [r3,:64]!
  123. .endif
  124. bx lr
  125. endfunc
  126. .endm
  127. MEMCPY_ALIGNED 16, 16
  128. MEMCPY_ALIGNED 16, 8
  129. MEMCPY_ALIGNED 8, 16
  130. MEMCPY_ALIGNED 8, 8
  131. const memcpy_table, align=2, relocate=1
  132. .word memcpy_aligned_16_16_neon
  133. .word memcpy_aligned_16_8_neon
  134. .word memcpy_aligned_8_16_neon
  135. .word memcpy_aligned_8_8_neon
  136. endconst
  137. .text
  138. .ltorg
  139. // void memzero_aligned( void *dst, size_t n )
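// Clears 128 bytes per iteration (four 32-byte stores), so n is assumed to be a
// positive multiple of 128.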
  140. function memzero_aligned_neon
  141. vmov.i8 q0, #0
  142. vmov.i8 q1, #0
  143. memzero_loop:
  144. subs r1, #128
  145. .rept 4
  146. vst1.64 {d0-d3}, [r0,:128]!
  147. .endr
  148. bgt memzero_loop
  149. bx lr
  150. endfunc
  151. // void pixel_avg( uint8_t *dst, intptr_t dst_stride,
  152. // uint8_t *src1, intptr_t src1_stride,
  153. // uint8_t *src2, intptr_t src2_stride, int weight );
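// Weight dispatch (see AVGH below): weight == 32 takes the plain rounding-average
// path (vrhadd); 0 < weight < 64 uses two non-negative weights (weight, 64-weight);
// weight > 64 or weight < 0 negates one of the two, hence the add_sub / sub_add
// variants. Every weighted path computes, per pixel, roughly
//   dst = clip255( (src1*w1 + src2*w2 + 32) >> 6 )   with w1 + w2 == 64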
  154. .macro AVGH w h
  155. function pixel_avg_\w\()x\h\()_neon
  156. ldr ip, [sp, #8]
  157. push {r4-r6,lr}
  158. cmp ip, #32
  159. ldrd r4, r5, [sp, #16]
  160. mov lr, #\h
  161. beq pixel_avg_w\w\()_neon
  162. rsbs r6, ip, #64
  163. blt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
  164. cmp ip, #0
  165. bge pixel_avg_weight_w\w\()_add_add_neon
  166. b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
  167. endfunc
  168. .endm
  169. AVGH 4, 2
  170. AVGH 4, 4
  171. AVGH 4, 8
  172. AVGH 4, 16
  173. AVGH 8, 4
  174. AVGH 8, 8
  175. AVGH 8, 16
  176. AVGH 16, 8
  177. AVGH 16, 16
  178. // 0 < weight < 64
  179. .macro load_weights_add_add
  180. vdup.8 d30, ip
  181. vdup.8 d31, r6
  182. .endm
  183. .macro load_add_add d1 d2
  184. vld1.32 {\d1}, [r2], r3
  185. vld1.32 {\d2}, [r4], r5
  186. .endm
  187. .macro weight_add_add dst s1 s2
  188. vmull.u8 \dst, \s1, d30
  189. vmlal.u8 \dst, \s2, d31
  190. .endm
  191. // weight > 64
  192. .macro load_weights_add_sub
  193. rsb r6, #0
  194. vdup.8 d30, ip
  195. vdup.8 d31, r6
  196. .endm
  197. .macro load_add_sub d1 d2
  198. vld1.32 {\d1}, [r2], r3
  199. vld1.32 {\d2}, [r4], r5
  200. .endm
  201. .macro weight_add_sub dst s1 s2
  202. vmull.u8 \dst, \s1, d30
  203. vmlsl.u8 \dst, \s2, d31
  204. .endm
  205. // weight < 0
  206. .macro load_weights_sub_add
  207. rsb ip, #0
  208. vdup.8 d31, r6
  209. vdup.8 d30, ip
  210. .endm
  211. .macro load_sub_add d1 d2
  212. vld1.32 {\d2}, [r4], r5
  213. vld1.32 {\d1}, [r2], r3
  214. .endm
  215. .macro weight_sub_add dst s1 s2
  216. vmull.u8 \dst, \s2, d31
  217. vmlsl.u8 \dst, \s1, d30
  218. .endm
  219. .macro AVG_WEIGHT ext
  220. function pixel_avg_weight_w4_\ext\()_neon, export=0
  221. load_weights_\ext
  222. 1: // height loop
  223. subs lr, lr, #2
  224. load_\ext d0[], d1[]
  225. weight_\ext q8, d0, d1
  226. load_\ext d2[], d3[]
  227. vqrshrun.s16 d0, q8, #6
  228. weight_\ext q9, d2, d3
  229. vst1.32 {d0[0]}, [r0,:32], r1
  230. vqrshrun.s16 d1, q9, #6
  231. vst1.32 {d1[0]}, [r0,:32], r1
  232. bgt 1b
  233. pop {r4-r6,pc}
  234. endfunc
  235. function pixel_avg_weight_w8_\ext\()_neon, export=0
  236. load_weights_\ext
  237. 1: // height loop
  238. subs lr, lr, #4
  239. load_\ext d0, d1
  240. weight_\ext q8, d0, d1
  241. load_\ext d2, d3
  242. weight_\ext q9, d2, d3
  243. load_\ext d4, d5
  244. weight_\ext q10, d4, d5
  245. load_\ext d6, d7
  246. weight_\ext q11, d6, d7
  247. vqrshrun.s16 d0, q8, #6
  248. vqrshrun.s16 d1, q9, #6
  249. vqrshrun.s16 d2, q10, #6
  250. vqrshrun.s16 d3, q11, #6
  251. vst1.64 {d0}, [r0,:64], r1
  252. vst1.64 {d1}, [r0,:64], r1
  253. vst1.64 {d2}, [r0,:64], r1
  254. vst1.64 {d3}, [r0,:64], r1
  255. bgt 1b
  256. pop {r4-r6,pc}
  257. endfunc
  258. function pixel_avg_weight_w16_\ext\()_neon, export=0
  259. load_weights_\ext
  260. 1: // height loop
  261. subs lr, lr, #2
  262. load_\ext d0-d1, d2-d3
  263. weight_\ext q8, d0, d2
  264. weight_\ext q9, d1, d3
  265. load_\ext d4-d5, d6-d7
  266. weight_\ext q10, d4, d6
  267. weight_\ext q11, d5, d7
  268. vqrshrun.s16 d0, q8, #6
  269. vqrshrun.s16 d1, q9, #6
  270. vqrshrun.s16 d2, q10, #6
  271. vqrshrun.s16 d3, q11, #6
  272. vst1.64 {d0-d1}, [r0,:128], r1
  273. vst1.64 {d2-d3}, [r0,:128], r1
  274. bgt 1b
  275. pop {r4-r6,pc}
  276. endfunc
  277. .endm
  278. AVG_WEIGHT add_add
  279. AVG_WEIGHT add_sub
  280. AVG_WEIGHT sub_add
  281. function pixel_avg_w4_neon, export=0
  282. subs lr, lr, #2
  283. vld1.32 {d0[]}, [r2], r3
  284. vld1.32 {d2[]}, [r4], r5
  285. vrhadd.u8 d0, d0, d2
  286. vld1.32 {d1[]}, [r2], r3
  287. vld1.32 {d3[]}, [r4], r5
  288. vrhadd.u8 d1, d1, d3
  289. vst1.32 {d0[0]}, [r0,:32], r1
  290. vst1.32 {d1[0]}, [r0,:32], r1
  291. bgt pixel_avg_w4_neon
  292. pop {r4-r6,pc}
  293. endfunc
  294. function pixel_avg_w8_neon, export=0
  295. subs lr, lr, #4
  296. vld1.64 {d0}, [r2], r3
  297. vld1.64 {d2}, [r4], r5
  298. vrhadd.u8 d0, d0, d2
  299. vld1.64 {d1}, [r2], r3
  300. vld1.64 {d3}, [r4], r5
  301. vrhadd.u8 d1, d1, d3
  302. vst1.64 {d0}, [r0,:64], r1
  303. vld1.64 {d2}, [r2], r3
  304. vld1.64 {d4}, [r4], r5
  305. vrhadd.u8 d2, d2, d4
  306. vst1.64 {d1}, [r0,:64], r1
  307. vld1.64 {d3}, [r2], r3
  308. vld1.64 {d5}, [r4], r5
  309. vrhadd.u8 d3, d3, d5
  310. vst1.64 {d2}, [r0,:64], r1
  311. vst1.64 {d3}, [r0,:64], r1
  312. bgt pixel_avg_w8_neon
  313. pop {r4-r6,pc}
  314. endfunc
  315. function pixel_avg_w16_neon, export=0
  316. subs lr, lr, #4
  317. vld1.64 {d0-d1}, [r2], r3
  318. vld1.64 {d2-d3}, [r4], r5
  319. vrhadd.u8 q0, q0, q1
  320. vld1.64 {d2-d3}, [r2], r3
  321. vld1.64 {d4-d5}, [r4], r5
  322. vrhadd.u8 q1, q1, q2
  323. vst1.64 {d0-d1}, [r0,:128], r1
  324. vld1.64 {d4-d5}, [r2], r3
  325. vld1.64 {d6-d7}, [r4], r5
  326. vrhadd.u8 q2, q2, q3
  327. vst1.64 {d2-d3}, [r0,:128], r1
  328. vld1.64 {d6-d7}, [r2], r3
  329. vld1.64 {d0-d1}, [r4], r5
  330. vrhadd.u8 q3, q3, q0
  331. vst1.64 {d4-d5}, [r0,:128], r1
  332. vst1.64 {d6-d7}, [r0,:128], r1
  333. bgt pixel_avg_w16_neon
  334. pop {r4-r6,pc}
  335. endfunc
  336. function pixel_avg2_w4_neon
  337. ldr ip, [sp, #4]
  338. push {lr}
  339. ldr lr, [sp, #4]
  340. avg2_w4_loop:
  341. subs ip, ip, #2
  342. vld1.32 {d0[]}, [r2], r3
  343. vld1.32 {d2[]}, [lr], r3
  344. vrhadd.u8 d0, d0, d2
  345. vld1.32 {d1[]}, [r2], r3
  346. vld1.32 {d3[]}, [lr], r3
  347. vrhadd.u8 d1, d1, d3
  348. vst1.32 {d0[0]}, [r0,:32], r1
  349. vst1.32 {d1[0]}, [r0,:32], r1
  350. bgt avg2_w4_loop
  351. pop {pc}
  352. endfunc
  353. function pixel_avg2_w8_neon
  354. ldr ip, [sp, #4]
  355. push {lr}
  356. ldr lr, [sp, #4]
  357. avg2_w8_loop:
  358. subs ip, ip, #2
  359. vld1.64 {d0}, [r2], r3
  360. vld1.64 {d2}, [lr], r3
  361. vrhadd.u8 d0, d0, d2
  362. vld1.64 {d1}, [r2], r3
  363. vld1.64 {d3}, [lr], r3
  364. vrhadd.u8 d1, d1, d3
  365. vst1.64 {d0}, [r0,:64], r1
  366. vst1.64 {d1}, [r0,:64], r1
  367. bgt avg2_w8_loop
  368. pop {pc}
  369. endfunc
  370. function pixel_avg2_w16_neon
  371. ldr ip, [sp, #4]
  372. push {lr}
  373. ldr lr, [sp, #4]
  374. avg2_w16_loop:
  375. subs ip, ip, #2
  376. vld1.64 {d0-d1}, [r2], r3
  377. vld1.64 {d2-d3}, [lr], r3
  378. vrhadd.u8 q0, q0, q1
  379. vld1.64 {d4-d5}, [r2], r3
  380. vld1.64 {d6-d7}, [lr], r3
  381. vrhadd.u8 q2, q2, q3
  382. vst1.64 {d0-d1}, [r0,:128], r1
  383. vst1.64 {d4-d5}, [r0,:128], r1
  384. bgt avg2_w16_loop
  385. pop {pc}
  386. endfunc
  387. function pixel_avg2_w20_neon
  388. ldr ip, [sp, #4]
  389. push {lr}
  390. sub r1, r1, #16
  391. ldr lr, [sp, #4]
  392. avg2_w20_loop:
  393. subs ip, ip, #2
  394. vld1.64 {d0-d2}, [r2], r3
  395. vld1.64 {d4-d6}, [lr], r3
  396. vrhadd.u8 q0, q0, q2
  397. vrhadd.u8 d2, d2, d6
  398. vld1.64 {d4-d6}, [r2], r3
  399. vld1.64 {d16-d18},[lr], r3
  400. vrhadd.u8 q2, q2, q8
  401. vst1.64 {d0-d1}, [r0,:128]!
  402. vrhadd.u8 d6, d6, d18
  403. vst1.32 {d2[0]}, [r0,:32], r1
  404. vst1.64 {d4-d5}, [r0,:128]!
  405. vst1.32 {d6[0]}, [r0,:32], r1
  406. bgt avg2_w20_loop
  407. pop {pc}
  408. endfunc
  409. .macro weight_prologue type
  410. push {r4-r5,lr}
  411. ldr r4, [sp, #4*3] // weight_t
  412. ldr ip, [sp, #4*3+4] // h
  413. .ifc \type, full
  414. ldr lr, [r4, #32] // denom
  415. .endif
  416. ldrd r4, r5, [r4, #32+4] // scale, offset
  417. vdup.8 d0, r4
  418. vdup.16 q1, r5
  419. .ifc \type, full
  420. rsb lr, lr, #0
  421. vdup.16 q2, lr
  422. .endif
  423. .endm
  424. // void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride,
  425. // const x264_weight_t *weight, int height )
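// Explicit weighted prediction; per pixel, roughly
//   dst = clip255( ((src*scale + (1 << (denom-1))) >> denom) + offset )
// The _nodenom variants below skip the shift and compute offset + src*scale
// directly, and the offsetadd/offsetsub variants apply only a saturating +/- offset.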
  426. function mc_weight_w20_neon
  427. weight_prologue full
  428. sub r1, #16
  429. weight20_loop:
  430. subs ip, #2
  431. vld1.8 {d17-d19}, [r2], r3
  432. vmull.u8 q10, d17, d0
  433. vmull.u8 q11, d18, d0
  434. vld1.8 {d16-d18}, [r2], r3
  435. vmull.u8 q12, d16, d0
  436. vmull.u8 q13, d17, d0
  437. vtrn.32 d19, d18
  438. vmull.u8 q14, d19, d0
  439. vrshl.s16 q10, q10, q2
  440. vrshl.s16 q11, q11, q2
  441. vrshl.s16 q12, q12, q2
  442. vrshl.s16 q13, q13, q2
  443. vrshl.s16 q14, q14, q2
  444. vadd.s16 q10, q10, q1
  445. vadd.s16 q11, q11, q1
  446. vadd.s16 q12, q12, q1
  447. vadd.s16 q13, q13, q1
  448. vadd.s16 q14, q14, q1
  449. vqmovun.s16 d16, q10
  450. vqmovun.s16 d17, q11
  451. vqmovun.s16 d18, q12
  452. vqmovun.s16 d19, q13
  453. vqmovun.s16 d20, q14
  454. vst1.8 {d16-d17}, [r0,:128]!
  455. vst1.32 {d20[0]}, [r0,:32], r1
  456. vst1.8 {d18-d19}, [r0,:128]!
  457. vst1.32 {d20[1]}, [r0,:32], r1
  458. bgt weight20_loop
  459. pop {r4-r5,pc}
  460. endfunc
  461. function mc_weight_w16_neon
  462. weight_prologue full
  463. weight16_loop:
  464. subs ip, #2
  465. vld1.8 {d16-d17}, [r2], r3
  466. vld1.8 {d18-d19}, [r2], r3
  467. vmull.u8 q10, d16, d0
  468. vmull.u8 q11, d17, d0
  469. vmull.u8 q12, d18, d0
  470. vmull.u8 q13, d19, d0
  471. vrshl.s16 q10, q10, q2
  472. vrshl.s16 q11, q11, q2
  473. vrshl.s16 q12, q12, q2
  474. vrshl.s16 q13, q13, q2
  475. vadd.s16 q10, q10, q1
  476. vadd.s16 q11, q11, q1
  477. vadd.s16 q12, q12, q1
  478. vadd.s16 q13, q13, q1
  479. vqmovun.s16 d16, q10
  480. vqmovun.s16 d17, q11
  481. vqmovun.s16 d18, q12
  482. vqmovun.s16 d19, q13
  483. vst1.8 {d16-d17}, [r0,:128], r1
  484. vst1.8 {d18-d19}, [r0,:128], r1
  485. bgt weight16_loop
  486. pop {r4-r5,pc}
  487. endfunc
  488. function mc_weight_w8_neon
  489. weight_prologue full
  490. weight8_loop:
  491. subs ip, #2
  492. vld1.8 {d16}, [r2], r3
  493. vld1.8 {d18}, [r2], r3
  494. vmull.u8 q8, d16, d0
  495. vmull.u8 q9, d18, d0
  496. vrshl.s16 q8, q8, q2
  497. vrshl.s16 q9, q9, q2
  498. vadd.s16 q8, q8, q1
  499. vadd.s16 q9, q9, q1
  500. vqmovun.s16 d16, q8
  501. vqmovun.s16 d18, q9
  502. vst1.8 {d16}, [r0,:64], r1
  503. vst1.8 {d18}, [r0,:64], r1
  504. bgt weight8_loop
  505. pop {r4-r5,pc}
  506. endfunc
  507. function mc_weight_w4_neon
  508. weight_prologue full
  509. weight4_loop:
  510. subs ip, #2
  511. vld1.32 {d16[0]}, [r2], r3
  512. vld1.32 {d16[1]}, [r2], r3
  513. vmull.u8 q8, d16, d0
  514. vrshl.s16 q8, q8, q2
  515. vadd.s16 q8, q8, q1
  516. vqmovun.s16 d16, q8
  517. vst1.32 {d16[0]}, [r0], r1
  518. vst1.32 {d16[1]}, [r0], r1
  519. bgt weight4_loop
  520. pop {r4-r5,pc}
  521. endfunc
  522. function mc_weight_w20_nodenom_neon
  523. weight_prologue nodenom
  524. sub r1, #16
  525. weight20_nodenom_loop:
  526. subs ip, #2
  527. vld1.8 {d26-d28}, [r2], r3
  528. vmov q8, q1
  529. vmov q9, q1
  530. vld1.8 {d29-d31}, [r2], r3
  531. vmov q10, q1
  532. vmov q11, q1
  533. vmov q12, q1
  534. vtrn.32 d28, d31
  535. vmlal.u8 q8, d26, d0
  536. vmlal.u8 q9, d27, d0
  537. vmlal.u8 q10, d29, d0
  538. vmlal.u8 q11, d30, d0
  539. vmlal.u8 q12, d28, d0
  540. vqmovun.s16 d16, q8
  541. vqmovun.s16 d17, q9
  542. vqmovun.s16 d18, q10
  543. vqmovun.s16 d19, q11
  544. vqmovun.s16 d20, q12
  545. vst1.8 {d16-d17}, [r0,:128]!
  546. vst1.32 {d20[0]}, [r0,:32], r1
  547. vst1.8 {d18-d19}, [r0,:128]!
  548. vst1.32 {d20[1]}, [r0,:32], r1
  549. bgt weight20_nodenom_loop
  550. pop {r4-r5,pc}
  551. endfunc
  552. function mc_weight_w16_nodenom_neon
  553. weight_prologue nodenom
  554. weight16_nodenom_loop:
  555. subs ip, #2
  556. vld1.8 {d16-d17}, [r2], r3
  557. vld1.8 {d18-d19}, [r2], r3
  558. vmov q12, q1
  559. vmov q13, q1
  560. vmov q14, q1
  561. vmov q15, q1
  562. vmlal.u8 q12, d16, d0
  563. vmlal.u8 q13, d17, d0
  564. vmlal.u8 q14, d18, d0
  565. vmlal.u8 q15, d19, d0
  566. vqmovun.s16 d16, q12
  567. vqmovun.s16 d17, q13
  568. vqmovun.s16 d18, q14
  569. vqmovun.s16 d19, q15
  570. vst1.8 {d16-d17}, [r0,:128], r1
  571. vst1.8 {d18-d19}, [r0,:128], r1
  572. bgt weight16_nodenom_loop
  573. pop {r4-r5,pc}
  574. endfunc
  575. function mc_weight_w8_nodenom_neon
  576. weight_prologue nodenom
  577. weight8_nodenom_loop:
  578. subs ip, #2
  579. vld1.8 {d16}, [r2], r3
  580. vld1.8 {d18}, [r2], r3
  581. vmov q10, q1
  582. vmov q11, q1
  583. vmlal.u8 q10, d16, d0
  584. vmlal.u8 q11, d18, d0
  585. vqmovun.s16 d16, q10
  586. vqmovun.s16 d17, q11
  587. vst1.8 {d16}, [r0,:64], r1
  588. vst1.8 {d17}, [r0,:64], r1
  589. bgt weight8_nodenom_loop
  590. pop {r4-r5,pc}
  591. endfunc
  592. function mc_weight_w4_nodenom_neon
  593. weight_prologue nodenom
  594. weight4_nodenom_loop:
  595. subs ip, #2
  596. vld1.32 {d16[0]}, [r2], r3
  597. vld1.32 {d16[1]}, [r2], r3
  598. vmov q10, q1
  599. vmlal.u8 q10, d16, d0
  600. vqmovun.s16 d16, q10
  601. vst1.32 {d16[0]}, [r0], r1
  602. vst1.32 {d16[1]}, [r0], r1
  603. bgt weight4_nodenom_loop
  604. pop {r4-r5,pc}
  605. endfunc
  606. .macro weight_simple_prologue
  607. push {lr}
  608. ldr lr, [sp, #4] // weight_t
  609. ldr ip, [sp, #8] // h
  610. ldr lr, [lr] // offset
  611. vdup.8 q1, lr
  612. .endm
  613. .macro weight_simple name op
  614. function mc_weight_w20_\name\()_neon
  615. weight_simple_prologue
  616. weight20_\name\()_loop:
  617. subs ip, #2
  618. vld1.8 {d16-d18}, [r2], r3
  619. vld1.8 {d19-d21}, [r2], r3
  620. \op q8, q8, q1
  621. \op q9, q9, q1
  622. \op q10, q10, q1
  623. vst1.8 {d16-d18}, [r0,:64], r1
  624. vst1.8 {d19-d21}, [r0,:64], r1
  625. bgt weight20_\name\()_loop
  626. pop {pc}
  627. endfunc
  628. function mc_weight_w16_\name\()_neon
  629. weight_simple_prologue
  630. weight16_\name\()_loop:
  631. subs ip, #2
  632. vld1.8 {d16-d17}, [r2], r3
  633. vld1.8 {d18-d19}, [r2], r3
  634. \op q8, q8, q1
  635. \op q9, q9, q1
  636. vst1.8 {d16-d17}, [r0,:128], r1
  637. vst1.8 {d18-d19}, [r0,:128], r1
  638. bgt weight16_\name\()_loop
  639. pop {pc}
  640. endfunc
  641. function mc_weight_w8_\name\()_neon
  642. weight_simple_prologue
  643. weight8_\name\()_loop:
  644. subs ip, #2
  645. vld1.8 {d16}, [r2], r3
  646. vld1.8 {d17}, [r2], r3
  647. \op q8, q8, q1
  648. vst1.8 {d16}, [r0,:64], r1
  649. vst1.8 {d17}, [r0,:64], r1
  650. bgt weight8_\name\()_loop
  651. pop {pc}
  652. endfunc
  653. function mc_weight_w4_\name\()_neon
  654. weight_simple_prologue
  655. weight4_\name\()_loop:
  656. subs ip, #2
  657. vld1.32 {d16[]}, [r2], r3
  658. vld1.32 {d17[]}, [r2], r3
  659. \op q8, q8, q1
  660. vst1.32 {d16[0]}, [r0], r1
  661. vst1.32 {d17[0]}, [r0], r1
  662. bgt weight4_\name\()_loop
  663. pop {pc}
  664. endfunc
  665. .endm
  666. weight_simple offsetadd, vqadd.u8
  667. weight_simple offsetsub, vqsub.u8
  668. // void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
  669. function mc_copy_w4_neon
  670. ldr ip, [sp]
  671. copy_w4_loop:
  672. subs ip, ip, #4
  673. vld1.32 {d0[]}, [r2], r3
  674. vld1.32 {d1[]}, [r2], r3
  675. vld1.32 {d2[]}, [r2], r3
  676. vld1.32 {d3[]}, [r2], r3
  677. vst1.32 {d0[0]}, [r0,:32], r1
  678. vst1.32 {d1[0]}, [r0,:32], r1
  679. vst1.32 {d2[0]}, [r0,:32], r1
  680. vst1.32 {d3[0]}, [r0,:32], r1
  681. bgt copy_w4_loop
  682. bx lr
  683. endfunc
  684. function mc_copy_w8_neon
  685. ldr ip, [sp]
  686. copy_w8_loop:
  687. subs ip, ip, #4
  688. vld1.32 {d0}, [r2], r3
  689. vld1.32 {d1}, [r2], r3
  690. vld1.32 {d2}, [r2], r3
  691. vld1.32 {d3}, [r2], r3
  692. vst1.32 {d0}, [r0,:64], r1
  693. vst1.32 {d1}, [r0,:64], r1
  694. vst1.32 {d2}, [r0,:64], r1
  695. vst1.32 {d3}, [r0,:64], r1
  696. bgt copy_w8_loop
  697. bx lr
  698. endfunc
  699. function mc_copy_w16_neon
  700. ldr ip, [sp]
  701. copy_w16_loop:
  702. subs ip, ip, #4
  703. vld1.32 {d0-d1}, [r2], r3
  704. vld1.32 {d2-d3}, [r2], r3
  705. vld1.32 {d4-d5}, [r2], r3
  706. vld1.32 {d6-d7}, [r2], r3
  707. vst1.32 {d0-d1}, [r0,:128], r1
  708. vst1.32 {d2-d3}, [r0,:128], r1
  709. vst1.32 {d4-d5}, [r0,:128], r1
  710. vst1.32 {d6-d7}, [r0,:128], r1
  711. bgt copy_w16_loop
  712. bx lr
  713. endfunc
  714. function mc_copy_w16_aligned_neon
  715. ldr ip, [sp]
  716. copy_w16_aligned_loop:
  717. subs ip, ip, #4
  718. vld1.32 {d0-d1}, [r2,:128], r3
  719. vld1.32 {d2-d3}, [r2,:128], r3
  720. vld1.32 {d4-d5}, [r2,:128], r3
  721. vld1.32 {d6-d7}, [r2,:128], r3
  722. vst1.32 {d0-d1}, [r0,:128], r1
  723. vst1.32 {d2-d3}, [r0,:128], r1
  724. vst1.32 {d4-d5}, [r0,:128], r1
  725. vst1.32 {d6-d7}, [r0,:128], r1
  726. bgt copy_w16_aligned_loop
  727. bx lr
  728. endfunc
  729. // void mc_chroma( uint8_t *dst_u, uint8_t *dst_v, intptr_t i_dst_stride,
  730. //                 uint8_t *src, intptr_t i_src_stride,
  731. //                 int dx, int dy, int i_width, int i_height );
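// Bilinear chroma interpolation on interleaved (NV12-style) U/V; CHROMA_MC_START
// derives the weights from the fractional offsets dx, dy:
//   cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy
//   out = ( cA*p00 + cB*p01 + cC*p10 + cD*p11 + 32 ) >> 6
// The dx == 0 / dy == 0 cases branch to simpler one-dimensional loops (labels 2-5).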
  732. function mc_chroma_neon
  733. push {r4-r8, lr}
  734. vpush {d8-d11}
  735. ldrd r4, r5, [sp, #56]
  736. ldrd r6, r7, [sp, #64]
  737. asr lr, r6, #3
  738. mul lr, r4, lr
  739. add r3, r3, r5, asr #2
  740. cmp r7, #4
  741. and r5, r5, #7
  742. and r6, r6, #7
  743. add r3, r3, lr
  744. bic r3, r3, #0x1
  745. pld [r3]
  746. pld [r3, r4]
  747. bgt mc_chroma_w8
  748. beq mc_chroma_w4
  749. .macro CHROMA_MC_START r00, r01, r10, r11
  750. muls lr, r5, r6
  751. rsb r7, lr, r6, lsl #3
  752. rsb ip, lr, r5, lsl #3
  753. sub r5, lr, r5, lsl #3
  754. sub r5, r5, r6, lsl #3
  755. add r5, r5, #64
  756. beq 2f
  757. vld2.8 {\r00-\r01}, [r3], r4
  758. vdup.8 d0, r5
  759. vdup.8 d1, ip
  760. vdup.8 d2, r7
  761. vld2.8 {\r10-\r11}, [r3], r4
  762. vdup.8 d3, lr
  763. ldr r5, [sp, #72]
  764. .endm
  765. .macro CHROMA_MC width, align
  766. mc_chroma_w\width:
  767. CHROMA_MC_START d4, d5, d8, d9
  768. vext.8 d6, d4, d6, #1
  769. vext.8 d7, d5, d7, #1
  770. vext.8 d10, d8, d10, #1
  771. vext.8 d11, d9, d11, #1
  772. // since the element size varies, there's a different index for the 2nd store
  773. .if \width == 4
  774. .set st2, 1
  775. .else
  776. .set st2, 2
  777. .endif
  778. vtrn.32 d4, d6
  779. vtrn.32 d5, d7
  780. vtrn.32 d8, d10
  781. vtrn.32 d9, d11
  782. vtrn.32 d0, d1
  783. vtrn.32 d2, d3
  784. 1: // height loop, interpolate xy
  785. vmull.u8 q8, d4, d0
  786. vmlal.u8 q8, d8, d2
  787. vmull.u8 q9, d5, d0
  788. vmlal.u8 q9, d9, d2
  789. vld2.8 {d4-d5}, [r3], r4
  790. vext.8 d6, d4, d6, #1
  791. vext.8 d7, d5, d7, #1
  792. vadd.i16 d16, d16, d17
  793. vadd.i16 d17, d18, d19
  794. vtrn.32 d4, d6
  795. vtrn.32 d5, d7
  796. vmull.u8 q10, d8, d0
  797. vmlal.u8 q10, d4, d2
  798. vmull.u8 q11, d9, d0
  799. vmlal.u8 q11, d5, d2
  800. vld2.8 {d8-d9}, [r3], r4
  801. vrshrn.u16 d16, q8, #6
  802. vext.8 d10, d8, d10, #1
  803. vext.8 d11, d9, d11, #1
  804. vadd.i16 d18, d20, d21
  805. vadd.i16 d19, d22, d23
  806. vtrn.32 d8, d10
  807. vtrn.32 d9, d11
  808. vrshrn.u16 d18, q9, #6
  809. subs r5, r5, #2
  810. pld [r3]
  811. pld [r3, r4]
  812. vst1.\align {d16[0]}, [r0,:\align], r2
  813. vst1.\align {d16[st2]}, [r1,:\align], r2
  814. vst1.\align {d18[0]}, [r0,:\align], r2
  815. vst1.\align {d18[st2]}, [r1,:\align], r2
  816. bgt 1b
  817. vpop {d8-d11}
  818. pop {r4-r8, pc}
  819. 2: // dx or dy is 0
  820. tst r7, r7
  821. add ip, ip, r7
  822. vdup.8 d0, r5
  823. ldr r5, [sp, #72]
  824. vdup.8 d1, ip
  825. beq 4f
  826. vld1.64 {d4}, [r3], r4
  827. vld1.64 {d6}, [r3], r4
  828. 3: // vertical interpolation loop
  829. vmull.u8 q8, d4, d0
  830. vmlal.u8 q8, d6, d1
  831. vmull.u8 q9, d6, d0
  832. vld1.64 {d4}, [r3], r4
  833. vmlal.u8 q9, d4, d1
  834. vld1.64 {d6}, [r3], r4
  835. vrshrn.u16 d16, q8, #6 // uvuvuvuv
  836. vrshrn.u16 d17, q9, #6 // uvuvuvuv
  837. subs r5, r5, #2
  838. vuzp.8 d16, d17 // d16=uuuu|uuuu, d17=vvvv|vvvv
  839. pld [r3]
  840. pld [r3, r4]
  841. vst1.\align {d16[0]}, [r0,:\align], r2
  842. vst1.\align {d16[st2]}, [r0,:\align], r2
  843. vst1.\align {d17[0]}, [r1,:\align], r2
  844. vst1.\align {d17[st2]}, [r1,:\align], r2
  845. bgt 3b
  846. vpop {d8-d11}
  847. pop {r4-r8, pc}
  848. 4: // dy is 0
  849. vld1.64 {d4-d5}, [r3], r4
  850. vld1.64 {d6-d7}, [r3], r4
  851. vext.8 d5, d4, d5, #2
  852. vext.8 d7, d6, d7, #2
  853. 5: // horizontal interpolation loop
  854. vmull.u8 q8, d4, d0
  855. vmlal.u8 q8, d5, d1
  856. vmull.u8 q9, d6, d0
  857. vmlal.u8 q9, d7, d1
  858. subs r5, r5, #2
  859. vld1.64 {d4-d5}, [r3], r4
  860. vld1.64 {d6-d7}, [r3], r4
  861. vext.8 d5, d4, d5, #2
  862. vrshrn.u16 d16, q8, #6
  863. vrshrn.u16 d17, q9, #6
  864. vext.8 d7, d6, d7, #2
  865. vuzp.8 d16, d17
  866. pld [r3]
  867. pld [r3, r4]
  868. vst1.\align {d16[0]}, [r0,:\align], r2
  869. vst1.\align {d16[st2]}, [r0,:\align], r2
  870. vst1.\align {d17[0]}, [r1,:\align], r2
  871. vst1.\align {d17[st2]}, [r1,:\align], r2
  872. bgt 5b
  873. vpop {d8-d11}
  874. pop {r4-r8, pc}
  875. .endm
  876. CHROMA_MC 2, 16
  877. CHROMA_MC 4, 32
  878. mc_chroma_w8:
  879. CHROMA_MC_START d4, d7, d8, d11
  880. vext.8 d5, d4, d5, #1
  881. vext.8 d9, d8, d9, #1
  882. vext.8 d7, d6, d7, #1
  883. vext.8 d11, d10, d11, #1
  884. 1: // height loop, interpolate xy
  885. vmull.u8 q8, d4, d0
  886. vmlal.u8 q8, d5, d1
  887. vmlal.u8 q8, d8, d2
  888. vmlal.u8 q8, d9, d3
  889. vmull.u8 q9, d6, d0
  890. vmlal.u8 q9, d7, d1
  891. vmlal.u8 q9, d10, d2
  892. vmlal.u8 q9, d11, d3
  893. vld2.8 {d4-d7}, [r3], r4
  894. vext.8 d5, d4, d5, #1
  895. vext.8 d7, d6, d7, #1
  896. vmull.u8 q10, d8, d0
  897. vmlal.u8 q10, d9, d1
  898. vmlal.u8 q10, d4, d2
  899. vmlal.u8 q10, d5, d3
  900. vmull.u8 q11, d10, d0
  901. vmlal.u8 q11, d11, d1
  902. vmlal.u8 q11, d6, d2
  903. vmlal.u8 q11, d7, d3
  904. subs r5, r5, #2
  905. vld2.8 {d8-d11}, [r3], r4
  906. vrshrn.u16 d16, q8, #6
  907. vrshrn.u16 d17, q9, #6
  908. vrshrn.u16 d18, q10, #6
  909. vext.8 d9, d8, d9, #1
  910. vrshrn.u16 d19, q11, #6
  911. vext.8 d11, d10, d11, #1
  912. pld [r3]
  913. pld [r3, r4]
  914. vst1.64 {d16}, [r0,:64], r2
  915. vst1.64 {d17}, [r1,:64], r2
  916. vst1.64 {d18}, [r0,:64], r2
  917. vst1.64 {d19}, [r1,:64], r2
  918. bgt 1b
  919. vpop {d8-d11}
  920. pop {r4-r8, pc}
  921. 2: // dx or dy is 0
  922. tst r7, r7
  923. add ip, ip, r7
  924. vdup.8 d0, r5
  925. ldr r5, [sp, #72]
  926. vdup.8 d1, ip
  927. beq 4f
  928. vld2.8 {d4-d5}, [r3], r4
  929. vld2.8 {d6-d7}, [r3], r4
  930. 3: // vertical interpolation loop
  931. vmull.u8 q8, d4, d0 //U
  932. vmlal.u8 q8, d6, d1
  933. vmull.u8 q9, d5, d0 //V
  934. vmlal.u8 q9, d7, d1
  935. vld2.8 {d4-d5}, [r3], r4
  936. vmull.u8 q10, d6, d0
  937. vmlal.u8 q10, d4, d1
  938. vmull.u8 q11, d7, d0
  939. vmlal.u8 q11, d5, d1
  940. vld2.8 {d6-d7}, [r3], r4
  941. vrshrn.u16 d16, q8, #6
  942. vrshrn.u16 d17, q9, #6
  943. vrshrn.u16 d18, q10, #6
  944. vrshrn.u16 d19, q11, #6
  945. subs r5, r5, #2
  946. pld [r3]
  947. pld [r3, r4]
  948. vst1.64 {d16}, [r0,:64], r2
  949. vst1.64 {d17}, [r1,:64], r2
  950. vst1.64 {d18}, [r0,:64], r2
  951. vst1.64 {d19}, [r1,:64], r2
  952. bgt 3b
  953. vpop {d8-d11}
  954. pop {r4-r8, pc}
  955. 4: // dy is 0
  956. vld2.8 {d4-d7}, [r3], r4
  957. vld2.8 {d8-d11}, [r3], r4
  958. vext.8 d5, d4, d5, #1
  959. vext.8 d7, d6, d7, #1
  960. vext.8 d9, d8, d9, #1
  961. vext.8 d11, d10, d11, #1
  962. 5: // horizontal interpolation loop
  963. subs r5, r5, #2
  964. vmull.u8 q8, d4, d0 //U
  965. vmlal.u8 q8, d5, d1
  966. vmull.u8 q9, d6, d0 //V
  967. vmlal.u8 q9, d7, d1
  968. vld2.8 {d4-d7}, [r3], r4
  969. vmull.u8 q10, d8, d0
  970. vmlal.u8 q10, d9, d1
  971. vmull.u8 q11, d10, d0
  972. vmlal.u8 q11, d11, d1
  973. vld2.8 {d8-d11}, [r3], r4
  974. vext.8 d5, d4, d5, #1
  975. vrshrn.u16 d16, q8, #6
  976. vext.8 d7, d6, d7, #1
  977. vrshrn.u16 d17, q9, #6
  978. vext.8 d9, d8, d9, #1
  979. vrshrn.u16 d18, q10, #6
  980. vext.8 d11, d10, d11, #1
  981. vrshrn.u16 d19, q11, #6
  982. pld [r3]
  983. pld [r3, r4]
  984. vst1.64 {d16}, [r0,:64], r2
  985. vst1.64 {d17}, [r1,:64], r2
  986. vst1.64 {d18}, [r0,:64], r2
  987. vst1.64 {d19}, [r1,:64], r2
  988. bgt 5b
  989. vpop {d8-d11}
  990. pop {r4-r8, pc}
  991. endfunc
  992. // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width )
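// Vertical 6-tap half-pel filter with taps (1,-5,20,20,-5,1) over six source rows.
// The raw 16-bit sums are written to buf (later consumed by hpel_filter_c) and dst
// receives the rounded, clipped result, roughly clip255( (sum + 16) >> 5 ).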
  993. function hpel_filter_v_neon
  994. ldr ip, [sp]
  995. sub r1, r1, r3, lsl #1
  996. push {lr}
  997. add lr, r1, ip
  998. vmov.u8 d30, #5
  999. vmov.u8 d31, #20
  1000. filter_v_loop:
  1001. subs ip, ip, #16
  1002. vld1.64 {d0-d1}, [r1,:128], r3
  1003. vld1.64 {d2-d3}, [r1,:128], r3
  1004. vld1.64 {d4-d5}, [r1,:128], r3
  1005. vld1.64 {d6-d7}, [r1,:128], r3
  1006. vld1.64 {d16-d17}, [r1,:128], r3
  1007. vld1.64 {d18-d19}, [r1,:128], r3
  1008. sub r1, lr, ip
  1009. vaddl.u8 q10, d0, d18
  1010. vmlsl.u8 q10, d2, d30
  1011. vmlal.u8 q10, d4, d31
  1012. vmlal.u8 q10, d6, d31
  1013. vmlsl.u8 q10, d16, d30
  1014. vaddl.u8 q11, d1, d19
  1015. vmlsl.u8 q11, d3, d30
  1016. vmlal.u8 q11, d5, d31
  1017. vmlal.u8 q11, d7, d31
  1018. vmlsl.u8 q11, d17, d30
  1019. vqrshrun.s16 d0, q10, #5
  1020. vst1.64 {d20-d21}, [r2,:128]!
  1021. vqrshrun.s16 d1, q11, #5
  1022. vst1.64 {d22-d23}, [r2,:128]!
  1023. vst1.64 {d0-d1}, [r0,:128]!
  1024. bgt filter_v_loop
  1025. pop {pc}
  1026. endfunc
  1027. // hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
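// Centre (diagonal) half-pel filter: the same 6 taps applied horizontally to the
// 16-bit intermediate produced by hpel_filter_v. The shift/add decomposition
// annotated below evaluates (a - 5*b + 20*c)/16 without multiplies; the final
// vqrshrun #6 supplies the remaining rounding and shift, roughly
// clip255( (sum + 512) >> 10 ) overall.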
  1028. function hpel_filter_c_neon
  1029. sub r1, #16
  1030. vld1.64 {d0-d3}, [r1,:128]!
  1031. // unrolled 2x: 4% faster
  1032. filter_c_loop:
  1033. subs r2, r2, #16
  1034. vld1.64 {d4-d7}, [r1,:128]!
  1035. vext.16 q8, q0, q1, #6
  1036. vext.16 q12, q1, q2, #3
  1037. vadd.s16 q8, q8, q12
  1038. vext.16 q9, q0, q1, #7
  1039. vext.16 q11, q1, q2, #2
  1040. vadd.s16 q9, q9, q11
  1041. vext.16 q10, q1, q2, #1
  1042. vext.16 q11, q1, q2, #6
  1043. vadd.s16 q10, q1, q10
  1044. vsub.s16 q8, q8, q9 // a-b
  1045. vext.16 q15, q2, q3, #3
  1046. vsub.s16 q9, q9, q10 // b-c
  1047. vext.16 q12, q1, q2, #7
  1048. vshr.s16 q8, q8, #2 // (a-b)/4
  1049. vadd.s16 q11, q11, q15
  1050. vext.16 q14, q2, q3, #2
  1051. vsub.s16 q8, q8, q9 // (a-b)/4-b+c
  1052. vadd.s16 q12, q12, q14
  1053. vext.16 q13, q2, q3, #1
  1054. vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4
  1055. vadd.s16 q13, q2, q13
  1056. vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1057. vsub.s16 q11, q11, q12 // a-b
  1058. vsub.s16 q12, q12, q13 // b-c
  1059. vshr.s16 q11, q11, #2 // (a-b)/4
  1060. vqrshrun.s16 d30, q8, #6
  1061. vsub.s16 q11, q11, q12 // (a-b)/4-b+c
  1062. vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4
  1063. vld1.64 {d0-d3}, [r1,:128]!
  1064. vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1065. vext.16 q8, q2, q3, #6
  1066. vqrshrun.s16 d31, q11, #6
  1067. vext.16 q12, q3, q0, #3
  1068. vadd.s16 q8, q8, q12
  1069. vext.16 q9, q2, q3, #7
  1070. vst1.64 {d30-d31}, [r0,:128]!
  1071. bxle lr
  1072. subs r2, r2, #16
  1073. vext.16 q11, q3, q0, #2
  1074. vadd.s16 q9, q9, q11
  1075. vext.16 q10, q3, q0, #1
  1076. vext.16 q11, q3, q0, #6
  1077. vadd.s16 q10, q3, q10
  1078. vsub.s16 q8, q8, q9 // a-b
  1079. vext.16 q15, q0, q1, #3
  1080. vsub.s16 q9, q9, q10 // b-c
  1081. vext.16 q12, q3, q0, #7
  1082. vshr.s16 q8, q8, #2 // (a-b)/4
  1083. vadd.s16 q11, q11, q15
  1084. vext.16 q14, q0, q1, #2
  1085. vsub.s16 q8, q8, q9 // (a-b)/4-b+c
  1086. vadd.s16 q12, q12, q14
  1087. vext.16 q13, q0, q1, #1
  1088. vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4
  1089. vadd.s16 q13, q0, q13
  1090. vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1091. vsub.s16 q11, q11, q12 // a-b
  1092. vsub.s16 q12, q12, q13 // b-c
  1093. vshr.s16 q11, q11, #2 // (a-b)/4
  1094. vqrshrun.s16 d30, q8, #6
  1095. vsub.s16 q11, q11, q12 // (a-b)/4-b+c
  1096. vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4
  1097. vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  1098. vqrshrun.s16 d31, q11, #6
  1099. vst1.64 {d30-d31}, [r0,:128]!
  1100. bgt filter_c_loop
  1101. bx lr
  1102. endfunc
  1103. // hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
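// Horizontal 6-tap half-pel filter on 8-bit source pixels, same taps as above,
// producing roughly clip255( (sum + 16) >> 5 ) per output pixel.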
  1104. function hpel_filter_h_neon
  1105. sub r1, #16
  1106. vmov.u8 d30, #5
  1107. vld1.64 {d0-d3}, [r1,:128]!
  1108. vmov.u8 d31, #20
  1109. // unrolled 3x because it's 5% faster, due to mitigating
  1110. // the high latency of multiplication and vqrshrun
  1111. filter_h_loop:
  1112. subs r2, r2, #16
  1113. vld1.64 {d4-d5}, [r1,:128]!
  1114. vext.8 q8, q0, q1, #14
  1115. vext.8 q12, q1, q2, #3
  1116. vaddl.u8 q13, d16, d24
  1117. vext.8 q9, q0, q1, #15
  1118. vaddl.u8 q14, d17, d25
  1119. vext.8 q10, q1, q2, #1
  1120. vmlal.u8 q13, d2, d31
  1121. vmlsl.u8 q13, d18, d30
  1122. vext.8 q11, q1, q2, #2
  1123. vmlal.u8 q13, d20, d31
  1124. vmlsl.u8 q13, d22, d30
  1125. vmlsl.u8 q14, d19, d30
  1126. vmlal.u8 q14, d3, d31
  1127. vmlal.u8 q14, d21, d31
  1128. vmlsl.u8 q14, d23, d30
  1129. vqrshrun.s16 d6, q13, #5
  1130. vld1.64 {d0-d1}, [r1,:128]!
  1131. vext.8 q8, q1, q2, #14
  1132. vext.8 q12, q2, q0, #3
  1133. vaddl.u8 q13, d16, d24
  1134. vqrshrun.s16 d7, q14, #5
  1135. vext.8 q9, q1, q2, #15
  1136. vaddl.u8 q14, d17, d25
  1137. vst1.64 {d6-d7}, [r0,:128]!
  1138. bxle lr
  1139. subs r2, r2, #16
  1140. vext.8 q10, q2, q0, #1
  1141. vmlal.u8 q13, d4, d31
  1142. vmlsl.u8 q13, d18, d30
  1143. vext.8 q11, q2, q0, #2
  1144. vmlal.u8 q13, d20, d31
  1145. vmlsl.u8 q13, d22, d30
  1146. vmlsl.u8 q14, d19, d30
  1147. vmlal.u8 q14, d5, d31
  1148. vmlal.u8 q14, d21, d31
  1149. vmlsl.u8 q14, d23, d30
  1150. vqrshrun.s16 d6, q13, #5
  1151. vld1.64 {d2-d3}, [r1,:128]!
  1152. vext.8 q8, q2, q0, #14
  1153. vext.8 q12, q0, q1, #3
  1154. vaddl.u8 q13, d16, d24
  1155. vqrshrun.s16 d7, q14, #5
  1156. vext.8 q9, q2, q0, #15
  1157. vaddl.u8 q14, d17, d25
  1158. vst1.64 {d6-d7}, [r0,:128]!
  1159. bxle lr
  1160. subs r2, r2, #16
  1161. vext.8 q10, q0, q1, #1
  1162. vmlal.u8 q13, d0, d31
  1163. vmlsl.u8 q13, d18, d30
  1164. vext.8 q11, q0, q1, #2
  1165. vmlal.u8 q13, d20, d31
  1166. vmlsl.u8 q13, d22, d30
  1167. vmlsl.u8 q14, d19, d30
  1168. vmlal.u8 q14, d1, d31
  1169. vmlal.u8 q14, d21, d31
  1170. vmlsl.u8 q14, d23, d30
  1171. vqrshrun.s16 d6, q13, #5
  1172. vqrshrun.s16 d7, q14, #5
  1173. vst1.64 {d6-d7}, [r0,:128]!
  1174. bgt filter_h_loop
  1175. bx lr
  1176. endfunc
  1177. // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
  1178. // uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width,
  1179. // int height )
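// Builds the four half-resolution planes used by the lookahead: each output pixel
// is a rounding average (vrhadd) of a 2x2 source neighbourhood, with dsth, dstv
// and dstc taken from the same grid shifted by half a pixel horizontally,
// vertically and diagonally, respectively.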
  1180. function frame_init_lowres_core_neon
  1181. push {r4-r10,lr}
  1182. vpush {d8-d15}
  1183. ldrd r4, r5, [sp, #96]
  1184. ldrd r6, r7, [sp, #104]
  1185. ldr lr, [sp, #112]
  1186. sub r10, r6, r7 // dst_stride - width
  1187. and r10, r10, #~15
  1188. lowres_yloop:
  1189. mov ip, r7 // width
  1190. mov r6, r0 // src0
  1191. add r8, r0, r5 // src1 = src0 + src_stride
  1192. add r9, r0, r5, lsl #1 // src2 = src1 + src_stride
  1193. vld2.8 {d8, d10}, [r6,:128]!
  1194. vld2.8 {d12,d14}, [r8,:128]!
  1195. vld2.8 {d16,d18}, [r9,:128]!
  1196. lowres_xloop:
  1197. subs ip, ip, #16
  1198. vld2.8 {d9, d11}, [r6,:128]!
  1199. vld2.8 {d13,d15}, [r8,:128]!
  1200. vrhadd.u8 q0, q4, q6
  1201. vld2.8 {d17,d19}, [r9,:128]!
  1202. vrhadd.u8 q5, q5, q7
  1203. vld2.8 {d20,d22}, [r6,:128]!
  1204. vrhadd.u8 q1, q6, q8
  1205. vld2.8 {d24,d26}, [r8,:128]!
  1206. vrhadd.u8 q7, q7, q9
  1207. vext.8 q4, q4, q10, #1
  1208. vrhadd.u8 q0, q0, q5
  1209. vext.8 q6, q6, q12, #1
  1210. vrhadd.u8 q1, q1, q7
  1211. vld2.8 {d28,d30}, [r9,:128]!
  1212. vrhadd.u8 q4, q4, q6
  1213. vext.8 q8, q8, q14, #1
  1214. vrhadd.u8 q6, q6, q8
  1215. vst1.64 {d0-d1}, [r1,:128]!
  1216. vrhadd.u8 q2, q4, q5
  1217. vst1.64 {d2-d3}, [r3,:128]!
  1218. vrhadd.u8 q3, q6, q7
  1219. vst1.64 {d4-d5}, [r2,:128]!
  1220. vst1.64 {d6-d7}, [r4,:128]!
  1221. ble lowres_xloop_end
  1222. subs ip, ip, #16
  1223. vld2.8 {d21,d23}, [r6,:128]!
  1224. vld2.8 {d25,d27}, [r8,:128]!
  1225. vrhadd.u8 q0, q10, q12
  1226. vld2.8 {d29,d31}, [r9,:128]!
  1227. vrhadd.u8 q11, q11, q13
  1228. vld2.8 {d8, d10}, [r6,:128]!
  1229. vrhadd.u8 q1, q12, q14
  1230. vld2.8 {d12,d14}, [r8,:128]!
  1231. vrhadd.u8 q13, q13, q15
  1232. vext.8 q10, q10, q4, #1
  1233. vrhadd.u8 q0, q0, q11
  1234. vext.8 q12, q12, q6, #1
  1235. vrhadd.u8 q1, q1, q13
  1236. vld2.8 {d16,d18}, [r9,:128]!
  1237. vrhadd.u8 q10, q10, q12
  1238. vext.8 q14, q14, q8, #1
  1239. vrhadd.u8 q12, q12, q14
  1240. vst1.64 {d0-d1}, [r1,:128]!
  1241. vrhadd.u8 q2, q10, q11
  1242. vst1.64 {d2-d3}, [r3,:128]!
  1243. vrhadd.u8 q3, q12, q13
  1244. vst1.64 {d4-d5}, [r2,:128]!
  1245. vst1.64 {d6-d7}, [r4,:128]!
  1246. bgt lowres_xloop
  1247. lowres_xloop_end:
  1248. subs lr, lr, #1
  1249. add r0, r0, r5, lsl #1
  1250. add r1, r1, r10
  1251. add r2, r2, r10
  1252. add r3, r3, r10
  1253. add r4, r4, r10
  1254. bgt lowres_yloop
  1255. vpop {d8-d15}
  1256. pop {r4-r10,pc}
  1257. endfunc
  1258. function load_deinterleave_chroma_fdec_neon
  1259. mov ip, #FDEC_STRIDE/2
  1260. 1:
  1261. vld2.8 {d0-d1}, [r1,:128], r2
  1262. subs r3, r3, #1
  1263. pld [r1]
  1264. vst1.8 {d0}, [r0,:64], ip
  1265. vst1.8 {d1}, [r0,:64], ip
  1266. bgt 1b
  1267. bx lr
  1268. endfunc
  1269. function load_deinterleave_chroma_fenc_neon
  1270. mov ip, #FENC_STRIDE/2
  1271. 1:
  1272. vld2.8 {d0-d1}, [r1,:128], r2
  1273. subs r3, r3, #1
  1274. pld [r1]
  1275. vst1.8 {d0}, [r0,:64], ip
  1276. vst1.8 {d1}, [r0,:64], ip
  1277. bgt 1b
  1278. bx lr
  1279. endfunc
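// Note: the plane_copy* cores below round the copy width up (to a multiple of 16,
// or 8 for the rgb variant), so each row may be read and written a few bytes past
// the nominal width; callers are expected to provide suitably padded buffers.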
  1280. function plane_copy_core_neon
  1281. push {r4,lr}
  1282. ldr r4, [sp, #8]
  1283. ldr lr, [sp, #12]
  1284. add r12, r4, #15
  1285. bic r4, r12, #15
  1286. sub r1, r1, r4
  1287. sub r3, r3, r4
  1288. 1:
  1289. mov r12, r4
  1290. 16:
  1291. tst r12, #16
  1292. beq 32f
  1293. subs r12, r12, #16
  1294. vld1.8 {q0}, [r2]!
  1295. vst1.8 {q0}, [r0]!
  1296. beq 0f
  1297. 32:
  1298. subs r12, r12, #32
  1299. vld1.8 {q0, q1}, [r2]!
  1300. vst1.8 {q0, q1}, [r0]!
  1301. bgt 32b
  1302. 0:
  1303. subs lr, lr, #1
  1304. add r2, r2, r3
  1305. add r0, r0, r1
  1306. bgt 1b
  1307. pop {r4,pc}
  1308. endfunc
  1309. function plane_copy_deinterleave_neon
  1310. push {r4-r7, lr}
  1311. ldrd r6, r7, [sp, #28]
  1312. ldrd r4, r5, [sp, #20]
  1313. add lr, r6, #15
  1314. bic lr, lr, #15
  1315. sub r1, r1, lr
  1316. sub r3, r3, lr
  1317. sub r5, r5, lr, lsl #1
  1318. block:
  1319. vld2.8 {d0-d3}, [r4,:128]!
  1320. subs lr, lr, #16
  1321. vst1.8 {q0}, [r0]!
  1322. vst1.8 {q1}, [r2]!
  1323. bgt block
  1324. add r4, r4, r5
  1325. subs r7, r7, #1
  1326. add r0, r0, r1
  1327. add r2, r2, r3
  1328. mov lr, r6
  1329. bgt block
  1330. pop {r4-r7, pc}
  1331. endfunc
  1332. function plane_copy_deinterleave_rgb_neon
  1333. push {r4-r8, r10, r11, lr}
  1334. ldrd r4, r5, [sp, #32]
  1335. ldrd r6, r7, [sp, #40]
  1336. ldr r8, [sp, #48]
  1337. ldrd r10, r11, [sp, #52]
  1338. add lr, r10, #7
  1339. subs r8, r8, #3
  1340. bic lr, lr, #7
  1341. sub r7, r7, lr, lsl #1
  1342. sub r1, r1, lr
  1343. sub r3, r3, lr
  1344. sub r5, r5, lr
  1345. subne r7, r7, lr, lsl #1
  1346. subeq r7, r7, lr
  1347. bne block4
  1348. block3:
  1349. vld3.8 {d0,d1,d2}, [r6]!
  1350. subs lr, lr, #8
  1351. vst1.8 {d0}, [r0]!
  1352. vst1.8 {d1}, [r2]!
  1353. vst1.8 {d2}, [r4]!
  1354. bgt block3
  1355. subs r11, r11, #1
  1356. add r0, r0, r1
  1357. add r2, r2, r3
  1358. add r4, r4, r5
  1359. add r6, r6, r7
  1360. mov lr, r10
  1361. bgt block3
  1362. pop {r4-r8, r10, r11, pc}
  1363. block4:
  1364. vld4.8 {d0,d1,d2,d3}, [r6]!
  1365. subs lr, lr, #8
  1366. vst1.8 {d0}, [r0]!
  1367. vst1.8 {d1}, [r2]!
  1368. vst1.8 {d2}, [r4]!
  1369. bgt block4
  1370. subs r11, r11, #1
  1371. add r0, r0, r1
  1372. add r2, r2, r3
  1373. add r4, r4, r5
  1374. add r6, r6, r7
  1375. mov lr, r10
  1376. bgt block4
  1377. pop {r4-r8, r10, r11, pc}
  1378. endfunc
  1379. function plane_copy_interleave_core_neon
  1380. push {r4-r7, lr}
  1381. ldrd r6, r7, [sp, #28]
  1382. ldrd r4, r5, [sp, #20]
  1383. add lr, r6, #15
  1384. bic lr, lr, #15
  1385. sub r1, r1, lr, lsl #1
  1386. sub r3, r3, lr
  1387. sub r5, r5, lr
  1388. blocki:
  1389. vld1.8 {q0}, [r2]!
  1390. vld1.8 {q1}, [r4]!
  1391. subs lr, lr, #16
  1392. vst2.8 {d0,d2}, [r0]!
  1393. vst2.8 {d1,d3}, [r0]!
  1394. bgt blocki
  1395. subs r7, r7, #1
  1396. add r0, r0, r1
  1397. add r2, r2, r3
  1398. add r4, r4, r5
  1399. mov lr, r6
  1400. bgt blocki
  1401. pop {r4-r7, pc}
  1402. endfunc
  1403. function plane_copy_swap_core_neon
  1404. push {r4-r5, lr}
  1405. ldrd r4, r5, [sp, #12]
  1406. add lr, r4, #15
  1407. bic lr, lr, #15
  1408. sub r1, r1, lr, lsl #1
  1409. sub r3, r3, lr, lsl #1
  1410. 1:
  1411. vld1.8 {q0, q1}, [r2]!
  1412. subs lr, lr, #16
  1413. vrev16.8 q0, q0
  1414. vrev16.8 q1, q1
  1415. vst1.8 {q0, q1}, [r0]!
  1416. bgt 1b
  1417. subs r5, r5, #1
  1418. add r0, r0, r1
  1419. add r2, r2, r3
  1420. mov lr, r4
  1421. bgt 1b
  1422. pop {r4-r5, pc}
  1423. endfunc
  1424. function store_interleave_chroma_neon
  1425. push {lr}
  1426. ldr lr, [sp, #4]
  1427. mov ip, #FDEC_STRIDE
  1428. 1:
  1429. vld1.8 {d0}, [r2], ip
  1430. vld1.8 {d1}, [r3], ip
  1431. subs lr, lr, #1
  1432. vst2.8 {d0,d1}, [r0,:128], r1
  1433. bgt 1b
  1434. pop {pc}
  1435. endfunc
  1436. .macro integral4h p1, p2
  1437. vext.8 d1, \p1, \p2, #1
  1438. vext.8 d2, \p1, \p2, #2
  1439. vext.8 d3, \p1, \p2, #3
  1440. vaddl.u8 q0, \p1, d1
  1441. vaddl.u8 q1, d2, d3
  1442. vadd.u16 q0, q0, q1
  1443. vadd.u16 q0, q0, q2
  1444. .endm
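// integral_init4h: each output element is the sum of 4 consecutive pixels plus the
// corresponding element of the previous integral row, roughly
//   sum4h[x] = sum4h_above[x] + pix[x] + pix[x+1] + pix[x+2] + pix[x+3]
// integral_init8h below is the analogous 8-pixel horizontal sum.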
  1445. function integral_init4h_neon
  1446. sub r3, r0, r2, lsl #1
  1447. vld1.8 {d6, d7}, [r1, :128]!
  1448. 1:
  1449. subs r2, r2, #16
  1450. vld1.16 {q2}, [r3, :128]!
  1451. integral4h d6, d7
  1452. vld1.8 {d6}, [r1, :64]!
  1453. vld1.16 {q2}, [r3, :128]!
  1454. vst1.16 {q0}, [r0, :128]!
  1455. integral4h d7, d6
  1456. vld1.8 {d7}, [r1, :64]!
  1457. vst1.16 {q0}, [r0, :128]!
  1458. bgt 1b
  1459. bx lr
  1460. endfunc
  1461. .macro integral8h p1, p2, s
  1462. vext.8 d1, \p1, \p2, #1
  1463. vext.8 d2, \p1, \p2, #2
  1464. vext.8 d3, \p1, \p2, #3
  1465. vext.8 d4, \p1, \p2, #4
  1466. vext.8 d5, \p1, \p2, #5
  1467. vext.8 d6, \p1, \p2, #6
  1468. vext.8 d7, \p1, \p2, #7
  1469. vaddl.u8 q0, \p1, d1
  1470. vaddl.u8 q1, d2, d3
  1471. vaddl.u8 q2, d4, d5
  1472. vaddl.u8 q3, d6, d7
  1473. vadd.u16 q0, q0, q1
  1474. vadd.u16 q2, q2, q3
  1475. vadd.u16 q0, q0, q2
  1476. vadd.u16 q0, q0, \s
  1477. .endm
  1478. function integral_init8h_neon
  1479. sub r3, r0, r2, lsl #1
  1480. vld1.8 {d16, d17}, [r1, :128]!
  1481. 1:
  1482. subs r2, r2, #16
  1483. vld1.16 {q9}, [r3, :128]!
  1484. integral8h d16, d17, q9
  1485. vld1.8 {d16}, [r1, :64]!
  1486. vld1.16 {q9}, [r3, :128]!
  1487. vst1.16 {q0}, [r0, :128]!
  1488. integral8h d17, d16, q9
  1489. vld1.8 {d17}, [r1, :64]!
  1490. vst1.16 {q0}, [r0, :128]!
  1491. bgt 1b
  1492. bx lr
  1493. endfunc
  1494. function integral_init4v_neon
  1495. push {r4-r5}
  1496. mov r3, r0
  1497. add r4, r0, r2, lsl #3
  1498. add r5, r0, r2, lsl #4
  1499. sub r2, r2, #8
  1500. vld1.16 {q11, q12}, [r3]!
  1501. vld1.16 {q8, q9}, [r5]!
  1502. vld1.16 {q13}, [r3]!
  1503. vld1.16 {q10}, [r5]!
  1504. 1:
  1505. subs r2, r2, #16
  1506. vld1.16 {q14, q15}, [r4]!
  1507. vext.8 q0, q11, q12, #8
  1508. vext.8 q1, q12, q13, #8
  1509. vext.8 q2, q8, q9, #8
  1510. vext.8 q3, q9, q10, #8
  1511. vsub.u16 q14, q14, q11
  1512. vsub.u16 q15, q15, q12
  1513. vadd.u16 q0, q0, q11
  1514. vadd.u16 q1, q1, q12
  1515. vadd.u16 q2, q2, q8
  1516. vadd.u16 q3, q3, q9
  1517. vst1.16 {q14}, [r1]!
  1518. vst1.16 {q15}, [r1]!
  1519. vmov q11, q13
  1520. vmov q8, q10
  1521. vsub.u16 q0, q2, q0
  1522. vsub.u16 q1, q3, q1
  1523. vld1.16 {q12, q13}, [r3]!
  1524. vld1.16 {q9, q10}, [r5]!
  1525. vst1.16 {q0}, [r0]!
  1526. vst1.16 {q1}, [r0]!
  1527. bgt 1b
  1528. 2:
  1529. pop {r4-r5}
  1530. bx lr
  1531. endfunc
  1532. function integral_init8v_neon
  1533. add r2, r0, r1, lsl #4
  1534. sub r1, r1, #8
  1535. ands r3, r1, #16 - 1
  1536. beq 1f
  1537. subs r1, r1, #8
  1538. vld1.16 {q0}, [r0]
  1539. vld1.16 {q2}, [r2]!
  1540. vsub.u16 q8, q2, q0
  1541. vst1.16 {q8}, [r0]!
  1542. ble 2f
  1543. 1:
  1544. subs r1, r1, #16
  1545. vld1.16 {q0, q1}, [r0]
  1546. vld1.16 {q2, q3}, [r2]!
  1547. vsub.u16 q8, q2, q0
  1548. vsub.u16 q9, q3, q1
  1549. vst1.16 {q8}, [r0]!
  1550. vst1.16 {q9}, [r0]!
  1551. bgt 1b
  1552. 2:
  1553. bx lr
  1554. endfunc
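@ mbtree_propagate_cost: per block, roughly
@   amount = propagate_in + intra_cost*inv_qscale*fps_factor
@   dst    = amount * (intra_cost - min(inter_cost & 0x3fff, intra_cost)) / intra_cost
@ with the division approximated by vrecpe plus one vrecps Newton-Raphson step and
@ the result saturated to 16 bits.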
  1555. function mbtree_propagate_cost_neon
  1556. push {r4-r5,lr}
  1557. ldrd r4, r5, [sp, #12]
  1558. ldr lr, [sp, #20]
  1559. vld1.32 {d6[], d7[]}, [r5]
  1560. 8:
  1561. subs lr, lr, #8
  1562. vld1.16 {q8}, [r1]!
  1563. vld1.16 {q9}, [r2]!
  1564. vld1.16 {q10}, [r3]!
  1565. vld1.16 {q11}, [r4]!
  1566. vbic.u16 q10, #0xc000
  1567. vmin.u16 q10, q9, q10
  1568. vmull.u16 q12, d18, d22 @ propagate_intra
  1569. vmull.u16 q13, d19, d23 @ propagate_intra
  1570. vsubl.u16 q14, d18, d20 @ propagate_num
  1571. vsubl.u16 q15, d19, d21 @ propagate_num
  1572. vmovl.u16 q10, d18 @ propagate_denom
  1573. vmovl.u16 q11, d19 @ propagate_denom
  1574. vmovl.u16 q9, d17
  1575. vmovl.u16 q8, d16
  1576. vcvt.f32.s32 q12, q12
  1577. vcvt.f32.s32 q13, q13
  1578. vcvt.f32.s32 q14, q14
  1579. vcvt.f32.s32 q15, q15
  1580. vcvt.f32.s32 q10, q10
  1581. vcvt.f32.s32 q11, q11
  1582. vrecpe.f32 q0, q10
  1583. vrecpe.f32 q1, q11
  1584. vcvt.f32.s32 q8, q8
  1585. vcvt.f32.s32 q9, q9
  1586. vrecps.f32 q10, q0, q10
  1587. vrecps.f32 q11, q1, q11
  1588. vmla.f32 q8, q12, q3 @ propagate_amount
  1589. vmla.f32 q9, q13, q3 @ propagate_amount
  1590. vmul.f32 q0, q0, q10
  1591. vmul.f32 q1, q1, q11
  1592. vmul.f32 q8, q8, q14
  1593. vmul.f32 q9, q9, q15
  1594. vmul.f32 q0, q8, q0
  1595. vmul.f32 q1, q9, q1
  1596. vcvt.s32.f32 q0, q0
  1597. vcvt.s32.f32 q1, q1
  1598. vqmovn.s32 d0, q0
  1599. vqmovn.s32 d1, q1
  1600. vst1.16 {q0}, [r0]!
  1601. bgt 8b
  1602. pop {r4-r5,pc}
  1603. endfunc
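@ mbtree_propagate_list_internal: rescales propagate_amount by bipred_weight where
@ lowres_cost marks both lists used, derives the target macroblock coordinates from
@ the mvs and the current mb position, and splits the amount over the four covered
@ macroblocks with bilinear weights (see the idx*weight annotations below):
@   idx0 = (32-y)*(32-x), idx1 = (32-y)*x, idx2 = y*(32-x), idx3 = y*x,
@ each multiplied by propagate_amount with a rounding 10-bit shift.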
  1604. function mbtree_propagate_list_internal_neon
  1605. vld1.16 {d4[]}, [sp] @ bipred_weight
  1606. movrel r12, pw_0to15
  1607. vmov.u16 q10, #0xc000
  1608. vld1.16 {q0}, [r12, :128] @h->mb.i_mb_x,h->mb.i_mb_y
  1609. ldrh r12, [sp, #4]
  1610. vmov.u32 q11, #4
  1611. vmov.u8 q3, #32
  1612. vdup.u16 q8, r12 @ mb_y
  1613. vzip.u16 q0, q8
  1614. ldr r12, [sp, #8]
  1615. 8:
  1616. subs r12, r12, #8
  1617. vld1.16 {q14}, [r1, :128]! @ propagate_amount
  1618. vld1.16 {q15}, [r2]! @ lowres_cost
  1619. vld1.16 {q8, q9}, [r0]!
  1620. vand q15, q15, q10
  1621. vceq.u16 q1, q15, q10
  1622. vmull.u16 q12, d28, d4
  1623. vmull.u16 q13, d29, d4
  1624. vrshrn.u32 d30, q12, #6
  1625. vrshrn.u32 d31, q13, #6
  1626. vbsl q1, q15, q14 @ if( lists_used == 3 )
  1627. @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
  1628. vshr.s16 q12, q8, #5
  1629. vshr.s16 q13, q9, #5
  1630. vuzp.16 q8, q9 @ x & 31, y & 31
  1631. vadd.s16 q12, q12, q0
  1632. vadd.s16 q0, q0, q11
  1633. vmovn.i16 d16, q8
  1634. vmovn.i16 d17, q9
  1635. vadd.s16 q13, q13, q0
  1636. vbic.i16 q8, #128+64+32
  1637. vadd.s16 q0, q0, q11
  1638. vbic.i16 q8, #(128+64+32)<<8
  1639. vst1.16 {q12, q13}, [r3, :128]!
  1640. vsub.i8 q9, q3, q8
  1641. vmull.u8 q12, d17, d16 @ idx3weight = y*x
  1642. vmull.u8 q14, d19, d16 @ idx1weight = (32-y)*x
  1643. vmull.u8 q15, d19, d18 @ idx0weight = (32-y)*(32-x)
  1644. vmull.u8 q13, d17, d18 @ idx2weight = y*(32-x)
  1645. vmull.u16 q9, d28, d2 @ idx1weight
  1646. vmull.u16 q8, d29, d3
  1647. vmull.u16 q14, d30, d2 @ idx0weight
  1648. vmull.u16 q15, d31, d3
  1649. vrshrn.u32 d18, q9, #10 @ idx1weight
  1650. vrshrn.u32 d19, q8, #10
  1651. vrshrn.u32 d16, q14, #10 @ idx0weight
  1652. vrshrn.u32 d17, q15, #10
  1653. vmull.u16 q14, d24, d2 @ idx3weight
  1654. vmull.u16 q15, d25, d3
  1655. vzip.16 q8, q9
  1656. vmull.u16 q12, d26, d2 @ idx2weight
  1657. vmull.u16 q13, d27, d3
  1658. vst1.16 {q8, q9}, [r3, :128]!
  1659. vrshrn.u32 d19, q15, #10 @ idx3weight
  1660. vrshrn.u32 d18, q14, #10
  1661. vrshrn.u32 d16, q12, #10 @ idx2weight
  1662. vrshrn.u32 d17, q13, #10
  1663. vzip.16 q8, q9
  1664. vst1.16 {q8, q9}, [r3, :128]!
  1665. bge 8b
  1666. bx lr
  1667. endfunc
  1668. @ void mbtree_fix8_pack( int16_t *dst, float *src, int count )
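@ float -> big-endian Q8.8: vcvt with 8 fractional bits, saturating narrow to
@ 16 bits, then vrev16 for the byte swap; the scalar tail handles counts that are
@ not a multiple of 8.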
  1669. function mbtree_fix8_pack_neon, export=1
  1670. subs r3, r2, #8
  1671. blt 2f
  1672. 1:
  1673. subs r3, r3, #8
  1674. vld1.32 {q0,q1}, [r1,:128]!
  1675. vcvt.s32.f32 q0, q0, #8
  1676. vcvt.s32.f32 q1, q1, #8
  1677. vqmovn.s32 d4, q0
  1678. vqmovn.s32 d5, q1
  1679. vrev16.8 q3, q2
  1680. vst1.16 {q3}, [r0,:128]!
  1681. bge 1b
  1682. 2:
  1683. adds r3, r3, #8
  1684. bxeq lr
  1685. 3:
  1686. subs r3, r3, #1
  1687. vld1.32 {d0[0]}, [r1]!
  1688. vcvt.s32.f32 s0, s0, #8
  1689. vrev16.8 d0, d0
  1690. vst1.16 {d0[0]}, [r0]!
  1691. bgt 3b
  1692. bx lr
  1693. endfunc
  1694. @ void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
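@ Inverse of mbtree_fix8_pack: byte-swap, sign-extend to 32 bits, then convert back
@ to float with 8 fractional bits, again with a scalar tail for the remainder.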
  1695. function mbtree_fix8_unpack_neon, export=1
  1696. subs r3, r2, #8
  1697. blt 2f
  1698. 1:
  1699. subs r3, r3, #8
  1700. vld1.16 {q0}, [r1,:128]!
  1701. vrev16.8 q1, q0
  1702. vmovl.s16 q0, d2
  1703. vmovl.s16 q1, d3
  1704. vcvt.f32.s32 q0, q0, #8
  1705. vcvt.f32.s32 q1, q1, #8
  1706. vst1.32 {q0,q1}, [r0,:128]!
  1707. bge 1b
  1708. 2:
  1709. adds r3, r3, #8
  1710. bxeq lr
  1711. 3:
  1712. subs r3, r3, #1
  1713. vld1.16 {d0[0]}, [r1]!
  1714. vrev16.8 d0, d0
  1715. vmovl.s16 q0, d0
  1716. vcvt.f32.s32 d0, d0, #8
  1717. vst1.32 {d0[0]}, [r0]!
  1718. bgt 3b
  1719. bx lr
  1720. endfunc