/*****************************************************************************
 * predict.S: arm intra prediction
 *****************************************************************************
 * Copyright (C) 2009-2018 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Mans Rullgard <mans@mansr.com>
 *          Martin Storsjo <martin@martin.st>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "asm.S"
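
// asm.S is assumed to provide the function/endfunc and const/endconst
// wrappers, the movrel macro, and the FDEC_STRIDE define (the stride of
// x264's decoded-picture buffer, 32 bytes at 8-bit depth). r0 always
// points at the destination block inside that buffer.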
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst

.text
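
// Load a column of \n bytes from [\rs], stepping by the stride \rt, into
// successive lanes of \rd. With n=4, hi selects which half of \rd is
// filled, so two calls can load the two halves independently.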
.macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
    vld1.8      {\rd[0]}, [\rs], \rt
    vld1.8      {\rd[1]}, [\rs], \rt
    vld1.8      {\rd[2]}, [\rs], \rt
    vld1.8      {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8      {\rd[4]}, [\rs], \rt
    vld1.8      {\rd[5]}, [\rs], \rt
    vld1.8      {\rd[6]}, [\rs], \rt
    vld1.8      {\rd[7]}, [\rs], \rt
.endif
.endm
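
// Load a 16-byte column into \rd1/\rd2 using two pointers eight rows
// apart (\ru is clobbered as the second pointer) so the loads can
// interleave instead of forming one long dependency chain.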
.macro ldcol.16 rd1, rd2, rs, rt, ru
    add         \ru, \rs, \rt, lsl #3
    vld1.8      {\rd1[0]}, [\rs], \rt
    vld1.8      {\rd2[0]}, [\ru], \rt
    vld1.8      {\rd1[1]}, [\rs], \rt
    vld1.8      {\rd2[1]}, [\ru], \rt
    vld1.8      {\rd1[2]}, [\rs], \rt
    vld1.8      {\rd2[2]}, [\ru], \rt
    vld1.8      {\rd1[3]}, [\rs], \rt
    vld1.8      {\rd2[3]}, [\ru], \rt
    vld1.8      {\rd1[4]}, [\rs], \rt
    vld1.8      {\rd2[4]}, [\ru], \rt
    vld1.8      {\rd1[5]}, [\rs], \rt
    vld1.8      {\rd2[5]}, [\ru], \rt
    vld1.8      {\rd1[6]}, [\rs], \rt
    vld1.8      {\rd2[6]}, [\ru], \rt
    vld1.8      {\rd1[7]}, [\rs], \rt
    vld1.8      {\rd2[7]}, [\ru], \rt
.endm
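
// Horizontal sum of 16 bytes: widen \rl/\rh to eight u16 in \dq, then
// fold with vadd/vpadd until \dl holds the total in every lane.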
.macro add16x8 dq, dl, dh, rl, rh
    vaddl.u8    \dq, \rl, \rh
    vadd.u16    \dl, \dl, \dh
    vpadd.u16   \dl, \dl, \dl
    vpadd.u16   \dl, \dl, \dl
.endm

// because gcc doesn't believe in using the free shift in add
function predict_4x4_h_armv6
    ldrb        r1, [r0, #0*FDEC_STRIDE-1]
    ldrb        r2, [r0, #1*FDEC_STRIDE-1]
    ldrb        r3, [r0, #2*FDEC_STRIDE-1]
    ldrb        ip, [r0, #3*FDEC_STRIDE-1]
    add         r1, r1, r1, lsl #8
    add         r2, r2, r2, lsl #8
    add         r3, r3, r3, lsl #8
    add         ip, ip, ip, lsl #8
    add         r1, r1, r1, lsl #16
    str         r1, [r0, #0*FDEC_STRIDE]
    add         r2, r2, r2, lsl #16
    str         r2, [r0, #1*FDEC_STRIDE]
    add         r3, r3, r3, lsl #16
    str         r3, [r0, #2*FDEC_STRIDE]
    add         ip, ip, ip, lsl #16
    str         ip, [r0, #3*FDEC_STRIDE]
    bx          lr
endfunc

function predict_4x4_v_armv6
    ldr         r1, [r0, #0 - 1 * FDEC_STRIDE]
    str         r1, [r0, #0 + 0 * FDEC_STRIDE]
    str         r1, [r0, #0 + 1 * FDEC_STRIDE]
    str         r1, [r0, #0 + 2 * FDEC_STRIDE]
    str         r1, [r0, #0 + 3 * FDEC_STRIDE]
    bx          lr
endfunc
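
// 4x4 DC: dc = (4 left + 4 top neighbours + 4) >> 3. usad8 against zero
// sums the four top bytes in one instruction; the byte result is then
// replicated across a word and stored to all four rows.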
function predict_4x4_dc_armv6
    mov         ip, #0
    ldr         r1, [r0, #-FDEC_STRIDE]
    ldrb        r2, [r0, #0*FDEC_STRIDE-1]
    ldrb        r3, [r0, #1*FDEC_STRIDE-1]
    usad8       r1, r1, ip
    add         r2, r2, #4
    ldrb        ip, [r0, #2*FDEC_STRIDE-1]
    add         r2, r2, r3
    ldrb        r3, [r0, #3*FDEC_STRIDE-1]
    add         r2, r2, ip
    add         r2, r2, r3
    add         r1, r1, r2
    lsr         r1, r1, #3
    add         r1, r1, r1, lsl #8
    add         r1, r1, r1, lsl #16
    str         r1, [r0, #0*FDEC_STRIDE]
    str         r1, [r0, #1*FDEC_STRIDE]
    str         r1, [r0, #2*FDEC_STRIDE]
    str         r1, [r0, #3*FDEC_STRIDE]
    bx          lr
endfunc

function predict_4x4_dc_top_neon
    mov         r12, #FDEC_STRIDE
    sub         r1, r0, #FDEC_STRIDE
    vld1.32     {d1[]}, [r1,:32]
    vpaddl.u8   d1, d1
    vpadd.u16   d1, d1, d1
    vrshr.u16   d1, d1, #2
    vdup.8      d1, d1[0]
    vst1.32     {d1[0]}, [r0,:32], r12
    vst1.32     {d1[0]}, [r0,:32], r12
    vst1.32     {d1[0]}, [r0,:32], r12
    vst1.32     {d1[0]}, [r0,:32], r12
    bx          lr
endfunc

// return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8      \a1, \a1, \c1
    uhadd8      \a2, \a2, \c2
    uhadd8      \c1, \a1, \b1
    uhadd8      \c2, \a2, \b2
    eor         \a1, \a1, \b1
    eor         \a2, \a2, \b2
    and         \a1, \a1, \pb_1
    and         \a2, \a2, \pb_1
    uadd8       \a1, \a1, \c1
    uadd8       \a2, \a2, \c2
.endm
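
// The macro filters four byte lanes per word (SWAR): uhadd8 is a
// truncating per-byte average, and the eor/and/uadd8 tail restores the
// rounding bit the halving adds drop. A scalar model of one lane, with a
// hypothetical helper name for illustration:
//
//     uint8_t lowpass( uint8_t a, uint8_t b, uint8_t c )
//     {
//         uint8_t ac = (a + c) >> 1;   // uhadd8 \a1, \a1, \c1
//         uint8_t t  = (ac + b) >> 1;  // uhadd8 \c1, \a1, \b1
//         return t + ((ac ^ b) & 1);   // eor/and/uadd8 rounding fixup
//     }
//
// which evaluates to exactly (a + 2*b + c + 2) >> 2.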
function predict_4x4_ddr_armv6
    ldr         r1, [r0, #-FDEC_STRIDE]
    ldrb        r2, [r0, #-FDEC_STRIDE-1]
    ldrb        r3, [r0, #0*FDEC_STRIDE-1]
    push        {r4-r6,lr}
    add         r2, r2, r1, lsl #8
    ldrb        r4, [r0, #1*FDEC_STRIDE-1]
    add         r3, r3, r2, lsl #8
    ldrb        r5, [r0, #2*FDEC_STRIDE-1]
    ldrb        r6, [r0, #3*FDEC_STRIDE-1]
    add         r4, r4, r3, lsl #8
    add         r5, r5, r4, lsl #8
    add         r6, r6, r5, lsl #8
    ldr         ip, =0x01010101
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip
    str         r1, [r0, #0*FDEC_STRIDE]
    lsl         r2, r1, #8
    lsl         r3, r1, #16
    lsl         r4, r4, #8
    lsl         r5, r1, #24
    add         r2, r2, r4, lsr #24
    str         r2, [r0, #1*FDEC_STRIDE]
    add         r3, r3, r4, lsr #16
    str         r3, [r0, #2*FDEC_STRIDE]
    add         r5, r5, r4, lsr #8
    str         r5, [r0, #3*FDEC_STRIDE]
    pop         {r4-r6,pc}
endfunc
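
// Diagonal-down-left: lowpass the 8 pixels above the block (last pixel
// replicated to stand in for the missing ninth neighbour), then each
// output row is the filtered row advanced one byte via vext.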
function predict_4x4_ddl_neon
    sub         r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0], ip
    vdup.8      d3, d0[7]
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d3, #2
    vhadd.u8    d0, d0, d2
    vrhadd.u8   d0, d0, d1
    vst1.32     {d0[0]}, [r0,:32], ip
    vext.8      d1, d0, d0, #1
    vext.8      d2, d0, d0, #2
    vst1.32     {d1[0]}, [r0,:32], ip
    vext.8      d3, d0, d0, #3
    vst1.32     {d2[0]}, [r0,:32], ip
    vst1.32     {d3[0]}, [r0,:32], ip
    bx          lr
endfunc
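
// The 8x8 luma predictors take r1 pointing at the filtered edge buffer
// built by x264's predict_8x8_filter; the layout assumed here (matching
// the C version) is: left neighbours bottom-to-top at edge[7..14],
// top-left at edge[15], top row at edge[16..23], top-right at
// edge[24..31].
//
// 8x8 DC below sums edge[7..14] and edge[16..23] with usad8/usada8 and
// broadcasts (sum + 8) >> 4: the lsl #8 knocks the top-left byte
// (edge[15]) out of the second doubleword, and ldrb lr, [r1, #7] adds
// the left pixel that load did not cover.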
function predict_8x8_dc_neon
    mov         ip, #0
    ldrd        r2, r3, [r1, #8]
    push        {r4-r5,lr}
    ldrd        r4, r5, [r1, #16]
    lsl         r3, r3, #8
    ldrb        lr, [r1, #7]
    usad8       r2, r2, ip
    usad8       r3, r3, ip
    usada8      r2, r4, ip, r2
    add         lr, lr, #8
    usada8      r3, r5, ip, r3
    add         r2, r2, lr
    mov         ip, #FDEC_STRIDE
    add         r2, r2, r3
    lsr         r2, r2, #4
    vdup.8      d0, r2
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    pop         {r4-r5,pc}
endfunc

function predict_8x8_h_neon
    add         r1, r1, #7
    mov         ip, #FDEC_STRIDE
    vld1.64     {d16}, [r1]
    vdup.8      d0, d16[7]
    vdup.8      d1, d16[6]
    vst1.64     {d0}, [r0,:64], ip
    vdup.8      d2, d16[5]
    vst1.64     {d1}, [r0,:64], ip
    vdup.8      d3, d16[4]
    vst1.64     {d2}, [r0,:64], ip
    vdup.8      d4, d16[3]
    vst1.64     {d3}, [r0,:64], ip
    vdup.8      d5, d16[2]
    vst1.64     {d4}, [r0,:64], ip
    vdup.8      d6, d16[1]
    vst1.64     {d5}, [r0,:64], ip
    vdup.8      d7, d16[0]
    vst1.64     {d6}, [r0,:64], ip
    vst1.64     {d7}, [r0,:64], ip
    bx          lr
endfunc

function predict_8x8_v_neon
    add         r1, r1, #16
    mov         r12, #FDEC_STRIDE
    vld1.8      {d0}, [r1,:64]
.rept 8
    vst1.8      {d0}, [r0,:64], r12
.endr
    bx          lr
endfunc
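
// Diagonal-down-left: lowpass the 16 top/top-right pixels against their
// left and right neighbours (zero-padded on the left, last pixel
// replicated on the right; lane 0 of the filtered row is never stored),
// then emit vext shifts #1..#8 of the filtered row.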
function predict_8x8_ddl_neon
    add         r1, #16
    vld1.8      {d0, d1}, [r1,:128]
    vmov.i8     q3, #0
    vrev64.8    d2, d1
    vext.8      q8, q3, q0, #15
    vext.8      q2, q0, q1, #1
    vhadd.u8    q8, q2
    mov         r12, #FDEC_STRIDE
    vrhadd.u8   q0, q8
    vext.8      d2, d0, d1, #1
    vext.8      d3, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #5
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d0, d1, #6
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #7
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d1}, [r0,:64], r12
    bx          lr
endfunc

function predict_8x8_ddr_neon
    vld1.8      {d0-d3}, [r1,:128]
    vext.8      q2, q0, q1, #7
    vext.8      q3, q0, q1, #9
    vhadd.u8    q2, q2, q3
    vrhadd.u8   d0, d1, d4
    vrhadd.u8   d1, d2, d5
    add         r0, #7*FDEC_STRIDE
    mov         r12, #-1*FDEC_STRIDE
    vext.8      d2, d0, d1, #1
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d4, d0, d1, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d5, d0, d1, #3
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #4
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #5
    vst1.8      {d4}, [r0,:64], r12
    vext.8      d4, d0, d1, #6
    vst1.8      {d5}, [r0,:64], r12
    vext.8      d5, d0, d1, #7
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    bx          lr
endfunc
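
// Vertical-left: even rows come from the rounded averages of adjacent
// top pixels (vrhadd in q3), odd rows from the full lowpass row in q0;
// each pair of rows advances one byte via vext.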
function predict_8x8_vl_neon
    add         r1, #16
    mov         r12, #FDEC_STRIDE
    vld1.8      {d0, d1}, [r1,:128]
    vext.8      q1, q1, q0, #15
    vext.8      q2, q0, q2, #1
    vrhadd.u8   q3, q0, q2
    vhadd.u8    q1, q1, q2
    vrhadd.u8   q0, q0, q1
    vext.8      d2, d0, d1, #1
    vst1.8      {d6}, [r0,:64], r12
    vext.8      d3, d6, d7, #1
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #2
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #3
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d3, d6, d7, #3
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #4
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    bx          lr
endfunc
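
// Vertical-right: r1+8 points at edge[8], so one 16-byte load covers the
// left column, top-left and top row. q2 becomes the rounded averages and
// q0 the lowpass row; vuzp splits the filtered left pixels into the
// even/odd phases that are prepended as the rows shift right by one byte
// every two lines.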
function predict_8x8_vr_neon
    add         r1, #8
    mov         r12, #FDEC_STRIDE
    vld1.8      {d4,d5}, [r1,:64]
    vext.8      q1, q2, q2, #14
    vext.8      q0, q2, q2, #15
    vhadd.u8    q3, q2, q1
    vrhadd.u8   q2, q2, q0
    vrhadd.u8   q0, q0, q3
    vmov        d2, d0
    vst1.8      {d5}, [r0,:64], r12
    vuzp.8      d2, d0
    vst1.8      {d1}, [r0,:64], r12
    vext.8      d6, d0, d5, #7
    vext.8      d3, d2, d1, #7
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #6
    vext.8      d3, d2, d1, #6
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    vext.8      d6, d0, d5, #5
    vext.8      d3, d2, d1, #5
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d3}, [r0,:64], r12
    bx          lr
endfunc
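
// Horizontal-down: r1+7 points at edge[7], covering the left column,
// top-left and part of the top row. vzip interleaves the rounded
// averages with the lowpass pixels to form the half/full-pel pairs, and
// each row steps back two bytes from the previous one.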
function predict_8x8_hd_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7
    vld1.8      {d2,d3}, [r1]
    vext.8      q3, q1, q1, #1
    vext.8      q2, q1, q1, #2
    vrhadd.u8   q8, q1, q3
    vhadd.u8    q1, q2
    vrhadd.u8   q0, q1, q3
    vzip.8      d16, d0
    vext.8      d2, d0, d1, #6
    vext.8      d3, d0, d1, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d0, d1, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #6
    vst1.8      {d0}, [r0,:64], r12
    vext.8      d3, d16, d0, #4
    vst1.8      {d2}, [r0,:64], r12
    vext.8      d2, d16, d0, #2
    vst1.8      {d3}, [r0,:64], r12
    vst1.8      {d2}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12
    bx          lr
endfunc
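
// Horizontal-up: only the left column is needed, so it is loaded from
// edge[7..14], reversed into top-to-bottom order and lowpass-filtered
// with the bottom pixel replicated; vzip pairs the rounded averages with
// the filtered values, and vdup.16 pins the tail rows to the last pair.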
function predict_8x8_hu_neon
    mov         r12, #FDEC_STRIDE
    add         r1, #7
    vld1.8      {d7}, [r1]
    vdup.8      d6, d7[0]
    vrev64.8    d7, d7
    vext.8      d4, d7, d6, #2
    vext.8      d2, d7, d6, #1
    vhadd.u8    d16, d7, d4
    vrhadd.u8   d0, d2, d7
    vrhadd.u8   d1, d16, d2
    vzip.8      d0, d1
    vdup.16     q1, d1[3]
    vext.8      q2, q0, q1, #2
    vext.8      q3, q0, q1, #4
    vext.8      q8, q0, q1, #6
    vst1.8      {d0}, [r0,:64], r12
    vst1.8      {d4}, [r0,:64], r12
    vst1.8      {d6}, [r0,:64], r12
    vst1.8      {d16}, [r0,:64], r12
    vst1.8      {d1}, [r0,:64], r12
    vst1.8      {d5}, [r0,:64], r12
    vst1.8      {d7}, [r0,:64], r12
    vst1.8      {d17}, [r0,:64]
    bx          lr
endfunc
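
// Chroma 8x8 DC: H.264 uses a separate DC per 4x4 quadrant. The two
// vrshrn produce the 8-neighbour sums (>>3, for the quadrants that see
// both edges) and the 4-neighbour sums (>>2); the vdup/vtrn shuffle lays
// the four DC values out as one row pattern for the top half and one for
// the bottom, stored by pred8x8_dc_end, which the _top/_left variants
// share.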
function predict_8x8c_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    vtrn.32     d0, d1
    b           pred8x8_dc_end
endfunc

function predict_8x8c_dc_left_neon
    mov         r1, #FDEC_STRIDE
    sub         r2, r0, #1
    ldcol.8     d0, r2, r1
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    b           pred8x8_dc_end
endfunc

function predict_8x8c_dc_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    sub         r2, r0, #1
    ldcol.8     d1, r2, r1
    vtrn.32     d0, d1
    vpaddl.u8   q0, q0
    vpadd.u16   d0, d0, d1
    vpadd.u16   d1, d0, d0
    vrshrn.u16  d2, q0, #3
    vrshrn.u16  d3, q0, #2
    vdup.8      d0, d2[4]
    vdup.8      d1, d3[3]
    vdup.8      d4, d3[2]
    vdup.8      d5, d2[5]
    vtrn.32     q0, q2
pred8x8_dc_end:
    add         r2, r0, r1, lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
endfunc

function predict_8x8c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 4
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc

function predict_8x8c_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0}, [r0,:64], ip
.rept 8
    vst1.64     {d0}, [r0,:64], ip
.endr
    bx          lr
endfunc
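
// Chroma 8x8 plane prediction. H and V are the weighted sums (weights
// 1..4 from p16weight) of pixel differences across the top row and left
// column; b = (17*H + 16) >> 5, c = (17*V + 16) >> 5, and pixel (x,y) is
// clip((a + b*(x-3) + c*(y-3) + 16) >> 5) with a = 16*(top[7] + left[7]).
// q1 starts as the first row's x-ramp (the +16 folded in), q3 adds c per
// line, and vqshrun performs the shift and clip.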
function predict_8x8c_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #4
    sub         r3, r3, #1
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    ldcol.8     d0, r3, r1, 4, hi=1
    add         r3, r3, r1
    ldcol.8     d3, r3, r1, 4
    vaddl.u8    q8, d2, d3
    vrev32.8    d0, d0
    vtrn.32     d2, d3
    vsubl.u8    q2, d2, d0
    movrel      r3, p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4, d4, d0
    vmul.s16    d5, d5, d0
    vpadd.i16   d4, d4, d5
    vpaddl.s16  d4, d4
    vshl.i32    d5, d4, #4
    vadd.s32    d4, d4, d5
    vrshrn.s32  d4, q2, #5
    mov         r3, #0
    vtrn.16     d4, d5
    vadd.i16    d2, d4, d5
    vshl.i16    d3, d2, #2
    vrev64.16   d16, d16
    vsub.i16    d3, d3, d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2, d16, #4
    vsub.i16    d2, d2, d3
    vext.16     q0, q0, q0, #7
    vmov.16     d0[0], r3
    vmul.i16    q0, q0, d4[0]
    vdup.16     q1, d2[0]
    vdup.16     q3, d5[0]
    vadd.i16    q1, q1, q0
    mov         r3, #8
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16    q1, q1, q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc

function predict_8x16c_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {d0}, [r2,:64]
    vpaddl.u8   d0, d0
    vpadd.u16   d0, d0, d0
    vrshrn.u16  d0, q0, #2
    vdup.8      d1, d0[1]
    vdup.8      d0, d0[0]
    vtrn.32     d0, d1
    add         r2, r0, r1, lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    add         r2, r2, r1, lsl #2
    add         r0, r0, r1, lsl #2
.rept 4
    vst1.8      {d0}, [r0,:64], r1
    vst1.8      {d1}, [r2,:64], r1
.endr
    bx          lr
endfunc

function predict_8x16c_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip
    vld1.8      {d2[]}, [r1], ip
    vst1.64     {d0}, [r0,:64], ip
    vst1.64     {d2}, [r0,:64], ip
.endr
    bx          lr
endfunc

function predict_8x16c_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #4
    sub         r3, r3, #1
    vld1.32     {d0[0]}, [r3]
    vld1.32     {d2[0]}, [r2,:32], r1
    ldcol.8     d1, r3, r1
    add         r3, r3, r1
    ldcol.8     d3, r3, r1
    vrev64.32   d16, d3
    vaddl.u8    q8, d2, d16
    vrev32.8    d0, d0
    vsubl.u8    q2, d2, d0
    vrev64.8    d1, d1
    vsubl.u8    q3, d3, d1
    movrel      r3, p16weight
    vld1.16     {q0}, [r3,:128]
    vmul.s16    d4, d4, d0
    vmul.s16    q3, q3, q0
    vpadd.i16   d4, d4, d5
    vpadd.i16   d6, d6, d7
    vpaddl.s16  d4, d4                   @ d4[0] = H
    vpaddl.s16  d6, d6
    vpadd.s32   d6, d6                   @ d6[0] = V
    vshl.i32    d5, d4, #4
    vadd.s32    d4, d4, d5               @ d4[0] = 17*H
    vshl.i32    d7, d6, #2
    vrshrn.s32  d4, q2, #5               @ d4[0] = b
    vadd.s32    d6, d6, d7               @ d6[0] = 5*V
    vrshrn.s32  d6, q3, #6               @ d6[0] = c
    mov         r3, #0
    vshl.i16    d3, d4, #2
    vsub.i16    d3, d3, d4               @ d3[0] = 3 * b
    vshl.i16    d2, d6, #3
    vadd.i16    d3, d3, d2               @ d3[0] = 3 * b + 8 * c
    vsub.i16    d3, d3, d6               @ d3[0] = 3 * b + 7 * c
    vrev64.16   d16, d16
    vadd.i16    d16, d16, d0             @ d16[0] = src[7,-1] + src[-1,15] + 1
    vshl.i16    d2, d16, #4              @ d2[0] = a + 16
    vsub.i16    d2, d2, d3               @ d2[0] = i00
    vext.16     q0, q0, q0, #7
    vmov.16     d0[0], r3
    vmul.i16    q0, q0, d4[0]
    vdup.16     q1, d2[0]
    vdup.16     q3, d6[0]
    vadd.i16    q1, q1, q0
    mov         r3, #16
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16    q1, q1, q3
    vst1.8      {d0}, [r0,:64], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc
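
// 16x16 DC: dc = (16 top + 16 left + 16) >> 5. The full version sums the
// top row on the NEON side while the ARM core accumulates the left
// column byte by byte; the two totals meet in d0 before the broadcast,
// and the _top/_left variants reuse the store loop via pred16x16_dc_end.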
function predict_16x16_dc_top_neon
    sub         r2, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    vld1.8      {q0}, [r2,:128]
    add16x8     q0, d0, d1, d0, d1
    vrshrn.u16  d0, q0, #4
    vdup.8      q0, d0[0]
    b           pred16x16_dc_end
endfunc

function predict_16x16_dc_left_neon
    mov         r1, #FDEC_STRIDE
    sub         r2, r0, #1
    ldcol.8     d0, r2, r1
    ldcol.8     d1, r2, r1
    add16x8     q0, d0, d1, d0, d1
    vrshrn.u16  d0, q0, #4
    vdup.8      q0, d0[0]
    b           pred16x16_dc_end
endfunc

function predict_16x16_dc_neon
    sub         r3, r0, #FDEC_STRIDE
    sub         r0, r0, #1
    vld1.64     {d0-d1}, [r3,:128]
    ldrb        ip, [r0], #FDEC_STRIDE
    vaddl.u8    q0, d0, d1
    ldrb        r1, [r0], #FDEC_STRIDE
    vadd.u16    d0, d0, d1
    vpadd.u16   d0, d0, d0
    vpadd.u16   d0, d0, d0
.rept 4
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    ldrb        r1, [r0], #FDEC_STRIDE
    add         ip, ip, r3
.endr
    ldrb        r2, [r0], #FDEC_STRIDE
    add         ip, ip, r1
    ldrb        r3, [r0], #FDEC_STRIDE
    add         ip, ip, r2
    sub         r0, r0, #FDEC_STRIDE*16
    add         ip, ip, r3
    vdup.16     d1, ip
    vadd.u16    d0, d0, d1
    mov         r1, #FDEC_STRIDE
    add         r0, r0, #1
    vrshr.u16   d0, d0, #5
    vdup.8      q0, d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64     {d0-d1}, [r0,:128], r1
.endr
    bx          lr
endfunc

function predict_16x16_h_neon
    sub         r1, r0, #1
    mov         ip, #FDEC_STRIDE
.rept 8
    vld1.8      {d0[]}, [r1], ip
    vmov        d1, d0
    vld1.8      {d2[]}, [r1], ip
    vmov        d3, d2
    vst1.64     {d0-d1}, [r0,:128], ip
    vst1.64     {d2-d3}, [r0,:128], ip
.endr
    bx          lr
endfunc

function predict_16x16_v_neon
    sub         r0, r0, #FDEC_STRIDE
    mov         ip, #FDEC_STRIDE
    vld1.64     {d0-d1}, [r0,:128], ip
.rept 16
    vst1.64     {d0-d1}, [r0,:128], ip
.endr
    bx          lr
endfunc
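
// 16x16 plane prediction: same scheme as the chroma version, with
// weights 1..8 and b = (5*H + 32) >> 6, c = (5*V + 32) >> 6 (the
// vshll #2 / vaddw pair forms 5*{H,V}). Each loop iteration emits one
// 16-pixel row: q2 (8*b) advances to the right half, then q3 (c - 8*b)
// steps down to the start of the next row.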
function predict_16x16_p_neon
    sub         r3, r0, #FDEC_STRIDE
    mov         r1, #FDEC_STRIDE
    add         r2, r3, #8
    sub         r3, r3, #1
    vld1.8      {d0}, [r3]
    vld1.8      {d2}, [r2,:64], r1
    ldcol.8     d1, r3, r1
    add         r3, r3, r1
    ldcol.8     d3, r3, r1
    vrev64.8    q0, q0
    vaddl.u8    q8, d2, d3
    vsubl.u8    q2, d2, d0
    vsubl.u8    q3, d3, d1
    movrel      r3, p16weight
    vld1.8      {q0}, [r3,:128]
    vmul.s16    q2, q2, q0
    vmul.s16    q3, q3, q0
    vadd.i16    d4, d4, d5
    vadd.i16    d5, d6, d7
    vpadd.i16   d4, d4, d5
    vpadd.i16   d4, d4, d4
    vshll.s16   q3, d4, #2
    vaddw.s16   q2, q3, d4
    vrshrn.s32  d4, q2, #6
    mov         r3, #0
    vtrn.16     d4, d5
    vadd.i16    d2, d4, d5
    vshl.i16    d3, d2, #3
    vrev64.16   d16, d17
    vsub.i16    d3, d3, d2
    vadd.i16    d16, d16, d0
    vshl.i16    d2, d16, #4
    vsub.i16    d2, d2, d3
    vshl.i16    d3, d4, #4
    vext.16     q0, q0, q0, #7
    vsub.i16    d6, d5, d3
    vmov.16     d0[0], r3
    vmul.i16    q0, q0, d4[0]
    vdup.16     q1, d2[0]
    vdup.16     q2, d4[0]
    vdup.16     q3, d6[0]
    vshl.i16    q2, q2, #3
    vadd.i16    q1, q1, q0
    vadd.i16    q3, q3, q2
    mov         r3, #16
1:
    vqshrun.s16 d0, q1, #5
    vadd.i16    q1, q1, q2
    vqshrun.s16 d1, q1, #5
    vadd.i16    q1, q1, q3
    vst1.8      {q0}, [r0,:128], r1
    subs        r3, r3, #1
    bne         1b
    bx          lr
endfunc