;*****************************************************************************
;* predict-a.asm: x86 intra prediction
;*****************************************************************************
;* Copyright (C) 2005-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Fiona Glaser <fiona@x264.com>
;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
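; Lookup tables used below: pw_43210123 holds the horizontal offsets -3..4 for
; the chroma plane predictors, and pw_m3/pw_m7 the initial vertical factors
; (multiplied by c) for the 8x8 and 8x16 chroma cases. The shuf_* tables are
; pshufb masks: shuf_fixtr replicates the last top pixel when the top-right
; block is unavailable, shuf_nop is an identity shuffle, shuf_hu reverses the
; left column for the HU predictor, and pw_reverse reverses a vector of 8 words.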
pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 16 dw -3
pw_m7: times 16 dw -7
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
    db 0xff
shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
SECTION .text
cextern pb_0
cextern pb_1
cextern pb_3
cextern pw_1
cextern pw_2
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_00ff
cextern pw_pixel_max
cextern pw_0to15
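; STORE8/STORE16: broadcast-store helpers. STORE8 writes one register to 8
; consecutive rows of the destination block; STORE16 does the same for 16 rows,
; looping when given 2 or 4 registers per row (wider blocks / high bit depth).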
%macro STORE8 1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    add r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    mova [r0+2*FDEC_STRIDEB], %1
    mova [r0+3*FDEC_STRIDEB], %1
%endmacro
%macro STORE16 1-4
%if %0 > 1
    mov r1d, 2*%0
.loop:
    mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
    mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
    mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
    mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
%ifidn %0, 4
    mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
    mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
    mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
    mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
    add r0, 2*FDEC_STRIDEB
%else ; %0 == 2
    add r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
    mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
    mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
    mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
%endif
    dec r1d
    jg .loop
%else ; %0 == 1
    STORE8 %1
%if HIGH_BIT_DEPTH ; Different code paths to reduce code size
    add r0, 6*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
    add r0, 4*FDEC_STRIDEB
    mova [r0-2*FDEC_STRIDEB], %1
    mova [r0-1*FDEC_STRIDEB], %1
    mova [r0+0*FDEC_STRIDEB], %1
    mova [r0+1*FDEC_STRIDEB], %1
%else
    add r0, 8*FDEC_STRIDE
    mova [r0-4*FDEC_STRIDE], %1
    mova [r0-3*FDEC_STRIDE], %1
    mova [r0-2*FDEC_STRIDE], %1
    mova [r0-1*FDEC_STRIDE], %1
    mova [r0+0*FDEC_STRIDE], %1
    mova [r0+1*FDEC_STRIDE], %1
    mova [r0+2*FDEC_STRIDE], %1
    mova [r0+3*FDEC_STRIDE], %1
%endif ; HIGH_BIT_DEPTH
%endif
%endmacro
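; PRED_H_* helpers implement horizontal prediction: load the pixel to the left
; of a row, broadcast it across a full register, then store it over the row.
; PRED_H_4ROWS processes four rows and optionally advances the pointer.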
%macro PRED_H_LOAD 2 ; reg, offset
%if cpuflag(avx2)
    vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
%elif HIGH_BIT_DEPTH
    movd %1, [r0+(%2)*FDEC_STRIDEB-4]
    SPLATW %1, %1, 1
%else
    SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2
%endif
%endmacro
%macro PRED_H_STORE 3 ; reg, offset, width
%assign %%w %3*SIZEOF_PIXEL
%if %%w == 8
    movq [r0+(%2)*FDEC_STRIDEB], %1
%else
%assign %%i 0
%rep %%w/mmsize
    mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
%assign %%i %%i+mmsize
%endrep
%endif
%endmacro
%macro PRED_H_4ROWS 2 ; width, inc_ptr
    PRED_H_LOAD m0, 0
    PRED_H_LOAD m1, 1
    PRED_H_STORE m0, 0, %1
    PRED_H_STORE m1, 1, %1
    PRED_H_LOAD m0, 2
%if %2
    add r0, 4*FDEC_STRIDEB
%endif
    PRED_H_LOAD m1, 3-4*%2
    PRED_H_STORE m0, 2-4*%2, %1
    PRED_H_STORE m1, 3-4*%2, %1
%endmacro
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
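; The high bit depth path averages src with (left+right)>>1; the 8-bit path
; uses pavgb and subtracts the rounding carry ((left^right)&1) so that, despite
; pavg rounding up, both paths match the 3-tap formula above.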
%macro PRED8x8_LOWPASS 4-5
%if HIGH_BIT_DEPTH
    paddw %2, %3
    psrlw %2, 1
    pavgw %1, %4, %2
%else
    mova %5, %2
    pavgb %2, %3
    pxor %3, %5
    pand %3, [pb_1]
    psubusb %2, %3
    pavgb %1, %4, %2
%endif
%endmacro
;-----------------------------------------------------------------------------
; void predict_4x4_h( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM avx2
cglobal predict_4x4_h, 1,1
    PRED_H_4ROWS 4, 0
    RET
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_ddl( pixel *src )
;-----------------------------------------------------------------------------
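; Each output row of the diagonal-down-left predictor is the lowpass-filtered
; top/top-right row shifted by one extra pixel per row.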
%macro PREDICT_4x4_DDL 0
cglobal predict_4x4_ddl, 1,1
    movu m1, [r0-FDEC_STRIDEB]
    PSLLPIX m2, m1, 1
    mova m0, m1
%if HIGH_BIT_DEPTH
    PSRLPIX m1, m1, 1
    pshufhw m1, m1, q2210
%else
    pxor m1, m2
    PSRLPIX m1, m1, 1
    pxor m1, m0
%endif
    PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 0
%rep 4
    PSRLPIX m0, m0, 1
    movh [r0+Y*FDEC_STRIDEB], m0
%assign Y (Y+1)
%endrep
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_DDL
INIT_XMM avx
PREDICT_4x4_DDL
INIT_MMX mmx2
cglobal predict_4x4_ddl, 1,2
    movu m1, [r0-FDEC_STRIDEB+4]
    PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
    mova m3, [r0-FDEC_STRIDEB+8]
    mova [r0+0*FDEC_STRIDEB], m0
    pshufw m4, m3, q3321
    PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
    mova [r0+3*FDEC_STRIDEB], m2
    pshufw m1, m0, q0021
    punpckldq m1, m2
    mova [r0+1*FDEC_STRIDEB], m1
    psllq m0, 16
    PALIGNR m2, m0, 6, m0
    mova [r0+2*FDEC_STRIDEB], m2
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4_DDL
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH == 0
INIT_MMX ssse3
cglobal predict_4x4_vr, 1,1
    movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
    mova m4, m1
    palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
    pavgb m4, m1
    palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
    mova m0, m1
    palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
    mova m2, m1
    palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
    PRED8x8_LOWPASS m2, m0, m1, m2, m3
    pshufw m0, m2, 0
    psrlq m2, 16
    movd [r0+0*FDEC_STRIDEB], m4
    palignr m4, m0, 7
    movd [r0+1*FDEC_STRIDEB], m2
    psllq m0, 8
    movd [r0+2*FDEC_STRIDEB], m4
    palignr m2, m0, 7
    movd [r0+3*FDEC_STRIDEB], m2
    RET
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 4
cglobal predict_4x4_ddr, 1,1
%if HIGH_BIT_DEPTH
    movu m2, [r0-1*FDEC_STRIDEB-8]
    pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
    pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
    pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0
    movhps m3, [r0+3*FDEC_STRIDEB-8]
%else ; !HIGH_BIT_DEPTH
    movd m0, [r0+2*FDEC_STRIDEB-4]
    movd m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
    punpckhwd m0, m1
    movd m2, [r0-1*FDEC_STRIDEB]
%if cpuflag(ssse3)
    palignr m2, m0, 4
%else
    psllq m2, 32
    punpckhdq m0, m2
    SWAP 2, 0
%endif
    movd m3, [r0+3*FDEC_STRIDEB-4]
    psllq m3, 32
%endif ; !HIGH_BIT_DEPTH
    PSRLPIX m1, m2, 1
    mova m0, m2
    PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
    PRED8x8_LOWPASS m0, m2, m1, m0, m3
%assign Y 3
    movh [r0+Y*FDEC_STRIDEB], m0
%rep 3
%assign Y (Y-1)
    PSRLPIX m0, m0, 1
    movh [r0+Y*FDEC_STRIDEB], m0
%endrep
    RET
;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vr, 1,1
%if HIGH_BIT_DEPTH
    movu m1, [r0-1*FDEC_STRIDEB-8]
    pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
    pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
    pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0
%else ; !HIGH_BIT_DEPTH
    movd m0, [r0+2*FDEC_STRIDEB-4]
    movd m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
    punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
    punpckhwd m0, m1
    movd m1, [r0-1*FDEC_STRIDEB]
%if cpuflag(ssse3)
    palignr m1, m0, 4
%else
    psllq m1, 32
    punpckhdq m0, m1
    SWAP 1, 0
%endif
%endif ; !HIGH_BIT_DEPTH
    PSRLPIX m2, m1, 1
    PSRLPIX m0, m1, 2
    pavg%1 m4, m1, m2
    PSRLPIX m4, m4, 3
    PRED8x8_LOWPASS m2, m0, m1, m2, m3
    PSLLPIX m0, m2, 6
    PSRLPIX m2, m2, 2
    movh [r0+0*FDEC_STRIDEB], m4
    PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3
    movh [r0+1*FDEC_STRIDEB], m2
    PSLLPIX m0, m0, 1
    movh [r0+2*FDEC_STRIDEB], m4
    PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
    movh [r0+3*FDEC_STRIDEB], m2
    RET
;-----------------------------------------------------------------------------
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
%if HIGH_BIT_DEPTH
    movu m1, [r0-1*FDEC_STRIDEB-8]
    PSLLPIX m1, m1, 1
    pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
    pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2
    pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1
    pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0
%else
    movd m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
    punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
    PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. ..
    movd m1, [r0+3*FDEC_STRIDEB-4] ; l3
    punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
    movd m2, [r0+1*FDEC_STRIDEB-4] ; l1
    punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
    punpckh%3 m1, m2 ; l0 l1 l2 l3
    punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
%endif
    PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2
    PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1
    pavg%1 m5, m1, m2
    PRED8x8_LOWPASS m3, m1, m0, m2, m4
    punpckl%2 m5, m3
    PSRLPIX m3, m3, 4
    PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
%assign Y 3
    movh [r0+Y*FDEC_STRIDEB], m5
%rep 2
%assign Y (Y-1)
    PSRLPIX m5, m5, 2
    movh [r0+Y*FDEC_STRIDEB], m5
%endrep
    movh [r0+0*FDEC_STRIDEB], m3
    RET
%endmacro ; PREDICT_4x4
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_4x4_ddr, 1,1
    mova m0, [r0+1*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
    mova m3, [r0+3*FDEC_STRIDEB-8]
    punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
    punpckhdq m3, m0
    pshufw m0, m3, q3321
    pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3
    pshufw m1, m0, q3321
    PRED8x8_LOWPASS m0, m1, m3, m0
    movq [r0+3*FDEC_STRIDEB], m0
    movq m2, [r0-1*FDEC_STRIDEB-0]
    pshufw m4, m2, q2100
    pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0
    movq m1, m4
    PALIGNR m4, m3, 6, m3
    PRED8x8_LOWPASS m1, m4, m2, m1
    movq [r0+0*FDEC_STRIDEB], m1
    pshufw m2, m0, q3321
    punpckldq m2, m1
    psllq m0, 16
    PALIGNR m1, m0, 6, m0
    movq [r0+1*FDEC_STRIDEB], m1
    movq [r0+2*FDEC_STRIDEB], m2
    movd [r0+3*FDEC_STRIDEB+4], m1
    RET
;-----------------------------------------------------------------------------
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
    mova m0, [r0+1*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
    mova m1, [r0+3*FDEC_STRIDEB-8]
    punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
    punpckhdq m1, m0
    mova m0, m1
    movu m3, [r0-1*FDEC_STRIDEB-2]
    pshufw m4, m1, q0032
    mova m7, m3
    punpckldq m4, m3
    PALIGNR m3, m1, 2, m2
    PRED8x8_LOWPASS m2, m4, m1, m3
    pavgw m0, m3
    punpcklwd m5, m0, m2
    punpckhwd m4, m0, m2
    mova [r0+3*FDEC_STRIDEB], m5
    mova [r0+1*FDEC_STRIDEB], m4
    psrlq m5, 32
    punpckldq m5, m4
    mova [r0+2*FDEC_STRIDEB], m5
    pshufw m4, m7, q2100
    mova m6, [r0-1*FDEC_STRIDEB+0]
    pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0
    PRED8x8_LOWPASS m3, m4, m6, m7
    PALIGNR m3, m0, 6, m0
    mova [r0+0*FDEC_STRIDEB], m3
    RET
INIT_XMM sse2
PREDICT_4x4 w, wd, dq, qdq
INIT_XMM ssse3
PREDICT_4x4 w, wd, dq, qdq
INIT_XMM avx
PREDICT_4x4 w, wd, dq, qdq
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4 b, bw, wd, dq
INIT_MMX ssse3
%define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3
PREDICT_4x4 b, bw, wd, dq
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_hu( pixel *src )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
    movq m0, [r0+0*FDEC_STRIDEB-8]
    punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
    movq m1, [r0+2*FDEC_STRIDEB-8]
    punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
    punpckhdq m0, m1
    pshufw m1, m1, q3333
    movq [r0+3*FDEC_STRIDEB], m1
    pshufw m3, m0, q3321
    pshufw m4, m0, q3332
    pavgw m2, m0, m3
    PRED8x8_LOWPASS m3, m0, m4, m3
    punpcklwd m4, m2, m3
    mova [r0+0*FDEC_STRIDEB], m4
    psrlq m2, 16
    psrlq m3, 16
    punpcklwd m2, m3
    mova [r0+1*FDEC_STRIDEB], m2
    punpckhdq m2, m1
    mova [r0+2*FDEC_STRIDEB], m2
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
    movd m1, [r0+0*FDEC_STRIDEB-4]
    punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
    movd m0, [r0+2*FDEC_STRIDEB-4]
    punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
    punpckhwd m1, m0
    movq m0, m1
    punpckhbw m1, m1
    pshufw m1, m1, q3333
    punpckhdq m0, m1
    movq m2, m0
    movq m3, m0
    movq m5, m0
    psrlq m3, 8
    psrlq m2, 16
    pavgb m5, m3
    PRED8x8_LOWPASS m3, m0, m2, m3, m4
    movd [r0+3*FDEC_STRIDEB], m1
    punpcklbw m5, m3
    movd [r0+0*FDEC_STRIDEB], m5
    psrlq m5, 16
    movd [r0+1*FDEC_STRIDEB], m5
    psrlq m5, 16
    movd [r0+2*FDEC_STRIDEB], m5
    RET
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4_V1 1
cglobal predict_4x4_vl, 1,1
    movu m1, [r0-FDEC_STRIDEB]
    PSRLPIX m3, m1, 1
    PSRLPIX m2, m1, 2
    pavg%1 m4, m3, m1
    PRED8x8_LOWPASS m0, m1, m2, m3, m5
    movh [r0+0*FDEC_STRIDEB], m4
    movh [r0+1*FDEC_STRIDEB], m0
    PSRLPIX m4, m4, 1
    PSRLPIX m0, m0, 1
    movh [r0+2*FDEC_STRIDEB], m4
    movh [r0+3*FDEC_STRIDEB], m0
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_V1 w
INIT_XMM avx
PREDICT_4x4_V1 w
INIT_MMX mmx2
cglobal predict_4x4_vl, 1,4
    mova m1, [r0-FDEC_STRIDEB+0]
    mova m2, [r0-FDEC_STRIDEB+8]
    mova m0, m2
    PALIGNR m2, m1, 4, m4
    PALIGNR m0, m1, 2, m4
    mova m3, m0
    pavgw m3, m1
    mova [r0+0*FDEC_STRIDEB], m3
    psrlq m3, 16
    mova [r0+2*FDEC_STRIDEB], m3
    PRED8x8_LOWPASS m0, m1, m2, m0
    mova [r0+1*FDEC_STRIDEB], m0
    psrlq m0, 16
    mova [r0+3*FDEC_STRIDEB], m0
    movzx r1d, word [r0-FDEC_STRIDEB+ 8]
    movzx r2d, word [r0-FDEC_STRIDEB+10]
    movzx r3d, word [r0-FDEC_STRIDEB+12]
    lea r1d, [r1+r2+1]
    add r3d, r2d
    lea r3d, [r3+r1+1]
    shr r1d, 1
    shr r3d, 2
    mov [r0+2*FDEC_STRIDEB+6], r1w
    mov [r0+3*FDEC_STRIDEB+6], r3w
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
PREDICT_4x4_V1 b
%endif
;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
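; DC = (sum of the 4 top and 4 left neighbours + 4) >> 3, broadcast to all
; 16 pixels of the block.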
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,1
    mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
    psrlq m2, 48
    mova m0, [r0-FDEC_STRIDEB]
    HADDW m0, m1
    paddw m0, [pw_4]
    paddw m0, m2
    psrlw m0, 3
    SPLATW m0, m0
    mova [r0+0*FDEC_STRIDEB], m0
    mova [r0+1*FDEC_STRIDEB], m0
    mova [r0+2*FDEC_STRIDEB], m0
    mova [r0+3*FDEC_STRIDEB], m0
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,4
    pxor mm7, mm7
    movd mm0, [r0-FDEC_STRIDEB]
    psadbw mm0, mm7
    movd r3d, mm0
    movzx r1d, byte [r0-1]
%assign Y 1
%rep 3
    movzx r2d, byte [r0+FDEC_STRIDEB*Y-1]
    add r1d, r2d
%assign Y Y+1
%endrep
    lea r1d, [r1+r3+4]
    shr r1d, 3
    imul r1d, 0x01010101
    mov [r0+FDEC_STRIDEB*0], r1d
    mov [r0+FDEC_STRIDEB*1], r1d
    mov [r0+FDEC_STRIDEB*2], r1d
    mov [r0+FDEC_STRIDEB*3], r1d
    RET
%endif ; HIGH_BIT_DEPTH
%macro PREDICT_FILTER 4
;-----------------------------------------------------------------------------
; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
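; Builds the filtered edge[36] array consumed by the 8x8 predictors below:
; r2 holds the neighbour-availability flags and r3 selects which edges
; (left/top/top-right) to build. The filtered left column is stored below
; edge[16], the top row at edge[16..23] and the top-right extension above
; that, each smoothed with PRED8x8_LOWPASS.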
cglobal predict_8x8_filter, 4,6,6
    add r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%if ARCH_X86_64 == 0
    mov r4, r1
%define t1 r4
%define t4 r1
%else
%define t1 r1
%define t4 r4
%endif
    test r3b, 1
    je .check_top
    mov t4d, r2d
    and t4d, 8
    neg t4
    mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
    mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m1, m0
    mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    punpckh%2%3 m3, m2
    punpckh%3%4 m3, m1
    mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova m1, [src-1*FDEC_STRIDEB]
    PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2
    PRED8x8_LOWPASS m3, m1, m4, m3, m5
    mova [t1+8*SIZEOF_PIXEL], m3
    movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
    lea t4d, [t4*3+2]
    add t4d, r5d
    shr t4d, 2
    mov [t1+7*SIZEOF_PIXEL], t4%1
    mov [t1+6*SIZEOF_PIXEL], t4%1
    test r3b, 2
    je .done
.check_top:
%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
INIT_XMM cpuname
    movu m3, [src-1*FDEC_STRIDEB]
    movhps m0, [src-1*FDEC_STRIDEB-8]
    test r2b, 8
    je .fix_lt_2
.do_top:
    and r2d, 4
%ifdef PIC
    lea r3, [shuf_fixtr]
    pshufb m3, [r3+r2*4]
%else
    pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
%endif
    psrldq m1, m3, 15
    PALIGNR m2, m3, m0, 15, m0
    PALIGNR m1, m3, 1, m5
    PRED8x8_LOWPASS m0, m2, m1, m3, m5
    mova [t1+16*SIZEOF_PIXEL], m0
    psrldq m0, 15
    movd [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    pslldq m0, m3, 15
    jmp .do_top
%else
    mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
    mova m3, [src-1*FDEC_STRIDEB]
    mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
    test r2b, 8
    je .fix_lt_2
    test r2b, 4
    je .fix_tr_1
.do_top:
    PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
    PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
    PRED8x8_LOWPASS m4, m2, m0, m3, m5
    mova [t1+16*SIZEOF_PIXEL], m4
    test r3b, 4
    je .done
    PSRLPIX m5, m1, 7
    PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3
    PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4
    PRED8x8_LOWPASS m0, m2, m5, m1, m4
    mova [t1+24*SIZEOF_PIXEL], m0
    PSRLPIX m0, m0, 7
    movd [t1+32*SIZEOF_PIXEL], m0
.done:
    REP_RET
.fix_lt_2:
    PSLLPIX m0, m3, 7
    test r2b, 4
    jne .do_top
.fix_tr_1:
    punpckh%1%2 m1, m3, m3
    pshuf%2 m1, m1, q3333
    jmp .do_top
%endif
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_FILTER w, d, q, dq
INIT_XMM ssse3
PREDICT_FILTER w, d, q, dq
INIT_XMM avx
PREDICT_FILTER w, d, q, dq
%else
INIT_MMX mmx2
PREDICT_FILTER b, w, d, q
INIT_MMX ssse3
PREDICT_FILTER b, w, d, q
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_v( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_V 0
cglobal predict_8x8_v, 2,2
    mova m0, [r1+16*SIZEOF_PIXEL]
    STORE8 m0
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x8_V
%else
INIT_MMX mmx2
PREDICT_8x8_V
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_h( pixel *src, pixel edge[36] )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_H 2
cglobal predict_8x8_h, 2,2
    movu m1, [r1+7*SIZEOF_PIXEL]
    add r0, 4*FDEC_STRIDEB
    punpckl%1 m2, m1, m1
    punpckh%1 m1, m1
%assign Y 0
%rep 8
%assign i 1+Y/4
    SPLAT%2 m0, m %+ i, (3-Y)&3
    mova [r0+(Y-4)*FDEC_STRIDEB], m0
%assign Y Y+1
%endrep
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_H wd, D
%else
INIT_MMX mmx2
PREDICT_8x8_H bw, W
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
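; DC over the filtered edge: (sum of 8 left + 8 top edge pixels + 8) >> 4,
; broadcast to the whole 8x8 block.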
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal predict_8x8_dc, 2,2
    movu m0, [r1+14]
    paddw m0, [r1+32]
    HADDW m0, m1
    paddw m0, [pw_8]
    psrlw m0, 4
    SPLATW m0, m0
    STORE8 m0
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_8x8_dc, 2,2
    pxor mm0, mm0
    pxor mm1, mm1
    psadbw mm0, [r1+7]
    psadbw mm1, [r1+16]
    paddw mm0, [pw_8]
    paddw mm0, mm1
    psrlw mm0, 4
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    STORE8 mm0
    RET
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_dc_top ( pixel *src, pixel *edge );
; void predict_8x8_dc_left( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 3
cglobal %1, 2,2
    %3 m0, [r1+%2]
    HADDW m0, m1
    paddw m0, [pw_4]
    psrlw m0, 3
    SPLATW m0, m0
    STORE8 m0
    RET
%endmacro
INIT_XMM sse2
PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
cglobal %1, 2,2
    pxor mm0, mm0
    psadbw mm0, [r1+%2]
    paddw mm0, [pw_4]
    psrlw mm0, 3
    pshufw mm0, mm0, 0
    packuswb mm0, mm0
    STORE8 mm0
    RET
%endmacro
INIT_MMX
PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
%endif ; HIGH_BIT_DEPTH
; sse2 is faster even on amd for 8-bit, so there's no sense in spending exe
; size on the 8-bit mmx functions below if we know sse2 is available.
%macro PREDICT_8x8_DDLR 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2,7
    mova m0, [r1+16*SIZEOF_PIXEL]
    mova m1, [r1+24*SIZEOF_PIXEL]
%if cpuflag(cache64)
    movd m5, [r1+32*SIZEOF_PIXEL]
    palignr m3, m1, m0, 1*SIZEOF_PIXEL
    palignr m5, m5, m1, 1*SIZEOF_PIXEL
    palignr m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu m3, [r1+17*SIZEOF_PIXEL]
    movu m4, [r1+23*SIZEOF_PIXEL]
    movu m5, [r1+25*SIZEOF_PIXEL]
%endif
    PSLLPIX m2, m0, 1
    add r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m0, m2, m3, m0, m6
    PRED8x8_LOWPASS m1, m4, m5, m1, m6
    mova [r0+3*FDEC_STRIDEB], m1
%assign Y 2
%rep 6
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX m0, m0, 1
    mova [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y-1)
%endrep
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
    mova [r0+Y*FDEC_STRIDEB], m1
    RET
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2,7
    add r0, FDEC_STRIDEB*4
    mova m0, [r1+ 8*SIZEOF_PIXEL]
    mova m1, [r1+16*SIZEOF_PIXEL]
; edge[] is 32byte aligned, so some of the unaligned loads are known to be not cachesplit
    movu m2, [r1+ 7*SIZEOF_PIXEL]
    movu m5, [r1+17*SIZEOF_PIXEL]
%if cpuflag(cache64)
    palignr m3, m1, m0, 1*SIZEOF_PIXEL
    palignr m4, m1, m0, 7*SIZEOF_PIXEL
%else
    movu m3, [r1+ 9*SIZEOF_PIXEL]
    movu m4, [r1+15*SIZEOF_PIXEL]
%endif
    PRED8x8_LOWPASS m0, m2, m3, m0, m6
    PRED8x8_LOWPASS m1, m4, m5, m1, m6
    mova [r0+3*FDEC_STRIDEB], m0
%assign Y -4
%rep 6
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
    PSLLPIX m0, m0, 1
    mova [r0+Y*FDEC_STRIDEB], m1
%assign Y (Y+1)
%endrep
    PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
    mova [r0+Y*FDEC_STRIDEB], m1
    RET
%endmacro ; PREDICT_8x8_DDLR
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM cache64, ssse3
PREDICT_8x8_DDLR
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_DDLR
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_hu( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HU 2
cglobal predict_8x8_hu, 2,2,8
    add r0, 4*FDEC_STRIDEB
%if HIGH_BIT_DEPTH
%if cpuflag(ssse3)
    movu m5, [r1+7*SIZEOF_PIXEL]
    pshufb m5, [pw_reverse]
%else
    movq m6, [r1+7*SIZEOF_PIXEL]
    movq m5, [r1+11*SIZEOF_PIXEL]
    pshuflw m6, m6, q0123
    pshuflw m5, m5, q0123
    movlhps m5, m6
%endif ; cpuflag
    psrldq m2, m5, 2
    pshufd m3, m5, q0321
    pshufhw m2, m2, q2210
    pshufhw m3, m3, q1110
    pavgw m4, m5, m2
%else ; !HIGH_BIT_DEPTH
    movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
    psllq m1, 56 ; l7 .. .. .. .. .. .. ..
    mova m2, m0
    psllw m0, 8
    psrlw m2, 8
    por m2, m0
    mova m3, m2
    mova m4, m2
    mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0
    psrlq m3, 16
    psrlq m2, 8
    por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw m1, m1
    por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb m4, m2
%endif ; !HIGH_BIT_DEPTH
    PRED8x8_LOWPASS m2, m3, m5, m2, m6
    punpckh%2 m0, m4, m2 ; p8 p7 p6 p5
    punpckl%2 m4, m2 ; p4 p3 p2 p1
    PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3
    pshuf%1 m1, m0, q3321
    PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3
    pshuf%1 m2, m0, q3332
    PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3
    pshuf%1 m3, m0, q3333
    mova [r0-4*FDEC_STRIDEB], m4
    mova [r0-3*FDEC_STRIDEB], m5
    mova [r0-2*FDEC_STRIDEB], m6
    mova [r0-1*FDEC_STRIDEB], m7
    mova [r0+0*FDEC_STRIDEB], m0
    mova [r0+1*FDEC_STRIDEB], m1
    mova [r0+2*FDEC_STRIDEB], m2
    mova [r0+3*FDEC_STRIDEB], m3
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HU d, wd
INIT_XMM ssse3
PREDICT_8x8_HU d, wd
INIT_XMM avx
PREDICT_8x8_HU d, wd
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_HU w, bw
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_vr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_VR 1
cglobal predict_8x8_vr, 2,3
    mova m2, [r1+16*SIZEOF_PIXEL]
%ifidn cpuname, ssse3
    mova m0, [r1+8*SIZEOF_PIXEL]
    palignr m3, m2, m0, 7*SIZEOF_PIXEL
    palignr m1, m2, m0, 6*SIZEOF_PIXEL
%else
    movu m3, [r1+15*SIZEOF_PIXEL]
    movu m1, [r1+14*SIZEOF_PIXEL]
%endif
    pavg%1 m4, m3, m2
    add r0, FDEC_STRIDEB*4
    PRED8x8_LOWPASS m3, m1, m2, m3, m5
    mova [r0-4*FDEC_STRIDEB], m4
    mova [r0-3*FDEC_STRIDEB], m3
    mova m1, [r1+8*SIZEOF_PIXEL]
    PSLLPIX m0, m1, 1
    PSLLPIX m2, m1, 2
    PRED8x8_LOWPASS m0, m1, m2, m0, m6
%assign Y -2
%rep 5
    PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
    mova [r0+Y*FDEC_STRIDEB], m4
    PSLLPIX m0, m0, 1
    SWAP 3, 4
%assign Y (Y+1)
%endrep
    PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
    mova [r0+Y*FDEC_STRIDEB], m4
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VR w
INIT_XMM ssse3
PREDICT_8x8_VR w
INIT_XMM avx
PREDICT_8x8_VR w
%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_VR b
%endif
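; Broadcast the scalar plane parameters (i00 in r1m, b in r2m, c in r3m) into
; m0, m2 and m4 for the plane predictors below.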
%macro LOAD_PLANE_ARGS 0
%if cpuflag(avx2) && ARCH_X86_64 == 0
    vpbroadcastw m0, r1m
    vpbroadcastw m2, r2m
    vpbroadcastw m4, r3m
%elif mmsize == 8 ; MMX is only used on x86_32
    SPLATW m0, r1m
    SPLATW m2, r2m
    SPLATW m4, r3m
%else
    movd xm0, r1m
    movd xm2, r2m
    movd xm4, r3m
    SPLATW m0, xm0
    SPLATW m2, xm2
    SPLATW m4, xm4
%endif
%endmacro
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
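; Plane prediction: each output pixel is a clipped, >>5-scaled linear ramp
; i00 + b*x + c*y, with the constant term i00 precomputed by the caller.
; The loops keep the per-column horizontal ramp in registers and add c
; once per row (the exact folding of the constant differs between the
; 8-bit and high bit depth paths).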
%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
%macro PREDICT_CHROMA_P_MMX 1
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
    movq m1, m2
    pmullw m2, [pw_0to15]
    psllw m1, 2
    paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
    paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
    mov r1d, %1
    ALIGN 4
.loop:
    movq m5, m0
    movq m6, m1
    psraw m5, 5
    psraw m6, 5
    packuswb m5, m6
    movq [r0], m5
    paddsw m0, m4
    paddsw m1, m4
    add r0, FDEC_STRIDE
    dec r1d
    jg .loop
    RET
%endmacro ; PREDICT_CHROMA_P_MMX
INIT_MMX mmx2
PREDICT_CHROMA_P_MMX 8
PREDICT_CHROMA_P_MMX 16
%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
%macro PREDICT_CHROMA_P 1
%if HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2,7
    LOAD_PLANE_ARGS
    mova m3, [pw_pixel_max]
    pxor m1, m1
    pmullw m2, [pw_43210123] ; b
%if %1 == 16
    pmullw m5, m4, [pw_m7] ; c
%else
    pmullw m5, m4, [pw_m3]
%endif
    paddw m5, [pw_16]
%if mmsize == 32
    mova xm6, xm4
    paddw m4, m4
    paddw m5, m6
%endif
    mov r1d, %1/(mmsize/16)
.loop:
    paddsw m6, m2, m5
    paddsw m6, m0
    psraw m6, 5
    CLIPW m6, m1, m3
    paddw m5, m4
%if mmsize == 32
    vextracti128 [r0], m6, 1
    mova [r0+FDEC_STRIDEB], xm6
    add r0, 2*FDEC_STRIDEB
%else
    mova [r0], m6
    add r0, FDEC_STRIDEB
%endif
    dec r1d
    jg .loop
    RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
    LOAD_PLANE_ARGS
%if mmsize == 32
    vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
    pmullw m2, m1
    mova xm1, xm4 ; zero upper half
    paddsw m4, m4
    paddsw m0, m1
%else
    pmullw m2, [pw_0to15]
%endif
    paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
    paddsw m1, m0, m4
    paddsw m4, m4
    mov r1d, %1/(mmsize/8)
.loop:
    psraw m2, m0, 5
    psraw m3, m1, 5
    paddsw m0, m4
    paddsw m1, m4
    packuswb m2, m3
%if mmsize == 32
    movq [r0+FDEC_STRIDE*1], xm2
    movhps [r0+FDEC_STRIDE*3], xm2
    vextracti128 xm2, m2, 1
    movq [r0+FDEC_STRIDE*0], xm2
    movhps [r0+FDEC_STRIDE*2], xm2
%else
    movq [r0+FDEC_STRIDE*0], xm2
    movhps [r0+FDEC_STRIDE*1], xm2
%endif
    add r0, FDEC_STRIDE*mmsize/8
    dec r1d
    jg .loop
    RET
%endif ; HIGH_BIT_DEPTH
%endmacro ; PREDICT_CHROMA_P
INIT_XMM sse2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_XMM avx
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
INIT_YMM avx2
PREDICT_CHROMA_P 8
PREDICT_CHROMA_P 16
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
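; Same linear-ramp scheme as the chroma plane predictor above, extended to a
; 16-pixel-wide row and 16 rows.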
%if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
    LOAD_PLANE_ARGS
    movq mm5, mm2
    movq mm1, mm2
    pmullw mm5, [pw_0to15]
    psllw mm2, 3
    psllw mm1, 2
    movq mm3, mm2
    paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
    paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
    paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
    mov r1d, 16
    ALIGN 4
.loop:
    movq mm5, mm0
    movq mm6, mm1
    psraw mm5, 5
    psraw mm6, 5
    packuswb mm5, mm6
    movq [r0], mm5
    movq mm5, mm2
    movq mm6, mm3
    psraw mm5, 5
    psraw mm6, 5
    packuswb mm5, mm6
    movq [r0+8], mm5
    paddsw mm0, mm4
    paddsw mm1, mm4
    paddsw mm2, mm4
    paddsw mm3, mm4
    add r0, FDEC_STRIDE
    dec r1d
    jg .loop
    RET
%endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
%macro PREDICT_16x16_P 0
cglobal predict_16x16_p_core, 1,2,8
    movd m0, r1m
    movd m1, r2m
    movd m2, r3m
    SPLATW m0, m0, 0
    SPLATW m1, m1, 0
    SPLATW m2, m2, 0
    pmullw m3, m1, [pw_0to15]
    psllw m1, 3
%if HIGH_BIT_DEPTH
    pxor m6, m6
    mov r1d, 16
.loop:
    mova m4, m0
    mova m5, m0
    mova m7, m3
    paddsw m7, m6
    paddsw m4, m7
    paddsw m7, m1
    paddsw m5, m7
    psraw m4, 5
    psraw m5, 5
    CLIPW m4, [pb_0], [pw_pixel_max]
    CLIPW m5, [pb_0], [pw_pixel_max]
    mova [r0], m4
    mova [r0+16], m5
    add r0, FDEC_STRIDEB
    paddw m6, m2
%else ; !HIGH_BIT_DEPTH
    paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
    paddsw m7, m2, m2
    mov r1d, 8
    ALIGN 4
.loop:
    psraw m3, m0, 5
    psraw m4, m1, 5
    paddsw m5, m0, m2
    paddsw m6, m1, m2
    psraw m5, 5
    psraw m6, 5
    packuswb m3, m4
    packuswb m5, m6
    mova [r0+FDEC_STRIDE*0], m3
    mova [r0+FDEC_STRIDE*1], m5
    paddsw m0, m7
    paddsw m1, m7
    add r0, FDEC_STRIDE*2
%endif ; !HIGH_BIT_DEPTH
    dec r1d
    jg .loop
    RET
%endmacro ; PREDICT_16x16_P
INIT_XMM sse2
PREDICT_16x16_P
%if HIGH_BIT_DEPTH == 0
INIT_XMM avx
PREDICT_16x16_P
%endif
INIT_YMM avx2
cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
    LOAD_PLANE_ARGS
%if HIGH_BIT_DEPTH
    pmullw m2, [pw_0to15]
    pxor m5, m5
    pxor m6, m6
    mova m7, [pw_pixel_max]
    mov r1d, 8
.loop:
    paddsw m1, m2, m5
    paddw m5, m4
    paddsw m1, m0
    paddsw m3, m2, m5
    psraw m1, 5
    paddsw m3, m0
    psraw m3, 5
    CLIPW m1, m6, m7
    mova [r0+0*FDEC_STRIDEB], m1
    CLIPW m3, m6, m7
    mova [r0+1*FDEC_STRIDEB], m3
    paddw m5, m4
    add r0, 2*FDEC_STRIDEB
%else ; !HIGH_BIT_DEPTH
    vbroadcasti128 m1, [pw_0to15]
    mova xm3, xm4 ; zero high bits
    pmullw m1, m2
    psllw m2, 3
    paddsw m0, m3
    paddsw m0, m1 ; X+1*C X+0*C
    paddsw m1, m0, m2 ; Y+1*C Y+0*C
    paddsw m4, m4
    mov r1d, 4
.loop:
    psraw m2, m0, 5
    psraw m3, m1, 5
    paddsw m0, m4
    paddsw m1, m4
    packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C
    vextracti128 [r0+0*FDEC_STRIDE], m2, 1
    mova [r0+1*FDEC_STRIDE], xm2
    psraw m2, m0, 5
    psraw m3, m1, 5
    paddsw m0, m4
    paddsw m1, m4
    packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C
    vextracti128 [r0+2*FDEC_STRIDE], m2, 1
    mova [r0+3*FDEC_STRIDE], xm2
    add r0, FDEC_STRIDE*4
%endif ; !HIGH_BIT_DEPTH
    dec r1d
    jg .loop
    RET
%if HIGH_BIT_DEPTH == 0
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl, 2,2
    mova m0, [r1+16]
%ifidn cpuname, ssse3
    movd m2, [r1+32]
    palignr m2, m0, 1
%else
    movu m2, [r1+17]
%endif
    pslldq m1, m0, 1
    add r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3
%assign Y -4
%rep 8
    psrldq m0, 1
    movq [r0+Y*FDEC_STRIDE], m0
%assign Y (Y+1)
%endrep
    RET
%ifnidn cpuname, ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr, 2,2
    movu m0, [r1+8]
    movu m1, [r1+7]
    psrldq m2, m0, 1
    add r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m3
    psrldq m1, m0, 1
%assign Y 3
%rep 3
    movq [r0+Y*FDEC_STRIDE], m0
    movq [r0+(Y-1)*FDEC_STRIDE], m1
    psrldq m0, 2
    psrldq m1, 2
%assign Y (Y-2)
%endrep
    movq [r0-3*FDEC_STRIDE], m0
    movq [r0-4*FDEC_STRIDE], m1
    RET
;-----------------------------------------------------------------------------
; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl, 2,2
    mova m0, [r1+16]
    pslldq m1, m0, 1
    psrldq m2, m0, 1
    pavgb m3, m0, m2
    add r0, FDEC_STRIDE*4
    PRED8x8_LOWPASS m0, m1, m2, m0, m5
; m0: (t0 + 2*t1 + t2 + 2) >> 2
; m3: (t0 + t1 + 1) >> 1
%assign Y -4
%rep 3
    psrldq m0, 1
    movq [r0+ Y *FDEC_STRIDE], m3
    movq [r0+(Y+1)*FDEC_STRIDE], m0
    psrldq m3, 1
%assign Y (Y+2)
%endrep
    psrldq m0, 1
    movq [r0+ Y *FDEC_STRIDE], m3
    movq [r0+(Y+1)*FDEC_STRIDE], m0
    RET
%endif ; !ssse3
;-----------------------------------------------------------------------------
; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr, 2,2
    movu m2, [r1+8]
    add r0, 4*FDEC_STRIDE
    pslldq m1, m2, 2
    pslldq m0, m2, 1
    pavgb m3, m2, m0
    PRED8x8_LOWPASS m0, m2, m1, m0, m4
    movhps [r0-4*FDEC_STRIDE], m3
    movhps [r0-3*FDEC_STRIDE], m0
%if cpuflag(ssse3)
    punpckhqdq m3, m3
    pshufb m0, [shuf_vr]
    palignr m3, m0, 13
%else
    mova m2, m0
    mova m1, [pw_00ff]
    pand m1, m0
    psrlw m0, 8
    packuswb m1, m0
    pslldq m1, 4
    movhlps m3, m1
    shufps m1, m2, q3210
    psrldq m3, 5
    psrldq m1, 5
    SWAP 0, 1
%endif
    movq [r0+3*FDEC_STRIDE], m0
    movq [r0+2*FDEC_STRIDE], m3
    psrldq m0, 1
    psrldq m3, 1
    movq [r0+1*FDEC_STRIDE], m0
    movq [r0+0*FDEC_STRIDE], m3
    psrldq m0, 1
    psrldq m3, 1
    movq [r0-1*FDEC_STRIDE], m0
    movq [r0-2*FDEC_STRIDE], m3
    RET
%endmacro ; PREDICT_8x8
INIT_XMM sse2
PREDICT_8x8
INIT_XMM ssse3
PREDICT_8x8
INIT_XMM avx
PREDICT_8x8
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void predict_8x8_vl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_VL_10 1
cglobal predict_8x8_vl, 2,2,8
    mova m0, [r1+16*SIZEOF_PIXEL]
    mova m1, [r1+24*SIZEOF_PIXEL]
    PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4
    PSRLPIX m4, m1, 1
    pavg%1 m6, m0, m2
    pavg%1 m7, m1, m4
    add r0, FDEC_STRIDEB*4
    mova [r0-4*FDEC_STRIDEB], m6
    PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
    mova [r0-2*FDEC_STRIDEB], m3
    PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
    mova [r0+0*FDEC_STRIDEB], m3
    PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
    mova [r0+2*FDEC_STRIDEB], m7
    PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
    PSLLPIX m5, m0, 1
    PRED8x8_LOWPASS m0, m5, m2, m0, m7
    PRED8x8_LOWPASS m1, m3, m4, m1, m7
    PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
    mova [r0-3*FDEC_STRIDEB], m4
    PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
    mova [r0-1*FDEC_STRIDEB], m4
    PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
    mova [r0+1*FDEC_STRIDEB], m4
    PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
    mova [r0+3*FDEC_STRIDEB], m1
    RET
%endmacro
  1450. %if HIGH_BIT_DEPTH
  1451. INIT_XMM sse2
  1452. PREDICT_8x8_VL_10 w
  1453. INIT_XMM ssse3
  1454. PREDICT_8x8_VL_10 w
  1455. INIT_XMM avx
  1456. PREDICT_8x8_VL_10 w
  1457. %else
  1458. INIT_MMX mmx2
  1459. PREDICT_8x8_VL_10 b
  1460. %endif
  1461. ;-----------------------------------------------------------------------------
  1462. ; void predict_8x8_hd( pixel *src, pixel *edge )
  1463. ;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 2
cglobal predict_8x8_hd, 2,2
    add r0, 4*FDEC_STRIDEB
    mova m0, [r1+ 8*SIZEOF_PIXEL]   ; lt l0 l1 l2 l3 l4 l5 l6
    movu m1, [r1+ 7*SIZEOF_PIXEL]   ; l0 l1 l2 l3 l4 l5 l6 l7
%ifidn cpuname, ssse3
    mova m2, [r1+16*SIZEOF_PIXEL]   ; t7 t6 t5 t4 t3 t2 t1 t0
    mova m4, m2                     ; t7 t6 t5 t4 t3 t2 t1 t0
    palignr m2, m0, 7*SIZEOF_PIXEL  ; t6 t5 t4 t3 t2 t1 t0 lt
    palignr m4, m0, 1*SIZEOF_PIXEL  ; t0 lt l0 l1 l2 l3 l4 l5
%else
    movu m2, [r1+15*SIZEOF_PIXEL]
    movu m4, [r1+ 9*SIZEOF_PIXEL]
%endif ; cpuflag
    pavg%1 m3, m0, m1
    PRED8x8_LOWPASS m0, m4, m1, m0, m5
    PSRLPIX m4, m2, 2               ; .. .. t6 t5 t4 t3 t2 t1
    PSRLPIX m1, m2, 1               ; .. t6 t5 t4 t3 t2 t1 t0
    PRED8x8_LOWPASS m1, m4, m2, m1, m5 ; .. p11 p10 p9
    punpckh%2 m2, m3, m0            ; p8 p7 p6 p5
    punpckl%2 m3, m0                ; p4 p3 p2 p1
    mova [r0+3*FDEC_STRIDEB], m3
    PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5
    mova [r0+2*FDEC_STRIDEB], m0
    PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5
    mova [r0+1*FDEC_STRIDEB], m0
    PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3
    mova [r0+0*FDEC_STRIDEB], m0
    mova [r0-1*FDEC_STRIDEB], m2
    PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5
    mova [r0-2*FDEC_STRIDEB], m0
    PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5
    mova [r0-3*FDEC_STRIDEB], m0
    PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2
    mova [r0-4*FDEC_STRIDEB], m1
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HD w, wd
INIT_XMM ssse3
PREDICT_8x8_HD w, wd
INIT_XMM avx
PREDICT_8x8_HD w, wd
%else
INIT_MMX mmx2
PREDICT_8x8_HD b, bw

;-----------------------------------------------------------------------------
; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 0
cglobal predict_8x8_hd, 2,2
    add r0, 4*FDEC_STRIDE
    movu m1, [r1+7]
    movu m3, [r1+8]
    movu m2, [r1+9]
    pavgb m4, m1, m3
    PRED8x8_LOWPASS m0, m1, m2, m3, m5
    punpcklbw m4, m0
    movhlps m0, m4
%assign Y 3
%rep 3
    movq [r0+(Y)*FDEC_STRIDE], m4
    movq [r0+(Y-4)*FDEC_STRIDE], m0
    psrldq m4, 2
    psrldq m0, 2
%assign Y (Y-1)
%endrep
    movq [r0+(Y)*FDEC_STRIDE], m4
    movq [r0+(Y-4)*FDEC_STRIDE], m0
    RET
%endmacro

INIT_XMM sse2
PREDICT_8x8_HD
INIT_XMM avx
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
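; Illustrative note (a sketch, not taken from the upstream source):
; horizontal-up uses only the left column l0..l7. The code below reverses the
; column, replicates l7 past its end, then combines neighbouring samples with
; pavgb, (a + b + 1) >> 1, and PRED8x8_LOWPASS, (a + 2*b + c + 2) >> 2, and
; interleaves the results; each row is the previous one shifted by one sample
; pair, with the bottom rows saturating to l7.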
INIT_MMX
cglobal predict_8x8_hu_sse2, 2,2
    add r0, 4*FDEC_STRIDE
    movq mm1, [r1+7]        ; l0 l1 l2 l3 l4 l5 l6 l7
    pshufw mm0, mm1, q0123  ; l6 l7 l4 l5 l2 l3 l0 l1
    movq mm2, mm0
    psllw mm0, 8
    psrlw mm2, 8
    por mm2, mm0            ; l7 l6 l5 l4 l3 l2 l1 l0
    psllq mm1, 56           ; l7 .. .. .. .. .. .. ..
    movq mm3, mm2
    movq mm4, mm2
    movq mm5, mm2
    psrlq mm2, 8
    psrlq mm3, 16
    por mm2, mm1            ; l7 l7 l6 l5 l4 l3 l2 l1
    punpckhbw mm1, mm1
    por mm3, mm1            ; l7 l7 l7 l6 l5 l4 l3 l2
    pavgb mm4, mm2
    PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
    movq2dq xmm0, mm4
    movq2dq xmm1, mm1
    punpcklbw xmm0, xmm1
    punpckhbw mm4, mm1
%assign Y -4
%rep 3
    movq [r0+Y*FDEC_STRIDE], xmm0
    psrldq xmm0, 2
%assign Y (Y+1)
%endrep
    pshufw mm5, mm4, q3321
    pshufw mm6, mm4, q3332
    pshufw mm7, mm4, q3333
    movq [r0+Y*FDEC_STRIDE], xmm0
    movq [r0+0*FDEC_STRIDE], mm4
    movq [r0+1*FDEC_STRIDE], mm5
    movq [r0+2*FDEC_STRIDE], mm6
    movq [r0+3*FDEC_STRIDE], mm7
    RET

INIT_XMM
cglobal predict_8x8_hu_ssse3, 2,2
    add r0, 4*FDEC_STRIDE
    movq m3, [r1+7]
    pshufb m3, [shuf_hu]
    psrldq m1, m3, 1
    psrldq m2, m3, 2
    pavgb m0, m1, m3
    PRED8x8_LOWPASS m1, m3, m2, m1, m4
    punpcklbw m0, m1
%assign Y -4
%rep 3
    movq [r0+ Y *FDEC_STRIDE], m0
    movhps [r0+(Y+4)*FDEC_STRIDE], m0
    psrldq m0, 2
    pshufhw m0, m0, q2210
%assign Y (Y+1)
%endrep
    movq [r0+ Y *FDEC_STRIDE], m0
    movhps [r0+(Y+4)*FDEC_STRIDE], m0
    RET
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void predict_8x8c_v( uint8_t *src )
;-----------------------------------------------------------------------------
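; Rough C equivalent (a sketch, not taken from the upstream source; assumes
; FDEC_STRIDE is the pixel stride of the destination buffer):
;     for( int y = 0; y < 8; y++ )
;         for( int x = 0; x < 8; x++ )
;             src[x + y*FDEC_STRIDE] = src[x - FDEC_STRIDE];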
%macro PREDICT_8x8C_V 0
cglobal predict_8x8c_v, 1,1
    mova m0, [r0 - FDEC_STRIDEB]
    STORE8 m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x8C_V
%else
INIT_MMX mmx
PREDICT_8x8C_V
%endif

%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x8c_v_mmx, 1,1
    mova m0, [r0 - FDEC_STRIDEB]
    mova m1, [r0 - FDEC_STRIDEB + 8]
%assign Y 0
%rep 8
    mova [r0 + (Y&1)*FDEC_STRIDEB], m0
    mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
%if (Y&1) && (Y!=7)
    add r0, FDEC_STRIDEB*2
%endif
%assign Y Y+1
%endrep
    RET
%endif

%macro PREDICT_8x16C_V 0
cglobal predict_8x16c_v, 1,1
    mova m0, [r0 - FDEC_STRIDEB]
    STORE16 m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse
PREDICT_8x16C_V
%else
INIT_MMX mmx
PREDICT_8x16C_V
%endif

;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
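; Rough C equivalent (a sketch, not taken from the upstream source); the
; predict_8x16c_h variant below applies the same rule over 16 rows:
;     for( int y = 0; y < 8; y++ )
;         for( int x = 0; x < 8; x++ )
;             src[x + y*FDEC_STRIDE] = src[-1 + y*FDEC_STRIDE];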
%macro PREDICT_C_H 0
cglobal predict_8x8c_h, 1,1
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova m2, [pb_3]
%endif
    PRED_H_4ROWS 8, 1
    PRED_H_4ROWS 8, 0
    RET

cglobal predict_8x16c_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova m2, [pb_3]
%endif
    mov r1d, 4
.loop:
    PRED_H_4ROWS 8, 1
    dec r1d
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
PREDICT_C_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_C_H
INIT_XMM avx2
PREDICT_C_H
%else
INIT_MMX ssse3
PREDICT_C_H
%endif

;-----------------------------------------------------------------------------
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
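; Illustrative note (a sketch of the H.264 chroma DC rule as implemented below,
; not taken from the upstream source): with s0/s1 the sums of the left/right
; halves of the top row and s2/s3 the sums of the upper/lower halves of the
; left column, the four 4x4 quadrants are filled with
;     dc0 = (s0 + s2 + 4) >> 3     dc1 = (s1 + 2) >> 2
;     dc2 = (s3 + 2) >> 2          dc3 = (s1 + s3 + 4) >> 3
; which is what the pshufw/paddw/psrlw/pavgw sequence computes.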
%macro LOAD_LEFT 1
    movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
    add r1d, r2d
    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
    add r1d, r2d
    movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
    add r1d, r2d
%endmacro

%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
    pxor m7, m7
%if HIGH_BIT_DEPTH
    movq m0, [r0-FDEC_STRIDEB+0]
    movq m1, [r0-FDEC_STRIDEB+8]
    HADDW m0, m2
    HADDW m1, m2
%else ; !HIGH_BIT_DEPTH
    movd m0, [r0-FDEC_STRIDEB+0]
    movd m1, [r0-FDEC_STRIDEB+4]
    psadbw m0, m7           ; s0
    psadbw m1, m7           ; s1
%endif
    add r0, FDEC_STRIDEB*4
    LOAD_LEFT 0             ; s2
    movd m2, r1d
    LOAD_LEFT 4             ; s3
    movd m3, r1d
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpckldq m0, m2        ; s0, s1, s2, s3
    pshufw m3, m0, q3312    ; s2, s1, s3, s3
    pshufw m0, m0, q1310    ; s0, s1, s3, s1
    paddw m0, m3
    psrlw m0, 2
    pavgw m0, m7            ; s0+s2, s1, s3, s1+s3
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq xmm0, m0
    punpcklwd xmm0, xmm0
    pshufd xmm1, xmm0, q3322
    punpckldq xmm0, xmm0
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
%assign Y Y+1
%endrep
%else ; !sse2
    pshufw m1, m0, q0000
    pshufw m2, m0, q1111
    pshufw m3, m0, q2222
    pshufw m4, m0, q3333
%assign Y 0
%rep 8
%assign i (1 + (Y/4)*2)
%assign j (2 + (Y/4)*2)
    movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
    movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
%assign Y Y+1
%endrep
%endif
%else ; !HIGH_BIT_DEPTH
    packuswb m0, m0
    punpcklbw m0, m0
    movq m1, m0
    punpcklbw m0, m0
    punpckhbw m1, m1
%assign Y 0
%rep 8
%assign i (0 + (Y/4))
    movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
%assign Y Y+1
%endrep
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x8C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x8C_DC
%endif
%if HIGH_BIT_DEPTH
%macro STORE_4LINES 3
%if cpuflag(sse2)
    movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
    movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
%else
    movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
    movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
    movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
%endif
%endmacro
%else
%macro STORE_4LINES 2
    movq [r0+FDEC_STRIDEB*(%2-4)], %1
    movq [r0+FDEC_STRIDEB*(%2-3)], %1
    movq [r0+FDEC_STRIDEB*(%2-2)], %1
    movq [r0+FDEC_STRIDEB*(%2-1)], %1
%endmacro
%endif

%macro PREDICT_8x16C_DC 0
cglobal predict_8x16c_dc, 1,3
    pxor m7, m7
%if HIGH_BIT_DEPTH
    movq m0, [r0-FDEC_STRIDEB+0]
    movq m1, [r0-FDEC_STRIDEB+8]
    HADDW m0, m2
    HADDW m1, m2
%else
    movd m0, [r0-FDEC_STRIDEB+0]
    movd m1, [r0-FDEC_STRIDEB+4]
    psadbw m0, m7           ; s0
    psadbw m1, m7           ; s1
%endif
    punpcklwd m0, m1        ; s0, s1
    add r0, FDEC_STRIDEB*4
    LOAD_LEFT 0             ; s2
    pinsrw m0, r1d, 2
    LOAD_LEFT 4             ; s3
    pinsrw m0, r1d, 3       ; s0, s1, s2, s3
    add r0, FDEC_STRIDEB*8
    LOAD_LEFT 0             ; s4
    pinsrw m1, r1d, 2
    LOAD_LEFT 4             ; s5
    pinsrw m1, r1d, 3       ; s1, __, s4, s5
    sub r0, FDEC_STRIDEB*8
    pshufw m2, m0, q1310    ; s0, s1, s3, s1
    pshufw m0, m0, q3312    ; s2, s1, s3, s3
    pshufw m3, m1, q0302    ; s4, s1, s5, s1
    pshufw m1, m1, q3322    ; s4, s4, s5, s5
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 2
    psrlw m1, 2
    pavgw m0, m7
    pavgw m1, m7
%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
    movq2dq xmm0, m0
    movq2dq xmm1, m1
    punpcklwd xmm0, xmm0
    punpcklwd xmm1, xmm1
    pshufd xmm2, xmm0, q3322
    pshufd xmm3, xmm1, q3322
    punpckldq xmm0, xmm0
    punpckldq xmm1, xmm1
    STORE_4LINES xmm0, xmm0, 0
    STORE_4LINES xmm2, xmm2, 4
    STORE_4LINES xmm1, xmm1, 8
    STORE_4LINES xmm3, xmm3, 12
%else
    pshufw m2, m0, q0000
    pshufw m3, m0, q1111
    pshufw m4, m0, q2222
    pshufw m5, m0, q3333
    STORE_4LINES m2, m3, 0
    STORE_4LINES m4, m5, 4
    pshufw m2, m1, q0000
    pshufw m3, m1, q1111
    pshufw m4, m1, q2222
    pshufw m5, m1, q3333
    STORE_4LINES m2, m3, 8
    STORE_4LINES m4, m5, 12
%endif
%else
    packuswb m0, m0         ; dc0, dc1, dc2, dc3
    packuswb m1, m1         ; dc4, dc5, dc6, dc7
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw m2, m0, q1100
    pshufw m3, m0, q3322
    pshufw m4, m1, q1100
    pshufw m5, m1, q3322
    STORE_4LINES m2, 0
    STORE_4LINES m3, 4
    add r0, FDEC_STRIDEB*8
    STORE_4LINES m4, 0
    STORE_4LINES m5, 4
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_8x16C_DC
%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x16C_DC
%endif
%macro PREDICT_C_DC_TOP 1
%if HIGH_BIT_DEPTH
INIT_XMM
cglobal predict_8x%1c_dc_top_sse2, 1,1
    pxor m2, m2
    mova m0, [r0 - FDEC_STRIDEB]
    pshufd m1, m0, q2301
    paddw m0, m1
    pshuflw m1, m0, q2301
    pshufhw m1, m1, q2301
    paddw m0, m1
    psrlw m0, 1
    pavgw m0, m2
    STORE%1 m0
    RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x%1c_dc_top_mmx2, 1,1
    movq mm0, [r0 - FDEC_STRIDE]
    pxor mm1, mm1
    pxor mm2, mm2
    punpckhbw mm1, mm0
    punpcklbw mm0, mm2
    psadbw mm1, mm2         ; s1
    psadbw mm0, mm2         ; s0
    psrlw mm1, 1
    psrlw mm0, 1
    pavgw mm1, mm2
    pavgw mm0, mm2
    pshufw mm1, mm1, 0
    pshufw mm0, mm0, 0      ; dc0 (w)
    packuswb mm0, mm1       ; dc0,dc1 (b)
    STORE%1 mm0
    RET
%endif
%endmacro

PREDICT_C_DC_TOP 8
PREDICT_C_DC_TOP 16
;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_16x16_V 0
cglobal predict_16x16_v, 1,2
%assign %%i 0
%rep 16*SIZEOF_PIXEL/mmsize
    mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
%assign %%i %%i+1
%endrep
%if 16*SIZEOF_PIXEL/mmsize == 4
    STORE16 m0, m1, m2, m3
%elif 16*SIZEOF_PIXEL/mmsize == 2
    STORE16 m0, m1
%else
    STORE16 m0
%endif
    RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_V
INIT_XMM sse
PREDICT_16x16_V
%if HIGH_BIT_DEPTH
INIT_YMM avx
PREDICT_16x16_V
%endif

;-----------------------------------------------------------------------------
; void predict_16x16_h( pixel *src )
;-----------------------------------------------------------------------------
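; Rough C equivalent (a sketch, not taken from the upstream source): each row
; is filled with its own left neighbour:
;     for( int y = 0; y < 16; y++ )
;         for( int x = 0; x < 16; x++ )
;             src[x + y*FDEC_STRIDE] = src[-1 + y*FDEC_STRIDE];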
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
%if cpuflag(ssse3) && notcpuflag(avx2)
    mova m2, [pb_3]
%endif
    mov r1d, 4
.loop:
    PRED_H_4ROWS 16, 1
    dec r1d
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
PREDICT_16x16_H
%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_16x16_H
INIT_YMM avx2
PREDICT_16x16_H
%else
; no SSE2 for 8-bit: it's slower than MMX on all systems that don't support SSSE3
INIT_XMM ssse3
PREDICT_16x16_H
%endif

;-----------------------------------------------------------------------------
; void predict_16x16_dc( pixel *src )
;-----------------------------------------------------------------------------
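; Illustrative summary of the three variants below (a sketch, not taken from
; the upstream source): sum the available neighbours, round, and splat:
;     dc      = (sum(top 16 pixels) + sum(left 16 pixels) + 16) >> 5
;     dc_top  = (sum(top 16 pixels)  + 8) >> 4
;     dc_left = (sum(left 16 pixels) + 8) >> 4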
%if WIN64
DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
%else
DECLARE_REG_TMP 3
%endif

INIT_XMM
; Returns the sum of the left pixels in r1d+r2d
cglobal predict_16x16_dc_left_internal, 0,4
    movzx r1d, pixel [r0-SIZEOF_PIXEL]
    movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
%assign i 2*FDEC_STRIDEB
%rep 7
    movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
    add r1d, t0d
    movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
    add r2d, t0d
%assign i i+2*FDEC_STRIDEB
%endrep
    RET

%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
    mova xm0, [r0 - FDEC_STRIDEB+ 0]
    paddw xm0, [r0 - FDEC_STRIDEB+16]
    HADDW xm0, xm2
    paddw xm0, %1
    psrlw xm0, %2
    SPLATW m0, xm0
%if mmsize == 32
    STORE16 m0
%else
    STORE16 m0, m0
%endif
%else ; !HIGH_BIT_DEPTH
    pxor m0, m0
    psadbw m0, [r0 - FDEC_STRIDE]
    MOVHL m1, m0
    paddw m0, m1
    paddusw m0, %1
    psrlw m0, %2            ; dc
    SPLATW m0, m0
    packuswb m0, m0         ; dc in bytes
    STORE16 m0
%endif
%endmacro

%macro PREDICT_16x16_DC 0
cglobal predict_16x16_dc, 1,3
    call predict_16x16_dc_left_internal
    lea r1d, [r1+r2+16]
    movd xm3, r1d
    PRED16x16_DC xm3, 5
    RET

cglobal predict_16x16_dc_top, 1,2
    PRED16x16_DC [pw_8], 4
    RET

cglobal predict_16x16_dc_left, 1,3
    call predict_16x16_dc_left_internal
    lea r1d, [r1+r2+8]
    shr r1d, 4
    movd xm0, r1d
    SPLATW m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
    STORE16 m0, m0
%else
%if HIGH_BIT_DEPTH == 0
    packuswb m0, m0
%endif
    STORE16 m0
%endif
    RET
%endmacro

INIT_XMM sse2
PREDICT_16x16_DC
%if HIGH_BIT_DEPTH
INIT_YMM avx2
PREDICT_16x16_DC
%else
INIT_XMM avx2
PREDICT_16x16_DC
%endif