mc-a2.asm

  1. ;*****************************************************************************
  2. ;* mc-a2.asm: x86 motion compensation
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2018 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Fiona Glaser <fiona@x264.com>
  8. ;* Holger Lubitz <holger@lubitz.org>
  9. ;* Mathieu Monnier <manao@melix.net>
  10. ;* Oskar Arvidsson <oskar@irock.se>
  11. ;*
  12. ;* This program is free software; you can redistribute it and/or modify
  13. ;* it under the terms of the GNU General Public License as published by
  14. ;* the Free Software Foundation; either version 2 of the License, or
  15. ;* (at your option) any later version.
  16. ;*
  17. ;* This program is distributed in the hope that it will be useful,
  18. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. ;* GNU General Public License for more details.
  21. ;*
  22. ;* You should have received a copy of the GNU General Public License
  23. ;* along with this program; if not, write to the Free Software
  24. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  25. ;*
  26. ;* This program is also available under a commercial proprietary license.
  27. ;* For more information, contact us at licensing@x264.com.
  28. ;*****************************************************************************
  29. %include "x86inc.asm"
  30. %include "x86util.asm"
  31. SECTION_RODATA 64
  32. %if HIGH_BIT_DEPTH
  33. v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma
  34. db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20,
  35. db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62
  36. v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00
  37. v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15
  38. v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14
  39. ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
  40. v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
  41. dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
  42. copy_swap_shuf: SHUFFLE_MASK_W 1,0,3,2,5,4,7,6
  43. deinterleave_shuf: SHUFFLE_MASK_W 0,2,4,6,1,3,5,7
  44. deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
  45. deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
  46. %else
  47. deinterleave_rgb_shuf: db 0, 3, 6, 9, 0, 3, 6, 9, 1, 4, 7,10, 2, 5, 8,11
  48. db 0, 4, 8,12, 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14
  49. copy_swap_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
  50. deinterleave_shuf: db 0, 2, 4, 6, 8,10,12,14, 1, 3, 5, 7, 9,11,13,15
  51. deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
  52. deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
  53. %endif ; !HIGH_BIT_DEPTH
  54. pw_1024: times 16 dw 1024
  55. filt_mul20: times 32 db 20
  56. filt_mul15: times 16 db 1, -5
  57. filt_mul51: times 16 db -5, 1
  58. hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
  59. mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
  60. mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
  61. db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
  62. ; bits 0-3: pshufb, bits 4-7: AVX-512 vpermq
  63. mbtree_fix8_pack_shuf: db 0x01,0x20,0x43,0x62,0x15,0x34,0x57,0x76,0x09,0x08,0x0b,0x0a,0x0d,0x0c,0x0f,0x0e
  64. pf_256: times 4 dd 256.0
  65. pf_inv16777216: times 4 dd 0x1p-24
  66. pd_16: times 4 dd 16
  67. pad10: times 8 dw 10*PIXEL_MAX
  68. pad20: times 8 dw 20*PIXEL_MAX
  69. pad30: times 8 dw 30*PIXEL_MAX
  70. depad: times 4 dd 32*20*PIXEL_MAX + 512
  71. tap1: times 4 dw 1, -5
  72. tap2: times 4 dw 20, 20
  73. tap3: times 4 dw -5, 1
  74. pw_0xc000: times 8 dw 0xc000
  75. pw_31: times 8 dw 31
  76. pd_4: times 4 dd 4
  77. SECTION .text
  78. cextern pb_0
  79. cextern pw_1
  80. cextern pw_8
  81. cextern pw_16
  82. cextern pw_32
  83. cextern pw_512
  84. cextern pw_00ff
  85. cextern pw_3fff
  86. cextern pw_pixel_max
  87. cextern pw_0to15
  88. cextern pd_8
  89. cextern pd_0123
  90. cextern pd_ffff
  91. cextern deinterleave_shufd
  92. %macro LOAD_ADD 4
  93. movh %4, %3
  94. movh %1, %2
  95. punpcklbw %4, m0
  96. punpcklbw %1, m0
  97. paddw %1, %4
  98. %endmacro
  99. %macro LOAD_ADD_2 6
  100. mova %5, %3
  101. mova %1, %4
  102. punpckhbw %6, %5, m0
  103. punpcklbw %5, m0
  104. punpckhbw %2, %1, m0
  105. punpcklbw %1, m0
  106. paddw %1, %5
  107. paddw %2, %6
  108. %endmacro
  109. %macro FILT_V2 6
  110. psubw %1, %2 ; a-b
  111. psubw %4, %5
  112. psubw %2, %3 ; b-c
  113. psubw %5, %6
  114. psllw %2, 2
  115. psllw %5, 2
  116. psubw %1, %2 ; a-5*b+4*c
  117. psllw %3, 4
  118. psubw %4, %5
  119. psllw %6, 4
  120. paddw %1, %3 ; a-5*b+20*c
  121. paddw %4, %6
  122. %endmacro
  123. %macro FILT_H 3
  124. psubw %1, %2 ; a-b
  125. psraw %1, 2 ; (a-b)/4
  126. psubw %1, %2 ; (a-b)/4-b
  127. paddw %1, %3 ; (a-b)/4-b+c
  128. psraw %1, 2 ; ((a-b)/4-b+c)/4
  129. paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
  130. %endmacro
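; Note: in the callers below, a/b/c (%1/%2/%3) are the symmetric pair sums
; x[-2]+x[3], x[-1]+x[2] and x[0]+x[1] of the row being filtered (pixels for
; the h filter, vertical intermediates for the c filter), so the shift/add
; sequence above effectively evaluates (a - 5*b + 20*c)/16, i.e. the 6-tap
; kernel (1,-5,20,20,-5,1) scaled by 1/16. The final rounding and shift are
; applied later in FILT_PACK.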
  131. %macro FILT_H2 6
  132. psubw %1, %2
  133. psubw %4, %5
  134. psraw %1, 2
  135. psraw %4, 2
  136. psubw %1, %2
  137. psubw %4, %5
  138. paddw %1, %3
  139. paddw %4, %6
  140. psraw %1, 2
  141. psraw %4, 2
  142. paddw %1, %3
  143. paddw %4, %6
  144. %endmacro
  145. %macro FILT_PACK 3-5
  146. %if cpuflag(ssse3)
  147. pmulhrsw %1, %3
  148. pmulhrsw %2, %3
  149. %else
  150. paddw %1, %3
  151. paddw %2, %3
  152. %if %0 == 5
  153. psubusw %1, %5
  154. psubusw %2, %5
  155. psrlw %1, %4
  156. psrlw %2, %4
  157. %else
  158. psraw %1, %4
  159. psraw %2, %4
  160. %endif
  161. %endif
  162. %if HIGH_BIT_DEPTH == 0
  163. packuswb %1, %2
  164. %endif
  165. %endmacro
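; Note: FILT_PACK performs the final rounding of the filter sums. With SSSE3,
; %3 is a pmulhrsw multiplier, e.g. pw_1024 gives (x*1024 + 0x4000) >> 15 =
; (x+16) >> 5, a rounded 5-bit shift; without SSSE3 the same result comes from
; adding the rounding bias %3 and shifting right by %4. The optional 5th
; argument (high-bit-depth path) is removed with psubusw: the callers bias the
; sums beforehand (pad10/pad20/pad30) so the saturating subtract both undoes
; the bias and clamps negative results to zero.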
  166. ;The hpel_filter routines use non-temporal writes for output.
  167. ;The following defines may be uncommented for testing.
  168. ;Using temporal (cached) stores in hpel_filter may be a win if the last-level
  169. ;cache is big enough (preliminary benchmarks suggest on the order of 4x the frame size).
  170. ;%define movntq movq
  171. ;%define movntps movaps
  172. ;%define sfence
  173. %if HIGH_BIT_DEPTH
  174. ;-----------------------------------------------------------------------------
  175. ; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
  176. ;-----------------------------------------------------------------------------
  177. %macro HPEL_FILTER 0
  178. cglobal hpel_filter_v, 5,6,11
  179. FIX_STRIDES r3, r4
  180. lea r5, [r1+r3]
  181. sub r1, r3
  182. sub r1, r3
  183. %if num_mmregs > 8
  184. mova m8, [pad10]
  185. mova m9, [pad20]
  186. mova m10, [pad30]
  187. %define s10 m8
  188. %define s20 m9
  189. %define s30 m10
  190. %else
  191. %define s10 [pad10]
  192. %define s20 [pad20]
  193. %define s30 [pad30]
  194. %endif
  195. add r0, r4
  196. add r2, r4
  197. neg r4
  198. mova m7, [pw_pixel_max]
  199. pxor m0, m0
  200. .loop:
  201. mova m1, [r1]
  202. mova m2, [r1+r3]
  203. mova m3, [r1+r3*2]
  204. mova m4, [r1+mmsize]
  205. mova m5, [r1+r3+mmsize]
  206. mova m6, [r1+r3*2+mmsize]
  207. paddw m1, [r5+r3*2]
  208. paddw m2, [r5+r3]
  209. paddw m3, [r5]
  210. paddw m4, [r5+r3*2+mmsize]
  211. paddw m5, [r5+r3+mmsize]
  212. paddw m6, [r5+mmsize]
  213. add r1, 2*mmsize
  214. add r5, 2*mmsize
  215. FILT_V2 m1, m2, m3, m4, m5, m6
  216. mova m6, [pw_16]
  217. psubw m1, s20
  218. psubw m4, s20
  219. mova [r2+r4], m1
  220. mova [r2+r4+mmsize], m4
  221. paddw m1, s30
  222. paddw m4, s30
  223. FILT_PACK m1, m4, m6, 5, s10
  224. CLIPW m1, m0, m7
  225. CLIPW m4, m0, m7
  226. mova [r0+r4], m1
  227. mova [r0+r4+mmsize], m4
  228. add r4, 2*mmsize
  229. jl .loop
  230. RET
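; Note: the loop above produces two outputs per column: the (biased) 16-bit
; vertical-filter sums go to buf (r2) for reuse by hpel_filter_c, and the
; rounded, clipped pixels go to dst (r0).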
  231. ;-----------------------------------------------------------------------------
  232. ; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
  233. ;-----------------------------------------------------------------------------
  234. cglobal hpel_filter_c, 3,3,10
  235. add r2, r2
  236. add r0, r2
  237. add r1, r2
  238. neg r2
  239. mova m0, [tap1]
  240. mova m7, [tap3]
  241. %if num_mmregs > 8
  242. mova m8, [tap2]
  243. mova m9, [depad]
  244. %define s1 m8
  245. %define s2 m9
  246. %else
  247. %define s1 [tap2]
  248. %define s2 [depad]
  249. %endif
  250. .loop:
  251. movu m1, [r1+r2-4]
  252. movu m2, [r1+r2-2]
  253. mova m3, [r1+r2+0]
  254. movu m4, [r1+r2+2]
  255. movu m5, [r1+r2+4]
  256. movu m6, [r1+r2+6]
  257. pmaddwd m1, m0
  258. pmaddwd m2, m0
  259. pmaddwd m3, s1
  260. pmaddwd m4, s1
  261. pmaddwd m5, m7
  262. pmaddwd m6, m7
  263. paddd m1, s2
  264. paddd m2, s2
  265. paddd m3, m5
  266. paddd m4, m6
  267. paddd m1, m3
  268. paddd m2, m4
  269. psrad m1, 10
  270. psrad m2, 10
  271. pslld m2, 16
  272. pand m1, [pd_ffff]
  273. por m1, m2
  274. CLIPW m1, [pb_0], [pw_pixel_max]
  275. mova [r0+r2], m1
  276. add r2, mmsize
  277. jl .loop
  278. RET
  279. ;-----------------------------------------------------------------------------
  280. ; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
  281. ;-----------------------------------------------------------------------------
  282. cglobal hpel_filter_h, 3,4,8
  283. %define src r1+r2
  284. add r2, r2
  285. add r0, r2
  286. add r1, r2
  287. neg r2
  288. mova m0, [pw_pixel_max]
  289. .loop:
  290. movu m1, [src-4]
  291. movu m2, [src-2]
  292. mova m3, [src+0]
  293. movu m6, [src+2]
  294. movu m4, [src+4]
  295. movu m5, [src+6]
  296. paddw m3, m6 ; c0
  297. paddw m2, m4 ; b0
  298. paddw m1, m5 ; a0
  299. %if mmsize == 16
  300. movu m4, [src-4+mmsize]
  301. movu m5, [src-2+mmsize]
  302. %endif
  303. movu m7, [src+4+mmsize]
  304. movu m6, [src+6+mmsize]
  305. paddw m5, m7 ; b1
  306. paddw m4, m6 ; a1
  307. movu m7, [src+2+mmsize]
  308. mova m6, [src+0+mmsize]
  309. paddw m6, m7 ; c1
  310. FILT_H2 m1, m2, m3, m4, m5, m6
  311. mova m7, [pw_1]
  312. pxor m2, m2
  313. FILT_PACK m1, m4, m7, 1
  314. CLIPW m1, m2, m0
  315. CLIPW m4, m2, m0
  316. mova [r0+r2], m1
  317. mova [r0+r2+mmsize], m4
  318. add r2, mmsize*2
  319. jl .loop
  320. RET
  321. %endmacro ; HPEL_FILTER
  322. INIT_MMX mmx2
  323. HPEL_FILTER
  324. INIT_XMM sse2
  325. HPEL_FILTER
  326. %endif ; HIGH_BIT_DEPTH
  327. %if HIGH_BIT_DEPTH == 0
  328. %macro HPEL_V 1
  329. ;-----------------------------------------------------------------------------
  330. ; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
  331. ;-----------------------------------------------------------------------------
  332. cglobal hpel_filter_v, 5,6,%1
  333. lea r5, [r1+r3]
  334. sub r1, r3
  335. sub r1, r3
  336. add r0, r4
  337. lea r2, [r2+r4*2]
  338. neg r4
  339. %if cpuflag(ssse3)
  340. mova m0, [filt_mul15]
  341. %else
  342. pxor m0, m0
  343. %endif
  344. .loop:
  345. %if cpuflag(ssse3)
  346. mova m1, [r1]
  347. mova m4, [r1+r3]
  348. mova m2, [r5+r3*2]
  349. mova m5, [r5+r3]
  350. mova m3, [r1+r3*2]
  351. mova m6, [r5]
  352. SBUTTERFLY bw, 1, 4, 7
  353. SBUTTERFLY bw, 2, 5, 7
  354. SBUTTERFLY bw, 3, 6, 7
  355. pmaddubsw m1, m0
  356. pmaddubsw m4, m0
  357. pmaddubsw m2, m0
  358. pmaddubsw m5, m0
  359. pmaddubsw m3, [filt_mul20]
  360. pmaddubsw m6, [filt_mul20]
  361. paddw m1, m2
  362. paddw m4, m5
  363. paddw m1, m3
  364. paddw m4, m6
  365. mova m7, [pw_1024]
  366. %else
  367. LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
  368. LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
  369. LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
  370. LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
  371. FILT_V2 m1, m2, m3, m4, m5, m6
  372. mova m7, [pw_16]
  373. %endif
  374. %if mmsize==32
  375. mova [r2+r4*2], xm1
  376. mova [r2+r4*2+mmsize/2], xm4
  377. vextracti128 [r2+r4*2+mmsize], m1, 1
  378. vextracti128 [r2+r4*2+mmsize*3/2], m4, 1
  379. %else
  380. mova [r2+r4*2], m1
  381. mova [r2+r4*2+mmsize], m4
  382. %endif
  383. FILT_PACK m1, m4, m7, 5
  384. movnta [r0+r4], m1
  385. add r1, mmsize
  386. add r5, mmsize
  387. add r4, mmsize
  388. jl .loop
  389. RET
  390. %endmacro
  391. ;-----------------------------------------------------------------------------
  392. ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
  393. ;-----------------------------------------------------------------------------
  394. INIT_MMX mmx2
  395. cglobal hpel_filter_c, 3,3
  396. add r0, r2
  397. lea r1, [r1+r2*2]
  398. neg r2
  399. %define src r1+r2*2
  400. movq m7, [pw_32]
  401. .loop:
  402. movq m1, [src-4]
  403. movq m2, [src-2]
  404. movq m3, [src ]
  405. movq m4, [src+4]
  406. movq m5, [src+6]
  407. paddw m3, [src+2] ; c0
  408. paddw m2, m4 ; b0
  409. paddw m1, m5 ; a0
  410. movq m6, [src+8]
  411. paddw m4, [src+14] ; a1
  412. paddw m5, [src+12] ; b1
  413. paddw m6, [src+10] ; c1
  414. FILT_H2 m1, m2, m3, m4, m5, m6
  415. FILT_PACK m1, m4, m7, 6
  416. movntq [r0+r2], m1
  417. add r2, 8
  418. jl .loop
  419. RET
  420. ;-----------------------------------------------------------------------------
  421. ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
  422. ;-----------------------------------------------------------------------------
  423. INIT_MMX mmx2
  424. cglobal hpel_filter_h, 3,3
  425. add r0, r2
  426. add r1, r2
  427. neg r2
  428. %define src r1+r2
  429. pxor m0, m0
  430. .loop:
  431. movd m1, [src-2]
  432. movd m2, [src-1]
  433. movd m3, [src ]
  434. movd m6, [src+1]
  435. movd m4, [src+2]
  436. movd m5, [src+3]
  437. punpcklbw m1, m0
  438. punpcklbw m2, m0
  439. punpcklbw m3, m0
  440. punpcklbw m6, m0
  441. punpcklbw m4, m0
  442. punpcklbw m5, m0
  443. paddw m3, m6 ; c0
  444. paddw m2, m4 ; b0
  445. paddw m1, m5 ; a0
  446. movd m7, [src+7]
  447. movd m6, [src+6]
  448. punpcklbw m7, m0
  449. punpcklbw m6, m0
  450. paddw m4, m7 ; c1
  451. paddw m5, m6 ; b1
  452. movd m7, [src+5]
  453. movd m6, [src+4]
  454. punpcklbw m7, m0
  455. punpcklbw m6, m0
  456. paddw m6, m7 ; a1
  457. movq m7, [pw_1]
  458. FILT_H2 m1, m2, m3, m4, m5, m6
  459. FILT_PACK m1, m4, m7, 1
  460. movntq [r0+r2], m1
  461. add r2, 8
  462. jl .loop
  463. RET
  464. %macro HPEL_C 0
  465. ;-----------------------------------------------------------------------------
  466. ; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
  467. ;-----------------------------------------------------------------------------
  468. cglobal hpel_filter_c, 3,3,9
  469. add r0, r2
  470. lea r1, [r1+r2*2]
  471. neg r2
  472. %define src r1+r2*2
  473. %ifnidn cpuname, sse2
  474. %if cpuflag(ssse3)
  475. mova m7, [pw_512]
  476. %else
  477. mova m7, [pw_32]
  478. %endif
  479. %define pw_rnd m7
  480. %elif ARCH_X86_64
  481. mova m8, [pw_32]
  482. %define pw_rnd m8
  483. %else
  484. %define pw_rnd [pw_32]
  485. %endif
  486. ; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
  487. %if mmsize==32
  488. .loop:
  489. movu m4, [src-4]
  490. movu m5, [src-2]
  491. mova m6, [src+0]
  492. movu m3, [src-4+mmsize]
  493. movu m2, [src-2+mmsize]
  494. mova m1, [src+0+mmsize]
  495. paddw m4, [src+6]
  496. paddw m5, [src+4]
  497. paddw m6, [src+2]
  498. paddw m3, [src+6+mmsize]
  499. paddw m2, [src+4+mmsize]
  500. paddw m1, [src+2+mmsize]
  501. FILT_H2 m4, m5, m6, m3, m2, m1
  502. %else
  503. mova m0, [src-16]
  504. mova m1, [src]
  505. .loop:
  506. mova m2, [src+16]
  507. PALIGNR m4, m1, m0, 12, m7
  508. PALIGNR m5, m1, m0, 14, m0
  509. PALIGNR m0, m2, m1, 6, m7
  510. paddw m4, m0
  511. PALIGNR m0, m2, m1, 4, m7
  512. paddw m5, m0
  513. PALIGNR m6, m2, m1, 2, m7
  514. paddw m6, m1
  515. FILT_H m4, m5, m6
  516. mova m0, m2
  517. mova m5, m2
  518. PALIGNR m2, m1, 12, m7
  519. PALIGNR m5, m1, 14, m1
  520. mova m1, [src+32]
  521. PALIGNR m3, m1, m0, 6, m7
  522. paddw m3, m2
  523. PALIGNR m6, m1, m0, 4, m7
  524. paddw m5, m6
  525. PALIGNR m6, m1, m0, 2, m7
  526. paddw m6, m0
  527. FILT_H m3, m5, m6
  528. %endif
  529. FILT_PACK m4, m3, pw_rnd, 6
  530. %if mmsize==32
  531. vpermq m4, m4, q3120
  532. %endif
  533. movnta [r0+r2], m4
  534. add r2, mmsize
  535. jl .loop
  536. RET
  537. %endmacro
  538. ;-----------------------------------------------------------------------------
  539. ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
  540. ;-----------------------------------------------------------------------------
  541. INIT_XMM sse2
  542. cglobal hpel_filter_h, 3,3,8
  543. add r0, r2
  544. add r1, r2
  545. neg r2
  546. %define src r1+r2
  547. pxor m0, m0
  548. .loop:
  549. movh m1, [src-2]
  550. movh m2, [src-1]
  551. movh m3, [src ]
  552. movh m4, [src+1]
  553. movh m5, [src+2]
  554. movh m6, [src+3]
  555. punpcklbw m1, m0
  556. punpcklbw m2, m0
  557. punpcklbw m3, m0
  558. punpcklbw m4, m0
  559. punpcklbw m5, m0
  560. punpcklbw m6, m0
  561. paddw m3, m4 ; c0
  562. paddw m2, m5 ; b0
  563. paddw m1, m6 ; a0
  564. movh m4, [src+6]
  565. movh m5, [src+7]
  566. movh m6, [src+10]
  567. movh m7, [src+11]
  568. punpcklbw m4, m0
  569. punpcklbw m5, m0
  570. punpcklbw m6, m0
  571. punpcklbw m7, m0
  572. paddw m5, m6 ; b1
  573. paddw m4, m7 ; a1
  574. movh m6, [src+8]
  575. movh m7, [src+9]
  576. punpcklbw m6, m0
  577. punpcklbw m7, m0
  578. paddw m6, m7 ; c1
  579. mova m7, [pw_1] ; FIXME xmm8
  580. FILT_H2 m1, m2, m3, m4, m5, m6
  581. FILT_PACK m1, m4, m7, 1
  582. movntps [r0+r2], m1
  583. add r2, 16
  584. jl .loop
  585. RET
  586. ;-----------------------------------------------------------------------------
  587. ; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
  588. ;-----------------------------------------------------------------------------
  589. %macro HPEL_H 0
  590. cglobal hpel_filter_h, 3,3
  591. add r0, r2
  592. add r1, r2
  593. neg r2
  594. %define src r1+r2
  595. mova m0, [src-16]
  596. mova m1, [src]
  597. mova m7, [pw_1024]
  598. .loop:
  599. mova m2, [src+16]
  600. ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
  601. ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
  602. ; the repeated loads of constants for pmaddubsw.
  603. palignr m3, m1, m0, 14
  604. palignr m4, m1, m0, 15
  605. palignr m0, m2, m1, 2
  606. pmaddubsw m3, [filt_mul15]
  607. pmaddubsw m4, [filt_mul15]
  608. pmaddubsw m0, [filt_mul51]
  609. palignr m5, m2, m1, 1
  610. palignr m6, m2, m1, 3
  611. paddw m3, m0
  612. mova m0, m1
  613. pmaddubsw m1, [filt_mul20]
  614. pmaddubsw m5, [filt_mul20]
  615. pmaddubsw m6, [filt_mul51]
  616. paddw m3, m1
  617. paddw m4, m5
  618. paddw m4, m6
  619. FILT_PACK m3, m4, m7, 5
  620. pshufb m3, [hpel_shuf]
  621. mova m1, m2
  622. movntps [r0+r2], m3
  623. add r2, 16
  624. jl .loop
  625. RET
  626. %endmacro
  627. INIT_MMX mmx2
  628. HPEL_V 0
  629. INIT_XMM sse2
  630. HPEL_V 8
  631. %if ARCH_X86_64 == 0
  632. INIT_XMM sse2
  633. HPEL_C
  634. INIT_XMM ssse3
  635. HPEL_C
  636. HPEL_V 0
  637. HPEL_H
  638. INIT_XMM avx
  639. HPEL_C
  640. HPEL_V 0
  641. HPEL_H
  642. INIT_YMM avx2
  643. HPEL_V 8
  644. HPEL_C
  645. INIT_YMM avx2
  646. cglobal hpel_filter_h, 3,3,8
  647. add r0, r2
  648. add r1, r2
  649. neg r2
  650. %define src r1+r2
  651. mova m5, [filt_mul15]
  652. mova m6, [filt_mul20]
  653. mova m7, [filt_mul51]
  654. .loop:
  655. movu m0, [src-2]
  656. movu m1, [src-1]
  657. movu m2, [src+2]
  658. pmaddubsw m0, m5
  659. pmaddubsw m1, m5
  660. pmaddubsw m2, m7
  661. paddw m0, m2
  662. mova m2, [src+0]
  663. movu m3, [src+1]
  664. movu m4, [src+3]
  665. pmaddubsw m2, m6
  666. pmaddubsw m3, m6
  667. pmaddubsw m4, m7
  668. paddw m0, m2
  669. paddw m1, m3
  670. paddw m1, m4
  671. mova m2, [pw_1024]
  672. FILT_PACK m0, m1, m2, 5
  673. pshufb m0, [hpel_shuf]
  674. movnta [r0+r2], m0
  675. add r2, mmsize
  676. jl .loop
  677. RET
  678. %endif
  679. %if ARCH_X86_64
  680. %macro DO_FILT_V 5
  681. ;The optimum prefetch distance is difficult to determine in checkasm:
  682. ;any prefetch seems slower than not prefetching.
  683. ;In real use, the prefetch seems to be a slight win.
  684. ;+mmsize is picked somewhat arbitrarily here based on the fact that even one
  685. ;loop iteration is going to take longer than the prefetch.
  686. prefetcht0 [r1+r2*2+mmsize]
  687. %if cpuflag(ssse3)
  688. mova m1, [r3]
  689. mova m2, [r3+r2]
  690. mova %3, [r3+r2*2]
  691. mova m3, [r1]
  692. mova %1, [r1+r2]
  693. mova %2, [r1+r2*2]
  694. punpckhbw m4, m1, m2
  695. punpcklbw m1, m2
  696. punpckhbw m2, %1, %2
  697. punpcklbw %1, %2
  698. punpckhbw %2, m3, %3
  699. punpcklbw m3, %3
  700. pmaddubsw m1, m12
  701. pmaddubsw m4, m12
  702. pmaddubsw %1, m0
  703. pmaddubsw m2, m0
  704. pmaddubsw m3, m14
  705. pmaddubsw %2, m14
  706. paddw m1, %1
  707. paddw m4, m2
  708. paddw m1, m3
  709. paddw m4, %2
  710. %else
  711. LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
  712. LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
  713. LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
  714. packuswb %3, %4
  715. FILT_V2 m1, m2, m3, m4, m5, m6
  716. %endif
  717. add r3, mmsize
  718. add r1, mmsize
  719. %if mmsize==32
  720. vinserti128 %1, m1, xm4, 1
  721. vperm2i128 %2, m1, m4, q0301
  722. %else
  723. mova %1, m1
  724. mova %2, m4
  725. %endif
  726. FILT_PACK m1, m4, m15, 5
  727. movntps [r8+r4+%5], m1
  728. %endmacro
  729. %macro FILT_C 3
  730. %if mmsize==32
  731. vperm2i128 m3, %2, %1, q0003
  732. %endif
  733. PALIGNR m1, %2, %1, (mmsize-4), m3
  734. PALIGNR m2, %2, %1, (mmsize-2), m3
  735. %if mmsize==32
  736. vperm2i128 %1, %3, %2, q0003
  737. %endif
  738. PALIGNR m3, %3, %2, 4, %1
  739. PALIGNR m4, %3, %2, 2, %1
  740. paddw m3, m2
  741. %if mmsize==32
  742. mova m2, %1
  743. %endif
  744. mova %1, %3
  745. PALIGNR %3, %3, %2, 6, m2
  746. paddw m4, %2
  747. paddw %3, m1
  748. FILT_H %3, m3, m4
  749. %endmacro
  750. %macro DO_FILT_C 4
  751. FILT_C %1, %2, %3
  752. FILT_C %2, %1, %4
  753. FILT_PACK %3, %4, m15, 6
  754. %if mmsize==32
  755. vpermq %3, %3, q3120
  756. %endif
  757. movntps [r5+r4], %3
  758. %endmacro
  759. %macro ADD8TO16 5
  760. punpckhbw %3, %1, %5
  761. punpcklbw %1, %5
  762. punpcklbw %4, %2, %5
  763. punpckhbw %2, %5
  764. paddw %2, %3
  765. paddw %1, %4
  766. %endmacro
  767. %macro DO_FILT_H 3
  768. %if mmsize==32
  769. vperm2i128 m3, %2, %1, q0003
  770. %endif
  771. PALIGNR m1, %2, %1, (mmsize-2), m3
  772. PALIGNR m2, %2, %1, (mmsize-1), m3
  773. %if mmsize==32
  774. vperm2i128 m3, %3, %2, q0003
  775. %endif
  776. PALIGNR m4, %3, %2, 1 , m3
  777. PALIGNR m5, %3, %2, 2 , m3
  778. PALIGNR m6, %3, %2, 3 , m3
  779. mova %1, %2
  780. %if cpuflag(ssse3)
  781. pmaddubsw m1, m12
  782. pmaddubsw m2, m12
  783. pmaddubsw %2, m14
  784. pmaddubsw m4, m14
  785. pmaddubsw m5, m0
  786. pmaddubsw m6, m0
  787. paddw m1, %2
  788. paddw m2, m4
  789. paddw m1, m5
  790. paddw m2, m6
  791. FILT_PACK m1, m2, m15, 5
  792. pshufb m1, [hpel_shuf]
  793. %else ; ssse3, avx
  794. ADD8TO16 m1, m6, m12, m3, m0 ; a
  795. ADD8TO16 m2, m5, m12, m3, m0 ; b
  796. ADD8TO16 %2, m4, m12, m3, m0 ; c
  797. FILT_V2 m1, m2, %2, m6, m5, m4
  798. FILT_PACK m1, m6, m15, 5
  799. %endif
  800. movntps [r0+r4], m1
  801. mova %2, %3
  802. %endmacro
  803. %macro HPEL 0
  804. ;-----------------------------------------------------------------------------
  805. ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
  806. ; uint8_t *src, intptr_t stride, int width, int height )
  807. ;-----------------------------------------------------------------------------
  808. cglobal hpel_filter, 7,9,16
  809. mov r7, r3
  810. sub r5d, mmsize
  811. mov r8, r1
  812. and r7, mmsize-1
  813. sub r3, r7
  814. add r0, r5
  815. add r8, r5
  816. add r7, r5
  817. add r5, r2
  818. mov r2, r4
  819. neg r7
  820. lea r1, [r3+r2]
  821. sub r3, r2
  822. sub r3, r2
  823. mov r4, r7
  824. %if cpuflag(ssse3)
  825. mova m0, [filt_mul51]
  826. mova m12, [filt_mul15]
  827. mova m14, [filt_mul20]
  828. mova m15, [pw_1024]
  829. %else
  830. pxor m0, m0
  831. mova m15, [pw_16]
  832. %endif
  833. ;ALIGN 16
  834. .loopy:
  835. ; first filter_v
  836. DO_FILT_V m8, m7, m13, m12, 0
  837. ;ALIGN 16
  838. .loopx:
  839. DO_FILT_V m6, m5, m11, m12, mmsize
  840. .lastx:
  841. %if cpuflag(ssse3)
  842. psrlw m15, 1 ; pw_512
  843. %else
  844. paddw m15, m15 ; pw_32
  845. %endif
  846. DO_FILT_C m9, m8, m7, m6
  847. %if cpuflag(ssse3)
  848. paddw m15, m15 ; pw_1024
  849. %else
  850. psrlw m15, 1 ; pw_16
  851. %endif
  852. mova m7, m5
  853. DO_FILT_H m10, m13, m11
  854. add r4, mmsize
  855. jl .loopx
  856. cmp r4, mmsize
  857. jl .lastx
  858. ; setup regs for next y
  859. sub r4, r7
  860. sub r4, r2
  861. sub r1, r4
  862. sub r3, r4
  863. add r0, r2
  864. add r8, r2
  865. add r5, r2
  866. mov r4, r7
  867. sub r6d, 1
  868. jg .loopy
  869. sfence
  870. RET
  871. %endmacro
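; Note: this fused hpel_filter runs the v, c and h passes one mmsize-wide
; column at a time so the vertical intermediates stay in registers. m15 holds
; the rounding constant and is shifted back and forth (pw_1024 <-> pw_512 with
; SSSE3, pw_16 <-> pw_32 otherwise) because the c pass uses one more shift
; than the v/h passes.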
  872. INIT_XMM sse2
  873. HPEL
  874. INIT_XMM ssse3
  875. HPEL
  876. INIT_XMM avx
  877. HPEL
  878. INIT_YMM avx2
  879. HPEL
  880. %endif ; ARCH_X86_64
  881. %undef movntq
  882. %undef movntps
  883. %undef sfence
  884. %endif ; !HIGH_BIT_DEPTH
  885. %macro PREFETCHNT_ITER 2 ; src, bytes/iteration
  886. %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal?
  887. %rep (%2+63) / 64 ; assume 64 byte cache lines
  888. prefetchnta [%1+%%i]
  889. %assign %%i %%i + 64
  890. %endrep
  891. %endmacro
  892. ;-----------------------------------------------------------------------------
  893. ; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst,
  894. ; pixel *src, intptr_t i_src, int w, int h )
  895. ;-----------------------------------------------------------------------------
  896. ; assumes i_dst and w are multiples of mmsize, and i_dst>w
  897. %macro PLANE_COPY_CORE 1 ; swap
  898. %if %1
  899. cglobal plane_copy_swap_core, 6,7
  900. %if mmsize == 32
  901. vbroadcasti128 m4, [copy_swap_shuf]
  902. %else
  903. mova m4, [copy_swap_shuf]
  904. %endif
  905. %else
  906. cglobal plane_copy_core, 6,7
  907. %endif
  908. FIX_STRIDES r1, r3
  909. %if %1 && HIGH_BIT_DEPTH
  910. shl r4d, 2
  911. %elif %1 || HIGH_BIT_DEPTH
  912. add r4d, r4d
  913. %else
  914. movsxdifnidn r4, r4d
  915. %endif
  916. add r0, r4
  917. add r2, r4
  918. neg r4
  919. .loopy:
  920. lea r6, [r4+4*mmsize]
  921. %if %1
  922. test r6d, r6d
  923. jg .skip
  924. %endif
  925. .loopx:
  926. PREFETCHNT_ITER r2+r6, 4*mmsize
  927. movu m0, [r2+r6-4*mmsize]
  928. movu m1, [r2+r6-3*mmsize]
  929. movu m2, [r2+r6-2*mmsize]
  930. movu m3, [r2+r6-1*mmsize]
  931. %if %1
  932. pshufb m0, m4
  933. pshufb m1, m4
  934. pshufb m2, m4
  935. pshufb m3, m4
  936. %endif
  937. movnta [r0+r6-4*mmsize], m0
  938. movnta [r0+r6-3*mmsize], m1
  939. movnta [r0+r6-2*mmsize], m2
  940. movnta [r0+r6-1*mmsize], m3
  941. add r6, 4*mmsize
  942. jle .loopx
  943. .skip:
  944. PREFETCHNT_ITER r2+r6, 4*mmsize
  945. sub r6, 4*mmsize
  946. jz .end
  947. .loop_end:
  948. movu m0, [r2+r6]
  949. %if %1
  950. pshufb m0, m4
  951. %endif
  952. movnta [r0+r6], m0
  953. add r6, mmsize
  954. jl .loop_end
  955. .end:
  956. add r0, r1
  957. add r2, r3
  958. dec r5d
  959. jg .loopy
  960. sfence
  961. RET
  962. %endmacro
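; Note: each row is copied with four streaming stores per .loopx iteration and
; the remainder (w is a multiple of mmsize) is finished one vector at a time
; in .loop_end. The swap variant additionally pshufb's each vector with
; copy_swap_shuf, swapping every pair of adjacent samples, e.g. to flip the
; U/V order of an interleaved chroma plane.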
  963. INIT_XMM sse
  964. PLANE_COPY_CORE 0
  965. INIT_XMM ssse3
  966. PLANE_COPY_CORE 1
  967. INIT_YMM avx
  968. PLANE_COPY_CORE 0
  969. INIT_YMM avx2
  970. PLANE_COPY_CORE 1
  971. %macro PLANE_COPY_AVX512 1 ; swap
  972. %if %1
  973. cglobal plane_copy_swap, 6,7
  974. vbroadcasti32x4 m4, [copy_swap_shuf]
  975. %else
  976. cglobal plane_copy, 6,7
  977. %endif
  978. movsxdifnidn r4, r4d
  979. %if %1 && HIGH_BIT_DEPTH
  980. %define %%mload vmovdqu32
  981. lea r2, [r2+4*r4-64]
  982. lea r0, [r0+4*r4-64]
  983. neg r4
  984. mov r6d, r4d
  985. shl r4, 2
  986. or r6d, 0xffff0010
  987. shrx r6d, r6d, r6d ; (1 << (w & 15)) - 1
  988. kmovw k1, r6d
  989. %elif %1 || HIGH_BIT_DEPTH
  990. %define %%mload vmovdqu16
  991. lea r2, [r2+2*r4-64]
  992. lea r0, [r0+2*r4-64]
  993. mov r6d, -1
  994. neg r4
  995. shrx r6d, r6d, r4d
  996. add r4, r4
  997. kmovd k1, r6d
  998. %else
  999. %define %%mload vmovdqu8
  1000. lea r2, [r2+1*r4-64]
  1001. lea r0, [r0+1*r4-64]
  1002. mov r6, -1
  1003. neg r4
  1004. shrx r6, r6, r4
  1005. %if ARCH_X86_64
  1006. kmovq k1, r6
  1007. %else
  1008. kmovd k1, r6d
  1009. test r4d, 32
  1010. jnz .l32
  1011. kxnord k2, k2, k2
  1012. kunpckdq k1, k1, k2
  1013. .l32:
  1014. %endif
  1015. %endif
  1016. FIX_STRIDES r3, r1
  1017. add r4, 4*64
  1018. jge .small
  1019. mov r6, r4
  1020. .loop: ; >256 bytes/row
  1021. PREFETCHNT_ITER r2+r4+64, 4*64
  1022. movu m0, [r2+r4-3*64]
  1023. movu m1, [r2+r4-2*64]
  1024. movu m2, [r2+r4-1*64]
  1025. movu m3, [r2+r4-0*64]
  1026. %if %1
  1027. pshufb m0, m4
  1028. pshufb m1, m4
  1029. pshufb m2, m4
  1030. pshufb m3, m4
  1031. %endif
  1032. movnta [r0+r4-3*64], m0
  1033. movnta [r0+r4-2*64], m1
  1034. movnta [r0+r4-1*64], m2
  1035. movnta [r0+r4-0*64], m3
  1036. add r4, 4*64
  1037. jl .loop
  1038. PREFETCHNT_ITER r2+r4+64, 4*64
  1039. sub r4, 3*64
  1040. jge .tail
  1041. .loop2:
  1042. movu m0, [r2+r4]
  1043. %if %1
  1044. pshufb m0, m4
  1045. %endif
  1046. movnta [r0+r4], m0
  1047. add r4, 64
  1048. jl .loop2
  1049. .tail:
  1050. %%mload m0 {k1}{z}, [r2+r4]
  1051. %if %1
  1052. pshufb m0, m4
  1053. %endif
  1054. movnta [r0+r4], m0
  1055. add r2, r3
  1056. add r0, r1
  1057. mov r4, r6
  1058. dec r5d
  1059. jg .loop
  1060. sfence
  1061. RET
  1062. .small: ; 65-256 bytes/row. skip non-temporal stores
  1063. sub r4, 3*64
  1064. jge .tiny
  1065. mov r6, r4
  1066. .small_loop:
  1067. PREFETCHNT_ITER r2+r4+64, 64
  1068. movu m0, [r2+r4]
  1069. %if %1
  1070. pshufb m0, m4
  1071. %endif
  1072. mova [r0+r4], m0
  1073. add r4, 64
  1074. jl .small_loop
  1075. PREFETCHNT_ITER r2+r4+64, 64
  1076. %%mload m0 {k1}{z}, [r2+r4]
  1077. %if %1
  1078. pshufb m0, m4
  1079. %endif
  1080. mova [r0+r4], m0
  1081. add r2, r3
  1082. add r0, r1
  1083. mov r4, r6
  1084. dec r5d
  1085. jg .small_loop
  1086. RET
  1087. .tiny: ; 1-64 bytes/row. skip non-temporal stores
  1088. PREFETCHNT_ITER r2+r4+64, 64
  1089. %%mload m0 {k1}{z}, [r2+r4]
  1090. %if %1
  1091. pshufb m0, m4
  1092. %endif
  1093. mova [r0+r4], m0
  1094. add r2, r3
  1095. add r0, r1
  1096. dec r5d
  1097. jg .tiny
  1098. RET
  1099. %endmacro
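; Note: the AVX-512 version copies rows in 64-byte chunks and loads the final
; partial chunk with a zeroing opmask (k1, built from w modulo the chunk size
; in bytes/words/dwords depending on pixel size), so the source row is never
; read past w. Rows of 256 bytes or less take the .small/.tiny paths, which
; use regular cached stores instead of non-temporal ones.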
  1100. INIT_ZMM avx512
  1101. PLANE_COPY_AVX512 0
  1102. PLANE_COPY_AVX512 1
  1103. %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
  1104. %if HIGH_BIT_DEPTH
  1105. %assign x 0
  1106. %rep 16/mmsize
  1107. mov%4 m0, [%2+(x/2)*mmsize]
  1108. mov%4 m1, [%3+(x/2)*mmsize]
  1109. punpckhwd m2, m0, m1
  1110. punpcklwd m0, m1
  1111. mov%5a [%1+(x+0)*mmsize], m0
  1112. mov%5a [%1+(x+1)*mmsize], m2
  1113. %assign x (x+2)
  1114. %endrep
  1115. %else
  1116. movq m0, [%2]
  1117. %if mmsize==16
  1118. %ifidn %4, a
  1119. punpcklbw m0, [%3]
  1120. %else
  1121. movq m1, [%3]
  1122. punpcklbw m0, m1
  1123. %endif
  1124. mov%5a [%1], m0
  1125. %else
  1126. movq m1, [%3]
  1127. punpckhbw m2, m0, m1
  1128. punpcklbw m0, m1
  1129. mov%5a [%1+0], m0
  1130. mov%5a [%1+8], m2
  1131. %endif
  1132. %endif ; HIGH_BIT_DEPTH
  1133. %endmacro
  1134. %macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned
  1135. mov%6 m0, [%3]
  1136. %if mmsize == 32
  1137. pshufb m0, %5
  1138. vpermq m0, m0, q3120
  1139. %if %4
  1140. mova [%1], m0
  1141. %else
  1142. mov%6 [%1], xm0
  1143. vextracti128 [%2], m0, 1
  1144. %endif
  1145. %elif HIGH_BIT_DEPTH
  1146. mov%6 m1, [%3+mmsize]
  1147. psrld m2, m0, 16
  1148. psrld m3, m1, 16
  1149. pand m0, %5
  1150. pand m1, %5
  1151. packssdw m0, m1
  1152. packssdw m2, m3
  1153. mov%6 [%1], m0
  1154. mov%6 [%2], m2
  1155. %else ; !HIGH_BIT_DEPTH
  1156. %if cpuflag(ssse3)
  1157. pshufb m0, %5
  1158. %else
  1159. mova m1, m0
  1160. pand m0, %5
  1161. psrlw m1, 8
  1162. packuswb m0, m1
  1163. %endif
  1164. %if %4
  1165. mova [%1], m0
  1166. %else
  1167. movq [%1], m0
  1168. movhps [%2], m0
  1169. %endif
  1170. %endif ; HIGH_BIT_DEPTH
  1171. %endmacro
  1172. %macro PLANE_INTERLEAVE 0
  1173. ;-----------------------------------------------------------------------------
  1174. ; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
  1175. ; uint8_t *srcu, intptr_t i_srcu,
  1176. ; uint8_t *srcv, intptr_t i_srcv, int w, int h )
  1177. ;-----------------------------------------------------------------------------
  1178. ; assumes i_dst and w are multiples of 16, and i_dst>2*w
  1179. cglobal plane_copy_interleave_core, 6,9
  1180. mov r6d, r6m
  1181. %if HIGH_BIT_DEPTH
  1182. FIX_STRIDES r1, r3, r5, r6d
  1183. movifnidn r1mp, r1
  1184. movifnidn r3mp, r3
  1185. mov r6m, r6d
  1186. %endif
  1187. lea r0, [r0+r6*2]
  1188. add r2, r6
  1189. add r4, r6
  1190. %if ARCH_X86_64
  1191. DECLARE_REG_TMP 7,8
  1192. %else
  1193. DECLARE_REG_TMP 1,3
  1194. %endif
  1195. mov t1, r1
  1196. shr t1, SIZEOF_PIXEL
  1197. sub t1, r6
  1198. mov t0d, r7m
  1199. .loopy:
  1200. mov r6d, r6m
  1201. neg r6
  1202. .prefetch:
  1203. prefetchnta [r2+r6]
  1204. prefetchnta [r4+r6]
  1205. add r6, 64
  1206. jl .prefetch
  1207. mov r6d, r6m
  1208. neg r6
  1209. .loopx:
  1210. INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
  1211. INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
  1212. add r6, 16*SIZEOF_PIXEL
  1213. jl .loopx
  1214. .pad:
  1215. %assign n 0
  1216. %rep SIZEOF_PIXEL
  1217. %if mmsize==8
  1218. movntq [r0+r6*2+(n+ 0)], m0
  1219. movntq [r0+r6*2+(n+ 8)], m0
  1220. movntq [r0+r6*2+(n+16)], m0
  1221. movntq [r0+r6*2+(n+24)], m0
  1222. %else
  1223. movntdq [r0+r6*2+(n+ 0)], m0
  1224. movntdq [r0+r6*2+(n+16)], m0
  1225. %endif
  1226. %assign n n+32
  1227. %endrep
  1228. add r6, 16*SIZEOF_PIXEL
  1229. cmp r6, t1
  1230. jl .pad
  1231. add r0, r1mp
  1232. add r2, r3mp
  1233. add r4, r5
  1234. dec t0d
  1235. jg .loopy
  1236. sfence
  1237. emms
  1238. RET
  1239. ;-----------------------------------------------------------------------------
  1240. ; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
  1241. ;-----------------------------------------------------------------------------
  1242. cglobal store_interleave_chroma, 5,5
  1243. FIX_STRIDES r1
  1244. .loop:
  1245. INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
  1246. INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
  1247. add r2, FDEC_STRIDEB*2
  1248. add r3, FDEC_STRIDEB*2
  1249. lea r0, [r0+r1*2]
  1250. sub r4d, 2
  1251. jg .loop
  1252. RET
  1253. %endmacro ; PLANE_INTERLEAVE
  1254. %macro DEINTERLEAVE_START 0
  1255. %if mmsize == 32
  1256. vbroadcasti128 m4, [deinterleave_shuf]
  1257. %elif HIGH_BIT_DEPTH
  1258. mova m4, [pd_ffff]
  1259. %elif cpuflag(ssse3)
  1260. mova m4, [deinterleave_shuf]
  1261. %else
  1262. mova m4, [pw_00ff]
  1263. %endif ; HIGH_BIT_DEPTH
  1264. %endmacro
  1265. %macro PLANE_DEINTERLEAVE 0
  1266. ;-----------------------------------------------------------------------------
  1267. ; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta,
  1268. ; pixel *dstb, intptr_t i_dstb,
  1269. ; pixel *src, intptr_t i_src, int w, int h )
  1270. ;-----------------------------------------------------------------------------
  1271. %if ARCH_X86_64
  1272. cglobal plane_copy_deinterleave, 6,9
  1273. %define %%w r7
  1274. %define %%h r8d
  1275. mov r8d, r7m
  1276. %else
  1277. cglobal plane_copy_deinterleave, 6,7
  1278. %define %%w r6m
  1279. %define %%h dword r7m
  1280. %endif
  1281. %if HIGH_BIT_DEPTH
  1282. %assign %%n 16
  1283. %else
  1284. %assign %%n mmsize/2
  1285. %endif
  1286. DEINTERLEAVE_START
  1287. mov r6d, r6m
  1288. FIX_STRIDES r1, r3, r5, r6d
  1289. add r0, r6
  1290. add r2, r6
  1291. lea r4, [r4+r6*2]
  1292. neg r6
  1293. mov %%w, r6
  1294. .loop:
  1295. DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, m4, u
  1296. DEINTERLEAVE r0+r6+%%n, r2+r6+%%n, r4+r6*2+%%n*2, 0, m4, u
  1297. add r6, %%n*2
  1298. jl .loop
  1299. add r0, r1
  1300. add r2, r3
  1301. add r4, r5
  1302. mov r6, %%w
  1303. dec %%h
  1304. jg .loop
  1305. RET
  1306. %endmacro ; PLANE_DEINTERLEAVE
  1307. %macro LOAD_DEINTERLEAVE_CHROMA 0
  1308. ;-----------------------------------------------------------------------------
  1309. ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
  1310. ;-----------------------------------------------------------------------------
  1311. cglobal load_deinterleave_chroma_fenc, 4,4
  1312. DEINTERLEAVE_START
  1313. FIX_STRIDES r2
  1314. .loop:
  1315. DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
  1316. DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
  1317. add r0, FENC_STRIDEB*2
  1318. lea r1, [r1+r2*2]
  1319. sub r3d, 2
  1320. jg .loop
  1321. RET
  1322. ;-----------------------------------------------------------------------------
  1323. ; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
  1324. ;-----------------------------------------------------------------------------
  1325. cglobal load_deinterleave_chroma_fdec, 4,4
  1326. DEINTERLEAVE_START
  1327. FIX_STRIDES r2
  1328. .loop:
  1329. DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
  1330. DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
  1331. add r0, FDEC_STRIDEB*2
  1332. lea r1, [r1+r2*2]
  1333. sub r3d, 2
  1334. jg .loop
  1335. RET
  1336. %endmacro ; LOAD_DEINTERLEAVE_CHROMA
  1337. %macro LOAD_DEINTERLEAVE_CHROMA_FDEC_AVX512 0
  1338. cglobal load_deinterleave_chroma_fdec, 4,5
  1339. vbroadcasti32x8 m0, [deinterleave_shuf32a]
  1340. mov r4d, 0x3333ff00
  1341. kmovd k1, r4d
  1342. lea r4, [r2*3]
  1343. kshiftrd k2, k1, 16
  1344. .loop:
  1345. vbroadcasti128 ym1, [r1]
  1346. vbroadcasti32x4 m1 {k1}, [r1+r2]
  1347. vbroadcasti128 ym2, [r1+r2*2]
  1348. vbroadcasti32x4 m2 {k1}, [r1+r4]
  1349. lea r1, [r1+r2*4]
  1350. pshufb m1, m0
  1351. pshufb m2, m0
  1352. vmovdqa32 [r0] {k2}, m1
  1353. vmovdqa32 [r0+mmsize] {k2}, m2
  1354. add r0, 2*mmsize
  1355. sub r3d, 4
  1356. jg .loop
  1357. RET
  1358. %endmacro
  1359. %macro LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 0
  1360. cglobal load_deinterleave_chroma_fenc, 4,5
  1361. vbroadcasti128 m0, [deinterleave_shuf]
  1362. lea r4, [r2*3]
  1363. .loop:
  1364. mova xm1, [r1] ; 0
  1365. vinserti128 ym1, [r1+r2], 1 ; 1
  1366. %if mmsize == 64
  1367. mova xm2, [r1+r2*4] ; 4
  1368. vinserti32x4 m1, [r1+r2*2], 2 ; 2
  1369. vinserti32x4 m2, [r1+r4*2], 2 ; 6
  1370. vinserti32x4 m1, [r1+r4], 3 ; 3
  1371. lea r1, [r1+r2*4]
  1372. vinserti32x4 m2, [r1+r2], 1 ; 5
  1373. vinserti32x4 m2, [r1+r4], 3 ; 7
  1374. %else
  1375. mova xm2, [r1+r2*2] ; 2
  1376. vinserti128 m2, [r1+r4], 1 ; 3
  1377. %endif
  1378. lea r1, [r1+r2*4]
  1379. pshufb m1, m0
  1380. pshufb m2, m0
  1381. mova [r0], m1
  1382. mova [r0+mmsize], m2
  1383. add r0, 2*mmsize
  1384. sub r3d, mmsize/8
  1385. jg .loop
  1386. RET
  1387. %endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
  1388. %macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
  1389. %if mmsize == 32
  1390. vbroadcasti128 m3, [deinterleave_rgb_shuf+(%1-3)*16]
  1391. %elif cpuflag(ssse3)
  1392. mova m3, [deinterleave_rgb_shuf+(%1-3)*16]
  1393. %endif
  1394. %%loopy:
  1395. mov %8, r6
  1396. mov %9, %6
  1397. %%loopx:
  1398. %if mmsize == 32 && %1 == 3
  1399. movu xm0, [%8+0*12]
  1400. vinserti128 m0, m0, [%8+1*12], 1
  1401. movu xm1, [%8+2*12]
  1402. vinserti128 m1, m1, [%8+3*12], 1
  1403. %else
  1404. movu m0, [%8]
  1405. movu m1, [%8+%1*mmsize/4]
  1406. %endif
  1407. %if cpuflag(ssse3)
  1408. pshufb m0, m3 ; a0 a1 a2 a3 a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3
  1409. pshufb m1, m3 ; a4 a5 a6 a7 a4 a5 a6 a7 b4 b5 b6 b7 c4 c5 c6 c7
  1410. %if mmsize == 32
  1411. vpblendd m2, m0, m1, 0x22
  1412. punpckhdq m0, m1
  1413. vpermd m2, m4, m2
  1414. vpermd m0, m4, m0
  1415. mova [r0+%9], xm2
  1416. mova [r2+%9], xm0
  1417. vextracti128 [r4+%9], m0, 1
  1418. %else
  1419. SBUTTERFLY dq, 0, 1, 2
  1420. movq [r0+%9], m0
  1421. movq [r2+%9], m1
  1422. movhps [r4+%9], m1
  1423. %endif
  1424. %elif %1 == 3
  1425. SBUTTERFLY bw, 0, 1, 2
  1426. pshufd m2, m0, q0321 ; c0 c4 a1 a5 b1 b5 c1 c5 __ __ __ __ a0 a4 b0 b4
  1427. punpcklbw m3, m2, m1 ; c0 c2 c4 c6 a1 a3 a5 a7 b1 b3 b5 b7 c1 c3 c5 c7
  1428. punpckhbw m2, m0 ; __ __ __ __ __ __ __ __ a0 a2 a4 a6 b0 b2 b4 b6
  1429. pshufd m0, m3, q2103 ; c1 c3 c5 c7 __ __ __ __ a1 a3 a5 a7 b1 b3 b5 b7
  1430. punpckhbw m2, m0 ; a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
  1431. punpcklbw m3, m0 ; c0 c1 c2 c3 c4 c5 c6 c7
  1432. movq [r0+%9], m2
  1433. movhps [r2+%9], m2
  1434. movq [r4+%9], m3
  1435. %else ; %1 == 4
  1436. SBUTTERFLY bw, 0, 1, 2
  1437. SBUTTERFLY bw, 0, 1, 2
  1438. SBUTTERFLY bw, 0, 1, 2
  1439. movq [r0+%9], m0
  1440. movhps [r2+%9], m0
  1441. movq [r4+%9], m1
  1442. %endif
  1443. add %8, %1*mmsize/2
  1444. add %9, mmsize/2
  1445. jl %%loopx
  1446. add r0, %2
  1447. add r2, %3
  1448. add r4, %4
  1449. add r6, %5
  1450. dec %7d
  1451. jg %%loopy
  1452. %endmacro
  1453. %macro PLANE_DEINTERLEAVE_RGB 0
  1454. ;-----------------------------------------------------------------------------
  1455. ; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
  1456. ; pixel *dstb, intptr_t i_dstb,
  1457. ; pixel *dstc, intptr_t i_dstc,
  1458. ; pixel *src, intptr_t i_src, int pw, int w, int h )
  1459. ;-----------------------------------------------------------------------------
  1460. %if ARCH_X86_64
  1461. cglobal plane_copy_deinterleave_rgb, 8,12
  1462. %define %%args r1, r3, r5, r7, r8, r9, r10, r11
  1463. mov r8d, r9m
  1464. mov r9d, r10m
  1465. add r0, r8
  1466. add r2, r8
  1467. add r4, r8
  1468. neg r8
  1469. %else
  1470. cglobal plane_copy_deinterleave_rgb, 1,7
  1471. %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
  1472. mov r1, r9m
  1473. mov r2, r2m
  1474. mov r4, r4m
  1475. mov r6, r6m
  1476. add r0, r1
  1477. add r2, r1
  1478. add r4, r1
  1479. neg r1
  1480. mov r9m, r1
  1481. mov r1, r10m
  1482. %endif
  1483. %if mmsize == 32
  1484. mova m4, [deinterleave_shufd]
  1485. %endif
  1486. cmp dword r8m, 4
  1487. je .pw4
  1488. PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
  1489. jmp .ret
  1490. .pw4:
  1491. PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA
  1492. .ret:
  1493. REP_RET
  1494. %endmacro
  1495. %macro PLANE_DEINTERLEAVE_V210 0
  1496. ;-----------------------------------------------------------------------------
  1497. ; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
  1498. ; uint16_t *dstc, intptr_t i_dstc,
  1499. ; uint32_t *src, intptr_t i_src, int w, int h )
  1500. ;-----------------------------------------------------------------------------
  1501. %if ARCH_X86_64
  1502. cglobal plane_copy_deinterleave_v210, 8,10,7
  1503. %define src r8
  1504. %define org_w r9
  1505. %define h r7d
  1506. %else
  1507. cglobal plane_copy_deinterleave_v210, 7,7,7
  1508. %define src r4m
  1509. %define org_w r6m
  1510. %define h dword r7m
  1511. %endif
  1512. FIX_STRIDES r1, r3, r6d
  1513. shl r5, 2
  1514. add r0, r6
  1515. add r2, r6
  1516. neg r6
  1517. mov src, r4
  1518. mov org_w, r6
  1519. %if cpuflag(avx512)
  1520. vpbroadcastd m2, [v210_mask]
  1521. vpbroadcastd m3, [v210_shuf_avx512]
  1522. psrlw m3, 6 ; dw 0, 4
  1523. mova m4, [v210_shuf_avx512] ; luma
  1524. psrlw m5, m4, 8 ; chroma
  1525. %else
  1526. %if mmsize == 32
  1527. vbroadcasti128 m2, [v210_mask]
  1528. vbroadcasti128 m3, [v210_luma_shuf]
  1529. vbroadcasti128 m4, [v210_chroma_shuf]
  1530. %else
  1531. mova m2, [v210_mask]
  1532. mova m3, [v210_luma_shuf]
  1533. mova m4, [v210_chroma_shuf]
  1534. %endif
  1535. mova m5, [v210_mult] ; also functions as vpermd index for avx2
  1536. pshufd m6, m5, q1102
  1537. %endif
  1538. ALIGN 16
  1539. .loop:
  1540. movu m1, [r4]
  1541. pandn m0, m2, m1
  1542. pand m1, m2
  1543. %if cpuflag(avx512)
  1544. psrld m0, 10
  1545. vpsrlvw m1, m3
  1546. mova m6, m0
  1547. vpermt2w m0, m4, m1
  1548. vpermt2w m1, m5, m6
  1549. %else
  1550. pshufb m0, m3
  1551. pshufb m1, m4
  1552. pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
  1553. pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
  1554. %if mmsize == 32
  1555. vpermd m0, m5, m0
  1556. vpermd m1, m5, m1
  1557. %endif
  1558. %endif
  1559. movu [r0+r6], m0
  1560. movu [r2+r6], m1
  1561. add r4, mmsize
  1562. add r6, mmsize*3/4
  1563. jl .loop
  1564. add r0, r1
  1565. add r2, r3
  1566. add src, r5
  1567. mov r4, src
  1568. mov r6, org_w
  1569. dec h
  1570. jg .loop
  1571. RET
  1572. %endmacro ; PLANE_DEINTERLEAVE_V210
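; Note: v210 packs three 10-bit components into each 32-bit word, i.e. 6 luma
; plus 6 chroma samples per 16 source bytes. Each iteration therefore consumes
; mmsize bytes of packed input and produces mmsize*3/4 bytes each of 16-bit
; luma (y0..y5 per group) and interleaved chroma (u0 v0 u1 v1 u2 v2), matching
; the r6 increment; the mask/shuffle/multiply constants extract each 10-bit
; field into its own 16-bit word.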
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
LOAD_DEINTERLEAVE_CHROMA
INIT_YMM avx2
PLANE_DEINTERLEAVE
%if HIGH_BIT_DEPTH
INIT_XMM ssse3
PLANE_DEINTERLEAVE_V210
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
INIT_ZMM avx512
PLANE_DEINTERLEAVE_V210
%else
INIT_XMM sse2
PLANE_DEINTERLEAVE_RGB
INIT_XMM ssse3
PLANE_DEINTERLEAVE
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_RGB
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
INIT_ZMM avx512
LOAD_DEINTERLEAVE_CHROMA_FDEC_AVX512
LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
%endif

; These functions are not general-use; not only do they require aligned input, but memcpy
; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128.

;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
%macro MEMCPY 0
cglobal memcpy_aligned, 3,3
%if mmsize == 32
    test r2d, 16
    jz .copy32
    mova xm0, [r1+r2-16]
    mova [r0+r2-16], xm0
    sub r2d, 16
    jle .ret
.copy32:
%endif
    test r2d, mmsize
    jz .loop
    mova m0, [r1+r2-mmsize]
    mova [r0+r2-mmsize], m0
    sub r2d, mmsize
    jle .ret
.loop:
    mova m0, [r1+r2-1*mmsize]
    mova m1, [r1+r2-2*mmsize]
    mova [r0+r2-1*mmsize], m0
    mova [r0+r2-2*mmsize], m1
    sub r2d, 2*mmsize
    jg .loop
.ret:
    RET
%endmacro

;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 0
cglobal memzero_aligned, 2,2
    xorps m0, m0
.loop:
%assign %%i mmsize
%rep 128 / mmsize
    movaps [r0 + r1 - %%i], m0
%assign %%i %%i+mmsize
%endrep
    sub r1d, 128
    jg .loop
    RET
%endmacro
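
; Reference sketch (comment only): the C-visible contract of the two helpers above,
; assuming the restrictions stated in the comment (n a multiple of 16 for memcpy_aligned,
; a multiple of 128 for memzero_aligned, and vector-aligned pointers):
;
;   static void memcpy_aligned_c( void *dst, const void *src, size_t n )
;   {   /* the asm copies back to front, one or two vectors per iteration */
;       uint8_t *d = dst;
;       const uint8_t *s = src;
;       for( size_t i = 0; i < n; i++ )
;           d[i] = s[i];
;   }
;   static void memzero_aligned_c( void *dst, size_t n )
;   {   /* the asm unrolls this into 128 bytes of stores per iteration */
;       memset( dst, 0, n );
;   }
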
INIT_XMM sse
MEMCPY
MEMZERO
INIT_YMM avx
MEMCPY
MEMZERO
INIT_ZMM avx512
MEMZERO

cglobal memcpy_aligned, 3,4
    dec   r2d          ; offset of the last byte
    rorx  r3d, r2d, 2
    and   r2d, ~63
    and   r3d, 15      ; n = number of dwords minus one to copy in the tail
    mova  m0, [r1+r2]
    not   r3d          ; bits 0-4: (n^15)+16, bits 16-31: 0xffff
    shrx  r3d, r3d, r3d ; 0xffff >> (n^15)
    kmovw k1, r3d      ; (1 << (n+1)) - 1
    vmovdqa32 [r0+r2] {k1}, m0
    sub   r2d, 64
    jl .ret
.loop:
    mova m0, [r1+r2]
    mova [r0+r2], m0
    sub  r2d, 64
    jge .loop
.ret:
    RET
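
; Reference sketch (comment only): how the AVX-512 tail mask above is derived.
; With n = ((size-1)>>2) & 15 (dwords minus one in the final partial vector),
; the not/shrx/kmovw sequence yields the write mask (1 << (n+1)) - 1:
;
;   static uint16_t memcpy_aligned_tail_mask( uint32_t size )  /* size % 16 == 0 */
;   {
;       uint32_t last = size - 1;            /* offset of the last byte      */
;       uint32_t n    = (last >> 2) & 15;    /* trailing dwords minus one    */
;       return (uint16_t)(0xffffu >> (n ^ 15));  /* == (1 << (n+1)) - 1      */
;   }
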
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
%macro INTEGRAL_INIT4H 0
cglobal integral_init4h, 3,4
    lea  r3, [r0+r2*2]
    add  r1, r2
    neg  r2
    pxor m4, m4
.loop:
    mova xm0, [r1+r2]
    mova xm1, [r1+r2+16]
%if mmsize==32
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
%else
    palignr m1, m0, 8
%endif
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+mmsize]
    mova [r3+r2*2], m0
    mova [r3+r2*2+mmsize], m1
    add  r2, mmsize
    jl .loop
    RET
%endmacro
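
; Reference sketch (comment only): integral_init4h adds a sliding 4-pixel horizontal
; sum of the source row to the row above in the running integral, which is what the
; mpsadbw + paddw pair above computes per lane. Scalar C (roughly the C fallback;
; exact loop bound simplified):
;
;   static void integral_init4h_c( uint16_t *sum, uint8_t *pix, intptr_t stride )
;   {
;       int v = pix[0] + pix[1] + pix[2] + pix[3];
;       for( int x = 0; x < stride - 4; x++ )
;       {
;           sum[x] = v + sum[x-stride];  /* 4-wide horizontal sum + previous row */
;           v += pix[x+4] - pix[x];      /* slide the 4-pixel window             */
;       }
;   }
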
INIT_XMM sse4
INTEGRAL_INIT4H
INIT_YMM avx2
INTEGRAL_INIT4H

%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
    lea  r3, [r0+r2*2]
    add  r1, r2
    neg  r2
    pxor m4, m4
.loop:
    mova xm0, [r1+r2]
    mova xm1, [r1+r2+16]
%if mmsize==32
    vinserti128 m0, m0, [r1+r2+ 8], 1
    vinserti128 m1, m1, [r1+r2+24], 1
    mpsadbw m2, m0, m4, 100100b
    mpsadbw m3, m1, m4, 100100b
%else
    palignr m1, m0, 8
    mpsadbw m2, m0, m4, 100b
    mpsadbw m3, m1, m4, 100b
%endif
    mpsadbw m0, m4, 0
    mpsadbw m1, m4, 0
    paddw   m0, [r0+r2*2]
    paddw   m1, [r0+r2*2+mmsize]
    paddw   m0, m2
    paddw   m1, m3
    mova [r3+r2*2], m0
    mova [r3+r2*2+mmsize], m1
    add  r2, mmsize
    jl .loop
    RET
%endmacro

INIT_XMM sse4
INTEGRAL_INIT8H
INIT_XMM avx
INTEGRAL_INIT8H
INIT_YMM avx2
INTEGRAL_INIT8H
%endif ; !HIGH_BIT_DEPTH

%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
    add r1, r1
    add r0, r1
    lea r2, [r0+r1*8]
    neg r1
.loop:
    mova  m0, [r2+r1]
    mova  m1, [r2+r1+mmsize]
    psubw m0, [r0+r1]
    psubw m1, [r0+r1+mmsize]
    mova [r0+r1], m0
    mova [r0+r1+mmsize], m1
    add r1, 2*mmsize
    jl .loop
    RET
%endmacro

INIT_MMX mmx
INTEGRAL_INIT_8V
INIT_XMM sse2
INTEGRAL_INIT_8V
INIT_YMM avx2
INTEGRAL_INIT_8V

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal integral_init4v, 3,5
    shl r2, 1
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    mova m0, [r0+r2]
    mova m4, [r4+r2]
.loop:
    mova  m1, m4
    psubw m1, m0
    mova  m4, [r4+r2-8]
    mova  m0, [r0+r2-8]
    paddw m1, m4
    mova  m3, [r3+r2-8]
    psubw m1, m0
    psubw m3, m0
    mova [r0+r2-8], m1
    mova [r1+r2-8], m3
    sub r2, 8
    jge .loop
    RET

INIT_XMM sse2
cglobal integral_init4v, 3,5
    shl r2, 1
    add r0, r2
    add r1, r2
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    neg r2
.loop:
    mova   m0, [r0+r2]
    mova   m1, [r4+r2]
    mova   m2, m0
    mova   m4, m1
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1
    paddw  m0, m2
    paddw  m1, m4
    mova   m3, [r3+r2]
    psubw  m1, m0
    psubw  m3, m2
    mova [r0+r2], m1
    mova [r1+r2], m3
    add r2, 16
    jl .loop
    RET

INIT_XMM ssse3
cglobal integral_init4v, 3,5
    shl r2, 1
    add r0, r2
    add r1, r2
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    neg r2
.loop:
    mova    m2, [r0+r2]
    mova    m0, [r0+r2+16]
    mova    m4, [r4+r2]
    mova    m1, [r4+r2+16]
    palignr m0, m2, 8
    palignr m1, m4, 8
    paddw   m0, m2
    paddw   m1, m4
    mova    m3, [r3+r2]
    psubw   m1, m0
    psubw   m3, m2
    mova [r0+r2], m1
    mova [r1+r2], m3
    add r2, 16
    jl .loop
    RET

INIT_YMM avx2
cglobal integral_init4v, 3,5
    add r2, r2
    add r0, r2
    add r1, r2
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    neg r2
.loop:
    mova  m2, [r0+r2]
    movu  m1, [r4+r2+8]
    paddw m0, m2, [r0+r2+8]
    paddw m1, [r4+r2]
    mova  m3, [r3+r2]
    psubw m1, m0
    psubw m3, m2
    mova [r0+r2], m1
    mova [r1+r2], m3
    add r2, 32
    jl .loop
    RET
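
; Reference sketch (comment only): sum8 holds vertically accumulated 4-wide horizontal
; sums (from integral_init4h), so row differences yield block sums. integral_init8v is
; the plain 8-row difference; integral_init4v additionally combines two 4-wide columns
; into 8-wide sums, which is what the shufpd/palignr + paddw above does. Scalar C
; approximation (loop bounds simplified; the real code stops a few columns early):
;
;   static void integral_init4v_c( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;   {
;       for( int x = 0; x < stride; x++ )
;           sum4[x] = sum8[x+4*stride] - sum8[x];                      /* 4x4 block sums */
;       for( int x = 0; x < stride; x++ )
;           sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4]
;                   - sum8[x]          - sum8[x+4];                    /* 8x8 block sums */
;   }
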
%macro FILT8x4 7
    mova  %3, [r0+%7]
    mova  %4, [r0+r5+%7]
    pavgb %3, %4
    pavgb %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
%if cpuflag(xop)
    pavgb %1, %3
    pavgb %2, %4
%else
    pavgb %1, %3
    pavgb %2, %4
    psrlw %5, %1, 8
    psrlw %6, %2, 8
    pand  %1, m7
    pand  %2, m7
%endif
%endmacro

%macro FILT32x4U 4
    mova  m1, [r0+r5]
    pavgb m0, m1, [r0]
    movu  m3, [r0+r5+1]
    pavgb m2, m3, [r0+1]
    pavgb m1, [r0+r5*2]
    pavgb m3, [r0+r5*2+1]
    pavgb m0, m2
    pavgb m1, m3
    mova  m3, [r0+r5+mmsize]
    pavgb m2, m3, [r0+mmsize]
    movu  m5, [r0+r5+1+mmsize]
    pavgb m4, m5, [r0+1+mmsize]
    pavgb m3, [r0+r5*2+mmsize]
    pavgb m5, [r0+r5*2+1+mmsize]
    pavgb m2, m4
    pavgb m3, m5
    pshufb m0, m7
    pshufb m1, m7
    pshufb m2, m7
    pshufb m3, m7
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq m0, m0, q3120
    vpermq m1, m4, q3120
    vpermq m2, m2, q3120
    vpermq m3, m5, q3120
    mova [%1], m0
    mova [%2], m1
    mova [%3], m2
    mova [%4], m3
%endmacro

%macro FILT16x2 4
    mova  m3, [r0+%4+mmsize]
    mova  m2, [r0+%4]
    pavgb m3, [r0+%4+r5+mmsize]
    pavgb m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    pavgb %1, m3
    PALIGNR m3, m2, 1, m6
    pavgb m3, m2
%if cpuflag(xop)
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
%else
    psrlw m5, m3, 8
    psrlw m4, %1, 8
    pand  m3, m7
    pand  %1, m7
    packuswb m3, %1
    packuswb m5, m4
%endif
    mova [%2], m3
    mova [%3], m5
    mova %1, m2
%endmacro

%macro FILT8x2U 3
    mova  m3, [r0+%3+8]
    mova  m2, [r0+%3]
    pavgb m3, [r0+%3+r5+8]
    pavgb m2, [r0+%3+r5]
    mova  m1, [r0+%3+9]
    mova  m0, [r0+%3+1]
    pavgb m1, [r0+%3+r5+9]
    pavgb m0, [r0+%3+r5+1]
    pavgb m1, m3
    pavgb m0, m2
    psrlw m3, m1, 8
    psrlw m2, m0, 8
    pand  m1, m7
    pand  m0, m7
    packuswb m0, m1
    packuswb m2, m3
    mova [%1], m0
    mova [%2], m2
%endmacro

%macro FILT8xU 3
    mova  m3, [r0+%3+8]
    mova  m2, [r0+%3]
    pavgw m3, [r0+%3+r5+8]
    pavgw m2, [r0+%3+r5]
    movu  m1, [r0+%3+10]
    movu  m0, [r0+%3+2]
    pavgw m1, [r0+%3+r5+10]
    pavgw m0, [r0+%3+r5+2]
    pavgw m1, m3
    pavgw m0, m2
    psrld m3, m1, 16
    psrld m2, m0, 16
    pand  m1, m7
    pand  m0, m7
    packssdw m0, m1
    packssdw m2, m3
    movu [%1], m0
    mova [%2], m2
%endmacro

%macro FILT8xA 4
    mova  m3, [r0+%4+mmsize]
    mova  m2, [r0+%4]
    pavgw m3, [r0+%4+r5+mmsize]
    pavgw m2, [r0+%4+r5]
    PALIGNR %1, m3, 2, m6
    pavgw %1, m3
    PALIGNR m3, m2, 2, m6
    pavgw m3, m2
%if cpuflag(xop)
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
%else
    psrld m5, m3, 16
    psrld m4, %1, 16
    pand  m3, m7
    pand  %1, m7
    packssdw m3, %1
    packssdw m5, m4
%endif
    mova [%2], m3
    mova [%3], m5
    mova %1, m2
%endmacro

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%if HIGH_BIT_DEPTH
    shl dword r6m, 1
    FIX_STRIDES r5
    shl dword r7m, 1
%endif
%if mmsize >= 16
    add dword r7m, mmsize-1
    and dword r7m, ~(mmsize-1)
%endif
    ; src += 2*(height-1)*stride + 2*width
    mov  r6d, r8m
    dec  r6d
    imul r6d, r5d
    add  r6d, r7m
    lea  r0, [r0+r6*2]
    ; dst += (height-1)*stride + width
    mov  r6d, r8m
    dec  r6d
    imul r6d, r6m
    add  r6d, r7m
    add  r1, r6
    add  r2, r6
    add  r3, r6
    add  r4, r6
    ; gap = stride - width
    mov  r6d, r6m
    sub  r6d, r7m
    PUSH r6
%define dst_gap [rsp+gprsize]
    mov  r6d, r5d
    sub  r6d, r7m
    shl  r6d, 1
    PUSH r6
%define src_gap [rsp]
%if HIGH_BIT_DEPTH
%if cpuflag(xop)
    mova m6, [deinterleave_shuf32a]
    mova m7, [deinterleave_shuf32b]
%else
    pcmpeqw m7, m7
    psrld   m7, 16
%endif
.vloop:
    mov r6d, r7m
%ifnidn cpuname, mmx2
    mova  m0, [r0]
    mova  m1, [r0+r5]
    pavgw m0, m1
    pavgw m1, [r0+r5*2]
%endif
.hloop:
    sub r0, mmsize*2
    sub r1, mmsize
    sub r2, mmsize
    sub r3, mmsize
    sub r4, mmsize
%ifidn cpuname, mmx2
    FILT8xU r1, r2, 0
    FILT8xU r3, r4, r5
%else
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%endif
    sub r6d, mmsize
    jg .hloop
%else ; !HIGH_BIT_DEPTH
%if cpuflag(avx2)
    vbroadcasti128 m7, [deinterleave_shuf]
%elif cpuflag(xop)
    mova m6, [deinterleave_shuf32a]
    mova m7, [deinterleave_shuf32b]
%else
    pcmpeqb m7, m7
    psrlw   m7, 8
%endif
.vloop:
    mov r6d, r7m
%ifnidn cpuname, mmx2
%if mmsize <= 16
    mova  m0, [r0]
    mova  m1, [r0+r5]
    pavgb m0, m1
    pavgb m1, [r0+r5*2]
%endif
%endif
.hloop:
    sub r0, mmsize*2
    sub r1, mmsize
    sub r2, mmsize
    sub r3, mmsize
    sub r4, mmsize
%if mmsize==32
    FILT32x4U r1, r2, r3, r4
%elifdef m8
    FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
    mova m8, m0
    mova m9, m1
    FILT8x4 m2, m3, m0, m1, m4, m5, 0
%if cpuflag(xop)
    vpperm m4, m2, m8, m7
    vpperm m2, m2, m8, m6
    vpperm m5, m3, m9, m7
    vpperm m3, m3, m9, m6
%else
    packuswb m2, m8
    packuswb m3, m9
    packuswb m4, m10
    packuswb m5, m11
%endif
    mova [r1], m2
    mova [r2], m4
    mova [r3], m3
    mova [r4], m5
%elifidn cpuname, mmx2
    FILT8x2U r1, r2, 0
    FILT8x2U r3, r4, r5
%else
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endif
    sub r6d, mmsize
    jg .hloop
%endif ; HIGH_BIT_DEPTH
.skip:
    mov r6, dst_gap
    sub r0, src_gap
    sub r1, r6
    sub r2, r6
    sub r3, r6
    sub r4, r6
    dec dword r8m
    jg .vloop
    ADD rsp, 2*gprsize
    emms
    RET
%endmacro ; FRAME_INIT_LOWRES
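
; Reference sketch (comment only): the four lowres planes are half-resolution
; downscales of src0 at the four half-pel phases, built from nested pavgb-style
; averages. Scalar C (roughly the shape of the C fallback; AVG matches pavgb):
;
;   #define AVG(a,b) (((a)+(b)+1)>>1)
;   static void frame_init_lowres_core_c( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
;                                         uint8_t *dstv, uint8_t *dstc,
;                                         intptr_t src_stride, intptr_t dst_stride,
;                                         int width, int height )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           uint8_t *src1 = src0 + src_stride;
;           uint8_t *src2 = src1 + src_stride;
;           for( int x = 0; x < width; x++ )
;           {   /* same filtering as the asm: average of two vertical averages */
;               dst0[x] = AVG( AVG(src0[2*x  ], src1[2*x  ]), AVG(src0[2*x+1], src1[2*x+1]) );
;               dsth[x] = AVG( AVG(src0[2*x+1], src1[2*x+1]), AVG(src0[2*x+2], src1[2*x+2]) );
;               dstv[x] = AVG( AVG(src1[2*x  ], src2[2*x  ]), AVG(src1[2*x+1], src2[2*x+1]) );
;               dstc[x] = AVG( AVG(src1[2*x+1], src2[2*x+1]), AVG(src1[2*x+2], src2[2*x+2]) );
;           }
;           src0 += src_stride*2;
;           dst0 += dst_stride; dsth += dst_stride; dstv += dst_stride; dstc += dst_stride;
;       }
;   }
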
INIT_MMX mmx2
FRAME_INIT_LOWRES
%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
FRAME_INIT_LOWRES
%endif
INIT_XMM sse2
FRAME_INIT_LOWRES
INIT_XMM ssse3
FRAME_INIT_LOWRES
INIT_XMM avx
FRAME_INIT_LOWRES
INIT_XMM xop
FRAME_INIT_LOWRES
%if HIGH_BIT_DEPTH==0
INIT_YMM avx2
FRAME_INIT_LOWRES
%endif

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
%macro MBTREE 0
cglobal mbtree_propagate_cost, 6,6,7
    movss  m6, [r5]
    mov    r5d, r6m
    lea    r0, [r0+r5*2]
    add    r5d, r5d
    add    r1, r5
    add    r2, r5
    add    r3, r5
    add    r4, r5
    neg    r5
    pxor   m4, m4
    shufps m6, m6, 0
    mova   m5, [pw_3fff]
.loop:
    movq      m2, [r2+r5] ; intra
    movq      m0, [r4+r5] ; invq
    movq      m3, [r3+r5] ; inter
    movq      m1, [r1+r5] ; prop
    pand      m3, m5
    pminsw    m3, m2
    punpcklwd m2, m4
    punpcklwd m0, m4
    pmaddwd   m0, m2
    punpcklwd m1, m4
    punpcklwd m3, m4
%if cpuflag(fma4)
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    fmaddps  m0, m0, m6, m1
    cvtdq2ps m1, m2
    psubd    m2, m3
    cvtdq2ps m2, m2
    rcpps    m3, m1
    mulps    m1, m3
    mulps    m0, m2
    addps    m2, m3, m3
    fnmaddps m3, m1, m3, m2
    mulps    m0, m3
%else
    cvtdq2ps m0, m0
    mulps    m0, m6 ; intra*invq*fps_factor>>8
    cvtdq2ps m1, m1 ; prop
    addps    m0, m1 ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps m1, m2 ; intra
    psubd    m2, m3 ; intra - inter
    cvtdq2ps m2, m2 ; intra - inter
    rcpps    m3, m1 ; 1 / intra 1st approximation
    mulps    m1, m3 ; intra * (1/intra 1st approx)
    mulps    m1, m3 ; intra * (1/intra 1st approx)^2
    mulps    m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps    m3, m3 ; 2 * (1/intra 1st approx)
    subps    m3, m1 ; 2nd approximation for 1/intra
    mulps    m0, m3 ; / intra
%endif
    cvtps2dq m0, m0
    packssdw m0, m0
    movh [r0+r5], m0
    add r5, 8
    jl .loop
    RET
%endmacro
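
; Reference sketch (comment only): the cost formula evaluated by the SIMD above,
; as scalar C (roughly the C fallback; the asm replaces the division by intra_cost
; with rcpps plus one Newton-Raphson refinement step, which is what the
; "1st/2nd approximation" comments refer to, and saturates the result to 16 bits):
;
;   static void mbtree_propagate_cost_c( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                                        uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;   {
;       float fps = *fps_factor;
;       for( int i = 0; i < len; i++ )
;       {
;           int intra_cost = intra_costs[i];
;           int inter_cost = inter_costs[i] & 0x3fff;
;           if( inter_cost > intra_cost )
;               inter_cost = intra_cost;
;           float propagate_intra  = intra_cost * inv_qscales[i];   /* fps_factor carries the >>8 scaling */
;           float propagate_amount = propagate_in[i] + propagate_intra * fps;
;           dst[i] = (int16_t)(propagate_amount * (intra_cost - inter_cost) / intra_cost + 0.5f);
;       }
;   }
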
INIT_XMM sse2
MBTREE
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
INIT_XMM fma4
MBTREE

%macro INT16_UNPACK 1
    punpckhwd   xm6, xm%1, xm7
    punpcklwd   xm%1, xm7
    vinsertf128 m%1, m%1, xm6, 1
%endmacro

; FIXME: align loads to 16 bytes
%macro MBTREE_AVX 0
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
    vbroadcastss m5, [r5]
    mov  r5d, r6m
    lea  r2, [r2+r5*2]
    add  r5d, r5d
    add  r4, r5
    neg  r5
    sub  r1, r5
    sub  r3, r5
    sub  r0, r5
    mova xm4, [pw_3fff]
%if notcpuflag(avx2)
    pxor xm7, xm7
%endif
.loop:
%if cpuflag(avx2)
    pmovzxwd m0, [r2+r5]       ; intra
    pmovzxwd m1, [r4+r5]       ; invq
    pmovzxwd m2, [r1+r5]       ; prop
    pand     xm3, xm4, [r3+r5] ; inter
    pmovzxwd m3, xm3
    pmaddwd  m1, m0
    psubusw  m3, m0, m3
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    fmaddps  m1, m1, m5, m2
    rcpps    m2, m0
    mulps    m0, m2
    mulps    m1, m3
    addps    m3, m2, m2
    fnmaddps m2, m2, m0, m3
    mulps    m1, m2
%else
    movu xm0, [r2+r5]
    movu xm1, [r4+r5]
    movu xm2, [r1+r5]
    pand xm3, xm4, [r3+r5]
    psubusw xm3, xm0, xm3
    INT16_UNPACK 0
    INT16_UNPACK 1
    INT16_UNPACK 2
    INT16_UNPACK 3
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    mulps m1, m0
    mulps m1, m5 ; intra*invq*fps_factor>>8
    addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
    rcpps m2, m0 ; 1 / intra 1st approximation
    mulps m0, m2 ; intra * (1/intra 1st approx)
    mulps m0, m2 ; intra * (1/intra 1st approx)^2
    mulps m1, m3 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps m2, m2 ; 2 * (1/intra 1st approx)
    subps m2, m0 ; 2nd approximation for 1/intra
    mulps m1, m2 ; / intra
%endif
    cvtps2dq m1, m1
    vextractf128 xm2, m1, 1
    packssdw xm1, xm2
    mova [r0+r5], xm1
    add r5, 16
    jl .loop
    RET
%endmacro

INIT_YMM avx
MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX

INIT_ZMM avx512
cglobal mbtree_propagate_cost, 6,6
    vbroadcastss m5, [r5]
    mov r5d, 0x3fff3fff
    vpbroadcastd ym4, r5d
    mov r5d, r6m
    lea r2, [r2+r5*2]
    add r5d, r5d
    add r1, r5
    neg r5
    sub r4, r5
    sub r3, r5
    sub r0, r5
.loop:
    pmovzxwd m0, [r2+r5]       ; intra
    pmovzxwd m1, [r1+r5]       ; prop
    pmovzxwd m2, [r4+r5]       ; invq
    pand     ym3, ym4, [r3+r5] ; inter
    pmovzxwd m3, ym3
    psubusw  m3, m0, m3
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    vdivps   m1, m0, {rn-sae}
    fmaddps  m1, m2, m5, m1
    mulps    m1, m3
    cvtps2dq m1, m1
    vpmovsdw [r0+r5], m1
    add r5, 32
    jl .loop
    RET

%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
;                                      int16_t *output, int bipred_weight, int mb_y, int len )
;-----------------------------------------------------------------------------
cglobal mbtree_propagate_list_internal, 4,6,8
    movh      m6, [pw_0to15] ; mb_x
    movd      m7, r5m
    pshuflw   m7, m7, 0
    punpcklwd m6, m7         ; 0 y 1 y 2 y 3 y
    movd      m7, r4m
    SPLATW    m7, m7         ; bipred_weight
    psllw     m7, 9          ; bipred_weight << 9
    mov       r5d, r6m
    xor       r4d, r4d
.loop:
    mova     m3, [r1+r4*2]
    movu     m4, [r2+r4*2]
    mova     m5, [pw_0xc000]
    pand     m4, m5
    pcmpeqw  m4, m5
    pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
%if cpuflag(avx)
    pblendvb m5, m3, m5, m4
%else
    pand     m5, m4
    pandn    m4, m3
    por      m5, m4 ; if( lists_used == 3 )
                    ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
%endif
    movu     m0, [r0+r4*4] ; x,y
    movu     m1, [r0+r4*4+mmsize]
    psraw    m2, m0, 5
    psraw    m3, m1, 5
    mova     m4, [pd_4]
    paddw    m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw    m6, m4 ; {mbx, mby} += {4, 0}
    paddw    m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
    paddw    m6, m4 ; {mbx, mby} += {4, 0}
    mova [r3+mmsize*0], m2
    mova [r3+mmsize*1], m3
    mova     m3, [pw_31]
    pand     m0, m3 ; x &= 31
    pand     m1, m3 ; y &= 31
    packuswb m0, m1
    psrlw    m1, m0, 3
    pand     m0, m3 ; x
    SWAP 1, 3
    pandn    m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw
    mova     m3, [pw_32]
    psubw    m3, m0     ; 32 - x
    mova     m4, [pw_1024]
    psubw    m4, m1     ; (32 - y) << 5
    pmullw   m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
    pmullw   m4, m0     ; idx1weight = (32-y)*x << 5
    pmullw   m0, m1     ; idx3weight = y*x << 5
    pmullw   m1, m3     ; idx2weight = y*(32-x) << 5
    ; avoid overflow in the input to pmulhrsw
    psrlw    m3, m2, 15
    psubw    m2, m3 ; idx0weight -= (idx0weight == 32768)
    pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10
    pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10
    pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10
    pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10
    SBUTTERFLY wd, 2, 4, 3
    SBUTTERFLY wd, 1, 0, 3
    mova [r3+mmsize*2], m2
    mova [r3+mmsize*3], m4
    mova [r3+mmsize*4], m1
    mova [r3+mmsize*5], m0
    add r4d, mmsize/2
    add r3, mmsize*6
    cmp r4d, r5d
    jl .loop
    REP_RET
%endmacro
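
; Reference sketch (comment only): each motion vector splats its propagate_amount
; onto the four macroblocks it overlaps, weighted bilinearly by the fractional MB
; position. Scalar C for one element of the loop above, assuming mb_x is the
; macroblock column of element i and matching the weight comments in the code
; (the exact output interleaving is left to the stores above):
;
;   int mbx = (mvs[i][0] >> 5) + mb_x;
;   int mby = (mvs[i][1] >> 5) + mb_y;
;   int x = mvs[i][0] & 31, y = mvs[i][1] & 31;
;   int amount = (lowres_costs[i] & 0xc000) == 0xc000            /* lists_used == 3 */
;              ? (propagate_amount[i] * bipred_weight + 32) >> 6
;              :  propagate_amount[i];
;   int idx0weight = ((32-x)*(32-y)*amount + 512) >> 10;  /* top-left     */
;   int idx1weight = (   x  *(32-y)*amount + 512) >> 10;  /* top-right    */
;   int idx2weight = ((32-x)*  y  *amount + 512) >> 10;   /* bottom-left  */
;   int idx3weight = (   x  *  y  *amount + 512) >> 10;   /* bottom-right */
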
INIT_XMM ssse3
MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST

INIT_YMM avx2
cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
    mova xm4, [pw_0xc000]
%if UNIX64
    shl  r4d, 9
    shl  r5d, 16
    movd xm5, r4d
    movd xm6, r5d
    vpbroadcastw xm5, xm5
    vpbroadcastd m6, xm6
%else
    vpbroadcastw xm5, r4m
    vpbroadcastd m6, r5m
    psllw xm5, 9 ; bipred_weight << 9
    pslld m6, 16
%endif
    mov r4d, r6m
    lea r1, [r1+r4*2]
    lea r2, [r2+r4*2]
    lea r0, [r0+r4*4]
    neg r4
    por m6, [pd_0123] ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
    vbroadcasti128 m7, [pw_31]
.loop:
    mova     xm3, [r1+r4*2]
    pand     xm0, xm4, [r2+r4*2]
    pmulhrsw xm1, xm3, xm5      ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6
    pcmpeqw  xm0, xm4
    pblendvb xm3, xm3, xm1, xm0 ; (lists_used == 3) ? bipred_amount : propagate_amount
    vpermq   m3, m3, q1100
    movu     m0, [r0+r4*4]      ; {x, y}
    vbroadcasti128 m1, [pd_8]
    psraw    m2, m0, 5
    paddw    m2, m6             ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
    paddw    m6, m1             ; i_mb_x += 8
    mova [r3], m2
    mova     m1, [pw_32]
    pand     m0, m7
    psubw    m1, m0
    packuswb m1, m0             ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y}
    psrlw    m0, m1, 3
    pand     m1, [pw_00ff]      ; 32-x x 32-x x
    pandn    m0, m7, m0         ; (32-y y 32-y y) << 5
    pshufd   m2, m1, q1032
    pmullw   m1, m0             ; idx0 idx3 idx0 idx3
    pmullw   m2, m0             ; idx1 idx2 idx1 idx2
    pmulhrsw m0, m1, m3         ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10
    pmulhrsw m2, m3             ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10
    psignw   m0, m1             ; correct potential overflow in the idx0 input to pmulhrsw
    punpcklwd m1, m0, m2        ; idx01weight
    punpckhwd m2, m0            ; idx23weight
    mova [r3+32], m1
    mova [r3+64], m2
    add r3, 3*mmsize
    add r4, 8
    jl .loop
    RET

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
;                                                  uint16_t *lowres_costs, int bipred_weight, int mb_y,
;                                                  int width, int height, int stride, int list_mask );
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal mbtree_propagate_list_internal, 5,7,21
    mova            xm16, [pw_0xc000]
    vpbroadcastw    xm17, r5m      ; bipred_weight << 9
    vpbroadcastw    ym18, r10m     ; 1 << (list+LOWRES_COST_SHIFT)
    vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf]
    vbroadcasti32x8 m6, [pd_0123]
    vpord           m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
    vbroadcasti128  m7, [pd_8]
    vbroadcasti128  m8, [pw_31]
    vbroadcasti128  m9, [pw_32]
    psllw           m10, m9, 4
    pcmpeqw         ym19, ym19     ; pw_m1
    vpbroadcastw    ym20, r7m      ; width
    psrld           m11, m7, 3     ; pd_1
    psrld           m12, m8, 16    ; pd_31
    vpbroadcastd    m13, r8m       ; height
    vpbroadcastd    m14, r9m       ; stride
    pslld           m15, m14, 16
    por             m15, m11       ; {1, stride, 1, stride} ...
    lea             r4, [r4+2*r0]  ; lowres_costs
    lea             r3, [r3+2*r0]  ; propagate_amount
    lea             r2, [r2+4*r0]  ; mvs
    neg             r0
    mov             r6d, 0x5555ffff
    kmovd           k4, r6d
    kshiftrd        k5, k4, 16     ; 0x5555
    kshiftlw        k6, k4, 8      ; 0xff00
.loop:
    vbroadcasti128  ym1, [r4+2*r0]
    mova            xm4, [r3+2*r0]
    vpcmpuw         k1, xm1, xm16, 5 ; if (lists_used == 3)
    vpmulhrsw       xm4 {k1}, xm17   ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
    vptestmw        k1, ym1, ym18
    vpermw          m4, m5, m4
    vbroadcasti32x8 m3, [r2+4*r0]    ; {mvx, mvy}
    psraw           m0, m3, 5
    paddw           m0, m6           ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
    paddd           m6, m7           ; i_mb_x += 8
    pand            m3, m8           ; {x, y}
    vprold          m1, m3, 20       ; {y, x} << 4
    vpsubw          m3 {k4}, m9, m3  ; {32-x, 32-y}, {32-x, y}
    vpsubw          m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
    pmullw          m3, m1
    paddsw          m3, m3           ; prevent signed overflow in idx0 (32*32<<5 == 0x8000)
    pmulhrsw        m2, m3, m4       ; idx01weight idx23weightp
    pslld           ym1, ym0, 16
    psubw           ym1, ym19
    vmovdqu16       ym1 {k5}, ym0
    vpcmpuw         k2, ym1, ym20, 1 ; {mbx, mbx+1} < width
    kunpckwd        k2, k2, k2
    psrad           m1, m0, 16
    vpaddd          m1 {k6}, m11
    vpcmpud         k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height
    pmaddwd         m0, m15
    vpaddd          m0 {k6}, m14        ; idx0 | idx2
    vmovdqu16       m2 {k2}{z}, m2      ; idx01weight | idx23weight
    vptestmd        k1 {k1}, m2, m2     ; mask out offsets with no changes
    ; We're handling dwords, but the offsets are in words so there may be partial overlaps.
    ; We can work around this by handling dword-aligned and -unaligned offsets separately.
    vptestmd        k0, m0, m11
    kandnw          k2, k0, k1          ; dword-aligned offsets
    kmovw           k3, k2
    vpgatherdd      m3 {k2}, [r1+2*m0]
    ; If there are conflicts in the offsets we have to handle them before storing the results.
    ; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel
    ; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets.
    vpconflictd     m4, m0
    vpbroadcastmw2d m1, k1
    vptestmd        k2, m1, m4
    ktestw          k2, k2
    jz .no_conflicts
    pand            m1, m4              ; mask away unused offsets to avoid false positives
    vplzcntd        m1, m1
    pxor            m1, m12             ; lzcnt gives us the distance from the msb, we want it from the lsb
.conflict_loop:
    vpermd          m4 {k2}{z}, m1, m2
    vpermd          m1 {k2}, m1, m1     ; shift the index one step forward
    paddsw          m2, m4              ; add the weights of conflicting offsets
    vpcmpd          k2, m1, m12, 2
    ktestw          k2, k2
    jnz .conflict_loop
.no_conflicts:
    paddsw          m3, m2
    vpscatterdd [r1+2*m0] {k3}, m3
    kandw           k1, k0, k1          ; dword-unaligned offsets
    kmovw           k2, k1
    vpgatherdd      m1 {k1}, [r1+2*m0]
    paddsw          m1, m2              ; all conflicts have already been resolved
    vpscatterdd [r1+2*m0] {k2}, m1
    add r0, 8
    jl .loop
    RET
%endif
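
; Reference sketch (comment only): the gather/conflict/scatter sequence above is a
; masked scatter-add. In scalar terms, each loop iteration (names hypothetical) does:
;
;   for( int j = 0; j < 16; j++ )          /* up to 16 (offset, weight-pair) lanes */
;       if( valid[j] )                     /* in-frame and non-zero weights        */
;           ref_costs_dwords[ offset[j] ] += weight_pair[j];  /* saturating word adds */
;
; vpconflictd flags lanes that share an offset, and vplzcntd turns that into a
; permutation chain so duplicate lanes are summed in ceil(log2(n)) vpermd steps
; before a single vpscatterdd writes each offset exactly once.
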
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
;-----------------------------------------------------------------------------
cglobal mbtree_fix8_pack, 3,4
%if mmsize == 32
    vbroadcastf128 m2, [pf_256]
    vbroadcasti128 m3, [mbtree_fix8_pack_shuf]
%else
    movaps m2, [pf_256]
    mova   m3, [mbtree_fix8_pack_shuf]
%endif
    sub r2d, mmsize/2
    movsxdifnidn r2, r2d
    lea r1, [r1+4*r2]
    lea r0, [r0+2*r2]
    neg r2
    jg .skip_loop
.loop:
    mulps     m0, m2, [r1+4*r2]
    mulps     m1, m2, [r1+4*r2+mmsize]
    cvttps2dq m0, m0
    cvttps2dq m1, m1
    packssdw  m0, m1
    pshufb    m0, m3
%if mmsize == 32
    vpermq    m0, m0, q3120
%endif
    mova [r0+2*r2], m0
    add r2, mmsize/2
    jle .loop
.skip_loop:
    sub r2, mmsize/2
    jz .end
    ; Do the remaining values in scalar in order to avoid overreading src.
.scalar:
    mulss     xm0, xm2, [r1+4*r2+2*mmsize]
    cvttss2si r3d, xm0
    rol       r3w, 8
    mov [r0+2*r2+mmsize], r3w
    inc r2
    jl .scalar
.end:
    RET

;-----------------------------------------------------------------------------
; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
;-----------------------------------------------------------------------------
cglobal mbtree_fix8_unpack, 3,4
%if mmsize == 32
    vbroadcastf128 m2, [pf_inv16777216]
%else
    movaps m2, [pf_inv16777216]
    mova   m4, [mbtree_fix8_unpack_shuf+16]
%endif
    mova m3, [mbtree_fix8_unpack_shuf]
    sub r2d, mmsize/2
    movsxdifnidn r2, r2d
    lea r1, [r1+2*r2]
    lea r0, [r0+4*r2]
    neg r2
    jg .skip_loop
.loop:
%if mmsize == 32
    vbroadcasti128 m0, [r1+2*r2]
    vbroadcasti128 m1, [r1+2*r2+16]
    pshufb m0, m3
    pshufb m1, m3
%else
    mova   m1, [r1+2*r2]
    pshufb m0, m1, m3
    pshufb m1, m4
%endif
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    mulps    m0, m2
    mulps    m1, m2
    movaps [r0+4*r2], m0
    movaps [r0+4*r2+mmsize], m1
    add r2, mmsize/2
    jle .loop
.skip_loop:
    sub r2, mmsize/2
    jz .end
.scalar:
    movzx r3d, word [r1+2*r2+mmsize]
    bswap r3d
    ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on
    ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design.
    cvtsi2ss xm0, xm2, r3d
    mulss    xm0, xm2
    movss [r0+4*r2+2*mmsize], xm0
    inc r2
    jl .scalar
.end:
    RET
%endmacro
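
; Reference sketch (comment only): "fix8" is a big-endian 8.8 fixed-point format.
; Scalar C for the two conversions above (roughly the C fallback; the byte swap is
; what rol r3w,8 / bswap perform in the asm):
;
;   static void mbtree_fix8_pack_c( uint16_t *dst, float *src, int count )
;   {
;       for( int i = 0; i < count; i++ )
;       {
;           uint16_t v = (uint16_t)(int)(src[i] * 256.0f);
;           dst[i] = (uint16_t)((v << 8) | (v >> 8));        /* store big-endian */
;       }
;   }
;   static void mbtree_fix8_unpack_c( float *dst, uint16_t *src, int count )
;   {
;       for( int i = 0; i < count; i++ )
;       {
;           uint16_t v = (uint16_t)((src[i] << 8) | (src[i] >> 8));
;           dst[i] = (int16_t)v * (1.0f/256.0f);             /* == v<<16 scaled by 1/2^24 */
;       }
;   }
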
INIT_XMM ssse3
MBTREE_FIX8
INIT_YMM avx2
MBTREE_FIX8

%macro MBTREE_FIX8_AVX512_END 0
    add r2, mmsize/2
    jle .loop
    cmp r2d, mmsize/2
    jl .tail
    RET
.tail:
    ; Do the final loop iteration with partial masking to handle the remaining elements.
    shrx  r3d, r3d, r2d ; (1 << count) - 1
    kmovd k1, r3d
    kshiftrd k2, k1, 16
    jmp .loop
%endmacro

INIT_ZMM avx512
cglobal mbtree_fix8_pack, 3,4
    vbroadcastf32x4 m2, [pf_256]
    vbroadcasti32x4 m3, [mbtree_fix8_pack_shuf]
    psrld    xm4, xm3, 4
    pmovzxbq m4, xm4
    sub r2d, mmsize/2
    mov r3d, -1
    movsxdifnidn r2, r2d
    lea r1, [r1+4*r2]
    lea r0, [r0+2*r2]
    neg r2
    jg .tail
    kmovd k1, r3d
    kmovw k2, k1
.loop:
    vmulps    m0 {k1}{z}, m2, [r1+4*r2]
    vmulps    m1 {k2}{z}, m2, [r1+4*r2+mmsize]
    cvttps2dq m0, m0
    cvttps2dq m1, m1
    packssdw  m0, m1
    pshufb    m0, m3
    vpermq    m0, m4, m0
    vmovdqu16 [r0+2*r2] {k1}, m0
    MBTREE_FIX8_AVX512_END

cglobal mbtree_fix8_unpack, 3,4
    vbroadcasti32x8 m3, [mbtree_fix8_unpack_shuf]
    vbroadcastf32x4 m2, [pf_inv16777216]
    sub r2d, mmsize/2
    mov r3d, -1
    movsxdifnidn r2, r2d
    lea r1, [r1+2*r2]
    lea r0, [r0+4*r2]
    neg r2
    jg .tail
    kmovw k1, r3d
    kmovw k2, k1
.loop:
    mova       m1, [r1+2*r2]
    vshufi32x4 m0, m1, m1, q1100
    vshufi32x4 m1, m1, m1, q3322
    pshufb     m0, m3
    pshufb     m1, m3
    cvtdq2ps   m0, m0
    cvtdq2ps   m1, m1
    mulps      m0, m2
    mulps      m1, m2
    vmovaps [r0+4*r2] {k1}, m0
    vmovaps [r0+4*r2+mmsize] {k2}, m1
    MBTREE_FIX8_AVX512_END