quant-a.asm 50 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274
  1. ;*****************************************************************************
  2. ;* quant-a.asm: x86 quantization and level-run
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2018 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Fiona Glaser <fiona@x264.com>
  8. ;* Christian Heine <sennindemokrit@gmx.net>
  9. ;* Oskar Arvidsson <oskar@irock.se>
  10. ;* Henrik Gramner <henrik@gramner.com>
  11. ;*
  12. ;* This program is free software; you can redistribute it and/or modify
  13. ;* it under the terms of the GNU General Public License as published by
  14. ;* the Free Software Foundation; either version 2 of the License, or
  15. ;* (at your option) any later version.
  16. ;*
  17. ;* This program is distributed in the hope that it will be useful,
  18. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. ;* GNU General Public License for more details.
  21. ;*
  22. ;* You should have received a copy of the GNU General Public License
  23. ;* along with this program; if not, write to the Free Software
  24. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  25. ;*
  26. ;* This program is also available under a commercial proprietary license.
  27. ;* For more information, contact us at licensing@x264.com.
  28. ;*****************************************************************************
  29. %include "x86inc.asm"
  30. %include "x86util.asm"
  31. SECTION_RODATA 64
  32. %if HIGH_BIT_DEPTH
  33. decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15
  34. %else
  35. dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30
  36. dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62
  37. %endif
  38. %macro DQM4 3
  39. dw %1, %2, %1, %2, %2, %3, %2, %3
  40. %endmacro
  41. %macro DQM8 6
  42. dw %1, %4, %5, %4, %1, %4, %5, %4
  43. dw %4, %2, %6, %2, %4, %2, %6, %2
  44. dw %5, %6, %3, %6, %5, %6, %3, %6
  45. dw %4, %2, %6, %2, %4, %2, %6, %2
  46. %endmacro
  47. dequant8_scale:
  48. DQM8 20, 18, 32, 19, 25, 24
  49. DQM8 22, 19, 35, 21, 28, 26
  50. DQM8 26, 23, 42, 24, 33, 31
  51. DQM8 28, 25, 45, 26, 35, 33
  52. DQM8 32, 28, 51, 30, 40, 38
  53. DQM8 36, 32, 58, 34, 46, 43
  54. dequant4_scale:
  55. DQM4 10, 13, 16
  56. DQM4 11, 14, 18
  57. DQM4 13, 16, 20
  58. DQM4 14, 18, 23
  59. DQM4 16, 20, 25
  60. DQM4 18, 23, 29
  61. decimate_mask_table4:
  62. db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
  63. db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
  64. db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
  65. db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
  66. db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
  67. db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
  68. db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
  69. db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
  70. db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
  71. chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
  72. chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
  73. chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
  74. chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
  75. %if HIGH_BIT_DEPTH==0
  76. dct_coef_shuffle:
  77. %macro DCT_COEF_SHUFFLE 8
  78. %assign y x
  79. %rep 8
  80. %rep 7
  81. %rotate (~(y>>7))&1
  82. %assign y y<<((~(y>>7))&1)
  83. %endrep
  84. db %1*2
  85. %rotate 1
  86. %assign y y<<1
  87. %endrep
  88. %endmacro
  89. %assign x 0
  90. %rep 256
  91. DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
  92. %assign x x+1
  93. %endrep
  94. %endif
  95. SECTION .text
  96. cextern pb_1
  97. cextern pw_1
  98. cextern pw_2
  99. cextern pw_256
  100. cextern pd_1
  101. cextern pb_01
  102. cextern pd_1024
  103. cextern deinterleave_shufd
  104. cextern popcnt_table
  105. %macro QUANT_DC_START 2
  106. movd xm%1, r1m ; mf
  107. movd xm%2, r2m ; bias
  108. %if cpuflag(avx2)
  109. vpbroadcastdct m%1, xm%1
  110. vpbroadcastdct m%2, xm%2
  111. %elif HIGH_BIT_DEPTH
  112. SPLATD m%1, m%1
  113. SPLATD m%2, m%2
  114. %elif cpuflag(sse4) ; ssse3, but not faster on conroe
  115. mova m5, [pb_01]
  116. pshufb m%1, m5
  117. pshufb m%2, m5
  118. %else
  119. SPLATW m%1, m%1
  120. SPLATW m%2, m%2
  121. %endif
  122. %endmacro
  123. %macro QUANT_END 0
  124. xor eax, eax
  125. %if cpuflag(sse4)
  126. ptest m5, m5
  127. %else ; !sse4
  128. %if ARCH_X86_64
  129. %if mmsize == 16
  130. packsswb m5, m5
  131. %endif
  132. movq rcx, m5
  133. test rcx, rcx
  134. %else
  135. %if mmsize == 16
  136. pxor m4, m4
  137. pcmpeqb m5, m4
  138. pmovmskb ecx, m5
  139. cmp ecx, (1<<mmsize)-1
  140. %else
  141. packsswb m5, m5
  142. movd ecx, m5
  143. test ecx, ecx
  144. %endif
  145. %endif
  146. %endif ; cpuflag
  147. setne al
  148. %endmacro
  149. %if HIGH_BIT_DEPTH
  150. %macro QUANT_ONE_DC 4
  151. %if cpuflag(sse4)
  152. mova m0, [%1]
  153. ABSD m1, m0
  154. paddd m1, %3
  155. pmulld m1, %2
  156. psrad m1, 16
  157. %else ; !sse4
  158. mova m0, [%1]
  159. ABSD m1, m0
  160. paddd m1, %3
  161. mova m2, m1
  162. psrlq m2, 32
  163. pmuludq m1, %2
  164. pmuludq m2, %2
  165. psllq m2, 32
  166. paddd m1, m2
  167. psrld m1, 16
  168. %endif ; cpuflag
  169. PSIGND m1, m0
  170. mova [%1], m1
  171. ACCUM por, 5, 1, %4
  172. %endmacro
  173. %macro QUANT_TWO_DC 4
  174. %if cpuflag(sse4)
  175. mova m0, [%1 ]
  176. mova m1, [%1+mmsize]
  177. ABSD m2, m0
  178. ABSD m3, m1
  179. paddd m2, %3
  180. paddd m3, %3
  181. pmulld m2, %2
  182. pmulld m3, %2
  183. psrad m2, 16
  184. psrad m3, 16
  185. PSIGND m2, m0
  186. PSIGND m3, m1
  187. mova [%1 ], m2
  188. mova [%1+mmsize], m3
  189. ACCUM por, 5, 2, %4
  190. por m5, m3
  191. %else ; !sse4
  192. QUANT_ONE_DC %1, %2, %3, %4
  193. QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
  194. %endif ; cpuflag
  195. %endmacro
  196. %macro QUANT_ONE_AC_MMX 5
  197. mova m0, [%1]
  198. mova m2, [%2]
  199. ABSD m1, m0
  200. mova m4, m2
  201. paddd m1, [%3]
  202. mova m3, m1
  203. psrlq m4, 32
  204. psrlq m3, 32
  205. pmuludq m1, m2
  206. pmuludq m3, m4
  207. psllq m3, 32
  208. paddd m1, m3
  209. psrad m1, 16
  210. PSIGND m1, m0
  211. mova [%1], m1
  212. ACCUM por, %5, 1, %4
  213. %endmacro
  214. %macro QUANT_TWO_AC 5
  215. %if cpuflag(sse4)
  216. mova m0, [%1 ]
  217. mova m1, [%1+mmsize]
  218. ABSD m2, m0
  219. ABSD m3, m1
  220. paddd m2, [%3 ]
  221. paddd m3, [%3+mmsize]
  222. pmulld m2, [%2 ]
  223. pmulld m3, [%2+mmsize]
  224. psrad m2, 16
  225. psrad m3, 16
  226. PSIGND m2, m0
  227. PSIGND m3, m1
  228. mova [%1 ], m2
  229. mova [%1+mmsize], m3
  230. ACCUM por, %5, 2, %4
  231. por m%5, m3
  232. %else ; !sse4
  233. QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
  234. QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
  235. %endif ; cpuflag
  236. %endmacro
  237. ;-----------------------------------------------------------------------------
  238. ; int quant_2x2( int32_t dct[M*N], int mf, int bias )
  239. ;-----------------------------------------------------------------------------
  240. %macro QUANT_DC 2
  241. cglobal quant_%1x%2_dc, 3,3,8
  242. QUANT_DC_START 6,7
  243. %if %1*%2 <= mmsize/4
  244. QUANT_ONE_DC r0, m6, m7, 0
  245. %else
  246. %assign x 0
  247. %rep %1*%2/(mmsize/2)
  248. QUANT_TWO_DC r0+x, m6, m7, x
  249. %assign x x+mmsize*2
  250. %endrep
  251. %endif
  252. QUANT_END
  253. RET
  254. %endmacro
  255. ;-----------------------------------------------------------------------------
  256. ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
  257. ;-----------------------------------------------------------------------------
  258. %macro QUANT_AC 2
  259. cglobal quant_%1x%2, 3,3,8
  260. %assign x 0
  261. %rep %1*%2/(mmsize/2)
  262. QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
  263. %assign x x+mmsize*2
  264. %endrep
  265. QUANT_END
  266. RET
  267. %endmacro
  268. %macro QUANT_4x4 2
  269. QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
  270. QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
  271. %endmacro
  272. %macro QUANT_4x4x4 0
  273. cglobal quant_4x4x4, 3,3,8
  274. QUANT_4x4 0, 5
  275. QUANT_4x4 64, 6
  276. add r0, 128
  277. packssdw m5, m6
  278. QUANT_4x4 0, 6
  279. QUANT_4x4 64, 7
  280. packssdw m6, m7
  281. packssdw m5, m6 ; AAAA BBBB CCCC DDDD
  282. pxor m4, m4
  283. pcmpeqd m5, m4
  284. movmskps eax, m5
  285. xor eax, 0xf
  286. RET
  287. %endmacro
  288. INIT_XMM sse2
  289. QUANT_DC 2, 2
  290. QUANT_DC 4, 4
  291. QUANT_AC 4, 4
  292. QUANT_AC 8, 8
  293. QUANT_4x4x4
  294. INIT_XMM ssse3
  295. QUANT_DC 2, 2
  296. QUANT_DC 4, 4
  297. QUANT_AC 4, 4
  298. QUANT_AC 8, 8
  299. QUANT_4x4x4
  300. INIT_XMM sse4
  301. QUANT_DC 2, 2
  302. QUANT_DC 4, 4
  303. QUANT_AC 4, 4
  304. QUANT_AC 8, 8
  305. QUANT_4x4x4
  306. INIT_YMM avx2
  307. QUANT_DC 4, 4
  308. QUANT_AC 4, 4
  309. QUANT_AC 8, 8
  310. INIT_YMM avx2
  311. cglobal quant_4x4x4, 3,3,6
  312. QUANT_TWO_AC r0, r1, r2, 0, 4
  313. QUANT_TWO_AC r0+64, r1, r2, 0, 5
  314. add r0, 128
  315. packssdw m4, m5
  316. QUANT_TWO_AC r0, r1, r2, 0, 5
  317. QUANT_TWO_AC r0+64, r1, r2, 0, 1
  318. packssdw m5, m1
  319. packssdw m4, m5
  320. pxor m3, m3
  321. pcmpeqd m4, m3
  322. movmskps eax, m4
  323. mov edx, eax
  324. shr eax, 4
  325. and eax, edx
  326. xor eax, 0xf
  327. RET
  328. %endif ; HIGH_BIT_DEPTH
  329. %if HIGH_BIT_DEPTH == 0
  330. %macro QUANT_ONE 5
  331. ;;; %1 (m64) dct[y][x]
  332. ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
  333. ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
  334. mova m1, %1 ; load dct coeffs
  335. ABSW m0, m1, sign
  336. paddusw m0, %3 ; round
  337. pmulhuw m0, %2 ; divide
  338. PSIGNW m0, m1 ; restore sign
  339. mova %1, m0 ; store
  340. ACCUM por, %5, 0, %4
  341. %endmacro
  342. %macro QUANT_TWO 8
  343. mova m1, %1
  344. mova m3, %2
  345. ABSW m0, m1, sign
  346. ABSW m2, m3, sign
  347. paddusw m0, %5
  348. paddusw m2, %6
  349. pmulhuw m0, %3
  350. pmulhuw m2, %4
  351. PSIGNW m0, m1
  352. PSIGNW m2, m3
  353. mova %1, m0
  354. mova %2, m2
  355. ACCUM por, %8, 0, %7
  356. ACCUM por, %8, 2, %7+mmsize
  357. %endmacro
  358. ;-----------------------------------------------------------------------------
  359. ; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
  360. ;-----------------------------------------------------------------------------
  361. %macro QUANT_DC 2-3 0
  362. cglobal %1, 1,1,%3
  363. %if %2==1
  364. QUANT_DC_START 2,3
  365. QUANT_ONE [r0], m2, m3, 0, 5
  366. %else
  367. QUANT_DC_START 4,6
  368. %assign x 0
  369. %rep %2/2
  370. QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
  371. %assign x x+mmsize*2
  372. %endrep
  373. %endif
  374. QUANT_END
  375. RET
  376. %endmacro
  377. ;-----------------------------------------------------------------------------
  378. ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
  379. ;-----------------------------------------------------------------------------
  380. %macro QUANT_AC 2
  381. cglobal %1, 3,3
  382. %if %2==1
  383. QUANT_ONE [r0], [r1], [r2], 0, 5
  384. %else
  385. %assign x 0
  386. %rep %2/2
  387. QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
  388. %assign x x+mmsize*2
  389. %endrep
  390. %endif
  391. QUANT_END
  392. RET
  393. %endmacro
  394. %macro QUANT_4x4 2
  395. %if UNIX64
  396. QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
  397. %else
  398. QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
  399. %if mmsize==8
  400. QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
  401. %endif
  402. %endif
  403. %endmacro
  404. %macro QUANT_4x4x4 0
  405. cglobal quant_4x4x4, 3,3,7
  406. %if UNIX64
  407. mova m8, [r1+mmsize*0]
  408. mova m9, [r1+mmsize*1]
  409. mova m10, [r2+mmsize*0]
  410. mova m11, [r2+mmsize*1]
  411. %endif
  412. QUANT_4x4 0, 4
  413. QUANT_4x4 32, 5
  414. packssdw m4, m5
  415. QUANT_4x4 64, 5
  416. QUANT_4x4 96, 6
  417. packssdw m5, m6
  418. packssdw m4, m5 ; AAAA BBBB CCCC DDDD
  419. pxor m3, m3
  420. pcmpeqd m4, m3
  421. movmskps eax, m4
  422. xor eax, 0xf
  423. RET
  424. %endmacro
  425. INIT_MMX mmx2
  426. QUANT_DC quant_2x2_dc, 1
  427. %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
  428. QUANT_DC quant_4x4_dc, 4
  429. INIT_MMX mmx2
  430. QUANT_AC quant_4x4, 4
  431. QUANT_AC quant_8x8, 16
  432. %endif
  433. INIT_XMM sse2
  434. QUANT_DC quant_4x4_dc, 2, 7
  435. QUANT_AC quant_4x4, 2
  436. QUANT_AC quant_8x8, 8
  437. QUANT_4x4x4
  438. INIT_XMM ssse3
  439. QUANT_DC quant_4x4_dc, 2, 7
  440. QUANT_AC quant_4x4, 2
  441. QUANT_AC quant_8x8, 8
  442. QUANT_4x4x4
  443. INIT_MMX ssse3
  444. QUANT_DC quant_2x2_dc, 1
  445. INIT_XMM sse4
  446. ;Not faster on Conroe, so only used in SSE4 versions
  447. QUANT_DC quant_4x4_dc, 2, 7
  448. QUANT_AC quant_4x4, 2
  449. QUANT_AC quant_8x8, 8
  450. INIT_YMM avx2
  451. QUANT_AC quant_4x4, 1
  452. QUANT_AC quant_8x8, 4
  453. QUANT_DC quant_4x4_dc, 1, 6
  454. INIT_YMM avx2
  455. cglobal quant_4x4x4, 3,3,6
  456. mova m2, [r1]
  457. mova m3, [r2]
  458. QUANT_ONE [r0+ 0], m2, m3, 0, 4
  459. QUANT_ONE [r0+32], m2, m3, 0, 5
  460. packssdw m4, m5
  461. QUANT_ONE [r0+64], m2, m3, 0, 5
  462. QUANT_ONE [r0+96], m2, m3, 0, 1
  463. packssdw m5, m1
  464. packssdw m4, m5
  465. pxor m3, m3
  466. pcmpeqd m4, m3
  467. movmskps eax, m4
  468. mov edx, eax
  469. shr eax, 4
  470. and eax, edx
  471. xor eax, 0xf
  472. RET
  473. %endif ; !HIGH_BIT_DEPTH
  474. ;=============================================================================
  475. ; dequant
  476. ;=============================================================================
  477. %macro DEQUANT16_L 4
  478. ;;; %1 dct[y][x]
  479. ;;; %2,%3 dequant_mf[i_mf][y][x]
  480. ;;; m2 i_qbits
  481. %if HIGH_BIT_DEPTH
  482. mova m0, %1
  483. mova m1, %4
  484. pmaddwd m0, %2
  485. pmaddwd m1, %3
  486. pslld m0, xm2
  487. pslld m1, xm2
  488. mova %1, m0
  489. mova %4, m1
  490. %else
  491. mova m0, %2
  492. packssdw m0, %3
  493. %if mmsize==32
  494. vpermq m0, m0, q3120
  495. %endif
  496. pmullw m0, %1
  497. psllw m0, xm2
  498. mova %1, m0
  499. %endif
  500. %endmacro
  501. %macro DEQUANT32_R 4
  502. ;;; %1 dct[y][x]
  503. ;;; %2,%3 dequant_mf[i_mf][y][x]
  504. ;;; m2 -i_qbits
  505. ;;; m3 f
  506. ;;; m4 0
  507. %if HIGH_BIT_DEPTH
  508. mova m0, %1
  509. mova m1, %4
  510. pmadcswd m0, m0, %2, m3
  511. pmadcswd m1, m1, %3, m3
  512. psrad m0, xm2
  513. psrad m1, xm2
  514. mova %1, m0
  515. mova %4, m1
  516. %else
  517. %if mmsize == 32
  518. pmovzxwd m0, %1
  519. pmovzxwd m1, %4
  520. %else
  521. mova m0, %1
  522. punpckhwd m1, m0, m4
  523. punpcklwd m0, m4
  524. %endif
  525. pmadcswd m0, m0, %2, m3
  526. pmadcswd m1, m1, %3, m3
  527. psrad m0, xm2
  528. psrad m1, xm2
  529. packssdw m0, m1
  530. %if mmsize == 32
  531. vpermq m0, m0, q3120
  532. %endif
  533. mova %1, m0
  534. %endif
  535. %endmacro
  536. %macro DEQUANT_LOOP 3
  537. %if 8*(%2-2*%3) > 0
  538. mov t0d, 8*(%2-2*%3)
  539. %%loop:
  540. %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
  541. %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
  542. sub t0d, 16*%3
  543. jge %%loop
  544. RET
  545. %else
  546. %if mmsize < 32
  547. %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
  548. %endif
  549. %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
  550. RET
  551. %endif
  552. %endmacro
  553. %macro DEQUANT16_FLAT 2-5
  554. mova m0, %1
  555. psllw m0, m4
  556. %assign i %0-2
  557. %rep %0-1
  558. %if i
  559. mova m %+ i, [r0+%2]
  560. pmullw m %+ i, m0
  561. %else
  562. pmullw m0, [r0+%2]
  563. %endif
  564. mova [r0+%2], m %+ i
  565. %assign i i-1
  566. %rotate 1
  567. %endrep
  568. %endmacro
  569. %if ARCH_X86_64
  570. DECLARE_REG_TMP 6,3,2
  571. %else
  572. DECLARE_REG_TMP 2,0,1
  573. %endif
  574. %macro DEQUANT_START 2
  575. movifnidn t2d, r2m
  576. imul t0d, t2d, 0x2b
  577. shr t0d, 8 ; i_qbits = i_qp / 6
  578. lea t1d, [t0*5]
  579. sub t2d, t0d
  580. sub t2d, t1d ; i_mf = i_qp % 6
  581. shl t2d, %1
  582. %if ARCH_X86_64
  583. add r1, t2 ; dequant_mf[i_mf]
  584. %else
  585. add r1, r1mp ; dequant_mf[i_mf]
  586. mov r0, r0mp ; dct
  587. %endif
  588. sub t0d, %2
  589. jl .rshift32 ; negative qbits => rightshift
  590. %endmacro
  591. ;-----------------------------------------------------------------------------
  592. ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
  593. ;-----------------------------------------------------------------------------
  594. %macro DEQUANT 3
  595. cglobal dequant_%1x%1, 0,3,6
  596. .skip_prologue:
  597. DEQUANT_START %2+2, %2
  598. .lshift:
  599. movd xm2, t0d
  600. DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
  601. .rshift32:
  602. neg t0d
  603. mova m3, [pd_1]
  604. movd xm2, t0d
  605. pslld m3, xm2
  606. pxor m4, m4
  607. psrld m3, 1
  608. DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
  609. %if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
  610. cglobal dequant_%1x%1_flat16, 0,3
  611. movifnidn t2d, r2m
  612. %if %1 == 8
  613. cmp t2d, 12
  614. jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
  615. sub t2d, 12
  616. %endif
  617. imul t0d, t2d, 0x2b
  618. shr t0d, 8 ; i_qbits = i_qp / 6
  619. lea t1d, [t0*5]
  620. sub t2d, t0d
  621. sub t2d, t1d ; i_mf = i_qp % 6
  622. shl t2d, %2
  623. %ifdef PIC
  624. lea r1, [dequant%1_scale]
  625. add r1, t2
  626. %else
  627. lea r1, [dequant%1_scale + t2]
  628. %endif
  629. movifnidn r0, r0mp
  630. movd xm4, t0d
  631. %if %1 == 4
  632. %if mmsize == 8
  633. DEQUANT16_FLAT [r1], 0, 16
  634. DEQUANT16_FLAT [r1+8], 8, 24
  635. %elif mmsize == 16
  636. DEQUANT16_FLAT [r1], 0, 16
  637. %else
  638. vbroadcasti128 m0, [r1]
  639. psllw m0, xm4
  640. pmullw m0, [r0]
  641. mova [r0], m0
  642. %endif
  643. %elif mmsize == 8
  644. DEQUANT16_FLAT [r1], 0, 8, 64, 72
  645. DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
  646. DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
  647. DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
  648. %elif mmsize == 16
  649. DEQUANT16_FLAT [r1], 0, 64
  650. DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
  651. DEQUANT16_FLAT [r1+32], 32, 96
  652. %else
  653. mova m1, [r1+ 0]
  654. mova m2, [r1+32]
  655. psllw m1, xm4
  656. psllw m2, xm4
  657. pmullw m0, m1, [r0+ 0]
  658. pmullw m3, m2, [r0+32]
  659. pmullw m4, m1, [r0+64]
  660. pmullw m5, m2, [r0+96]
  661. mova [r0+ 0], m0
  662. mova [r0+32], m3
  663. mova [r0+64], m4
  664. mova [r0+96], m5
  665. %endif
  666. RET
  667. %endif ; !HIGH_BIT_DEPTH && !AVX
  668. %endmacro ; DEQUANT
  669. %if HIGH_BIT_DEPTH
  670. INIT_XMM sse2
  671. DEQUANT 4, 4, 2
  672. DEQUANT 8, 6, 2
  673. INIT_XMM xop
  674. DEQUANT 4, 4, 2
  675. DEQUANT 8, 6, 2
  676. INIT_YMM avx2
  677. DEQUANT 4, 4, 4
  678. DEQUANT 8, 6, 4
  679. %else
  680. %if ARCH_X86_64 == 0
  681. INIT_MMX mmx
  682. DEQUANT 4, 4, 1
  683. DEQUANT 8, 6, 1
  684. %endif
  685. INIT_XMM sse2
  686. DEQUANT 4, 4, 2
  687. DEQUANT 8, 6, 2
  688. INIT_XMM avx
  689. DEQUANT 4, 4, 2
  690. DEQUANT 8, 6, 2
  691. INIT_XMM xop
  692. DEQUANT 4, 4, 2
  693. DEQUANT 8, 6, 2
  694. INIT_YMM avx2
  695. DEQUANT 4, 4, 4
  696. DEQUANT 8, 6, 4
  697. %endif
  698. %macro DEQUANT_START_AVX512 1-2 0 ; shift, flat
  699. %if %2 == 0
  700. movifnidn t2d, r2m
  701. %endif
  702. imul t0d, t2d, 0x2b
  703. shr t0d, 8 ; i_qbits = i_qp / 6
  704. lea t1d, [t0*5]
  705. sub t2d, t0d
  706. sub t2d, t1d ; i_mf = i_qp % 6
  707. shl t2d, %1
  708. %if %2
  709. %ifdef PIC
  710. %define dmf r1+t2
  711. lea r1, [dequant8_scale]
  712. %else
  713. %define dmf t2+dequant8_scale
  714. %endif
  715. %elif ARCH_X86_64
  716. %define dmf r1+t2
  717. %else
  718. %define dmf r1
  719. add r1, r1mp ; dequant_mf[i_mf]
  720. %endif
  721. movifnidn r0, r0mp
  722. %endmacro
  723. INIT_ZMM avx512
  724. cglobal dequant_4x4, 0,3
  725. DEQUANT_START_AVX512 6
  726. mova m0, [dmf]
  727. %if HIGH_BIT_DEPTH
  728. pmaddwd m0, [r0]
  729. %endif
  730. sub t0d, 4
  731. jl .rshift
  732. %if HIGH_BIT_DEPTH
  733. vpbroadcastd m1, t0d
  734. vpsllvd m0, m1
  735. mova [r0], m0
  736. %else
  737. vpbroadcastw ym1, t0d
  738. vpmovsdw ym0, m0
  739. pmullw ym0, [r0]
  740. vpsllvw ym0, ym1
  741. mova [r0], ym0
  742. %endif
  743. RET
  744. .rshift:
  745. %if HIGH_BIT_DEPTH == 0
  746. pmovzxwd m1, [r0]
  747. pmaddwd m0, m1
  748. %endif
  749. mov r1d, 1<<31
  750. shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
  751. neg t0d
  752. vpbroadcastd m1, r1d
  753. vpbroadcastd m2, t0d
  754. paddd m0, m1
  755. vpsravd m0, m2
  756. %if HIGH_BIT_DEPTH
  757. mova [r0], m0
  758. %else
  759. vpmovsdw [r0], m0
  760. %endif
  761. RET
  762. cglobal dequant_8x8, 0,3
  763. DEQUANT_START_AVX512 8
  764. mova m0, [dmf+0*64]
  765. mova m1, [dmf+1*64]
  766. mova m2, [dmf+2*64]
  767. mova m3, [dmf+3*64]
  768. %if HIGH_BIT_DEPTH
  769. pmaddwd m0, [r0+0*64]
  770. pmaddwd m1, [r0+1*64]
  771. pmaddwd m2, [r0+2*64]
  772. pmaddwd m3, [r0+3*64]
  773. %else
  774. mova m6, [dequant_shuf_avx512]
  775. %endif
  776. sub t0d, 6
  777. jl .rshift
  778. %if HIGH_BIT_DEPTH
  779. vpbroadcastd m4, t0d
  780. vpsllvd m0, m4
  781. vpsllvd m1, m4
  782. vpsllvd m2, m4
  783. vpsllvd m3, m4
  784. jmp .end
  785. .rshift:
  786. %else
  787. vpbroadcastw m4, t0d
  788. vpermt2w m0, m6, m1
  789. vpermt2w m2, m6, m3
  790. pmullw m0, [r0]
  791. pmullw m2, [r0+64]
  792. vpsllvw m0, m4
  793. vpsllvw m2, m4
  794. mova [r0], m0
  795. mova [r0+64], m2
  796. RET
  797. .rshift:
  798. pmovzxwd m4, [r0+0*32]
  799. pmovzxwd m5, [r0+1*32]
  800. pmaddwd m0, m4
  801. pmaddwd m1, m5
  802. pmovzxwd m4, [r0+2*32]
  803. pmovzxwd m5, [r0+3*32]
  804. pmaddwd m2, m4
  805. pmaddwd m3, m5
  806. %endif
  807. mov r1d, 1<<31
  808. shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
  809. neg t0d
  810. vpbroadcastd m4, r1d
  811. vpbroadcastd m5, t0d
  812. paddd m0, m4
  813. paddd m1, m4
  814. vpsravd m0, m5
  815. vpsravd m1, m5
  816. paddd m2, m4
  817. paddd m3, m4
  818. vpsravd m2, m5
  819. vpsravd m3, m5
  820. %if HIGH_BIT_DEPTH
  821. .end:
  822. mova [r0+0*64], m0
  823. mova [r0+1*64], m1
  824. mova [r0+2*64], m2
  825. mova [r0+3*64], m3
  826. %else
  827. vpermt2w m0, m6, m1
  828. vpermt2w m2, m6, m3
  829. mova [r0], m0
  830. mova [r0+64], m2
  831. %endif
  832. RET
  833. %if HIGH_BIT_DEPTH == 0
  834. cglobal dequant_8x8_flat16, 0,3
  835. movifnidn t2d, r2m
  836. cmp t2d, 12
  837. jl dequant_8x8_avx512
  838. sub t2d, 12
  839. DEQUANT_START_AVX512 6, 1
  840. vpbroadcastw m0, t0d
  841. mova m1, [dmf]
  842. vpsllvw m1, m0
  843. pmullw m0, m1, [r0]
  844. pmullw m1, [r0+64]
  845. mova [r0], m0
  846. mova [r0+64], m1
  847. RET
  848. %endif
  849. %undef dmf
  850. %macro DEQUANT_DC 2
  851. cglobal dequant_4x4dc, 0,3,6
  852. DEQUANT_START 6, 6
  853. .lshift:
  854. %if cpuflag(avx2)
  855. vpbroadcastdct m3, [r1]
  856. %else
  857. movd xm3, [r1]
  858. SPLAT%1 m3, xm3
  859. %endif
  860. movd xm2, t0d
  861. pslld m3, xm2
  862. %assign %%x 0
  863. %rep SIZEOF_PIXEL*32/mmsize
  864. %2 m0, m3, [r0+%%x]
  865. mova [r0+%%x], m0
  866. %assign %%x %%x+mmsize
  867. %endrep
  868. RET
  869. .rshift32:
  870. neg t0d
  871. %if cpuflag(avx2)
  872. vpbroadcastdct m2, [r1]
  873. %else
  874. movd xm2, [r1]
  875. %endif
  876. mova m5, [p%1_1]
  877. movd xm3, t0d
  878. pslld m4, m5, xm3
  879. psrld m4, 1
  880. %if HIGH_BIT_DEPTH
  881. %if notcpuflag(avx2)
  882. pshufd m2, m2, 0
  883. %endif
  884. %assign %%x 0
  885. %rep SIZEOF_PIXEL*32/mmsize
  886. pmadcswd m0, m2, [r0+%%x], m4
  887. psrad m0, xm3
  888. mova [r0+%%x], m0
  889. %assign %%x %%x+mmsize
  890. %endrep
  891. %else ; !HIGH_BIT_DEPTH
  892. %if notcpuflag(avx2)
  893. PSHUFLW m2, m2, 0
  894. %endif
  895. punpcklwd m2, m4
  896. %assign %%x 0
  897. %rep SIZEOF_PIXEL*32/mmsize
  898. mova m0, [r0+%%x]
  899. punpckhwd m1, m0, m5
  900. punpcklwd m0, m5
  901. pmaddwd m0, m2
  902. pmaddwd m1, m2
  903. psrad m0, xm3
  904. psrad m1, xm3
  905. packssdw m0, m1
  906. mova [r0+%%x], m0
  907. %assign %%x %%x+mmsize
  908. %endrep
  909. %endif ; !HIGH_BIT_DEPTH
  910. RET
  911. %endmacro
  912. %if HIGH_BIT_DEPTH
  913. INIT_XMM sse2
  914. DEQUANT_DC d, pmaddwd
  915. INIT_XMM xop
  916. DEQUANT_DC d, pmaddwd
  917. INIT_YMM avx2
  918. DEQUANT_DC d, pmaddwd
  919. %else
  920. %if ARCH_X86_64 == 0
  921. INIT_MMX mmx2
  922. DEQUANT_DC w, pmullw
  923. %endif
  924. INIT_XMM sse2
  925. DEQUANT_DC w, pmullw
  926. INIT_XMM avx
  927. DEQUANT_DC w, pmullw
  928. INIT_YMM avx2
  929. DEQUANT_DC w, pmullw
  930. %endif
  931. %macro PEXTRW 4
  932. %if cpuflag(sse4)
  933. pextrw %1, %2, %3
  934. %else
  935. ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
  936. %if %3
  937. pextrw %4d, %2, %3
  938. %else
  939. movd %4d, %2
  940. %endif
  941. mov %1, %4w
  942. %endif
  943. %endmacro
  944. ;-----------------------------------------------------------------------------
  945. ; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
  946. ; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
  947. ;-----------------------------------------------------------------------------
  948. %macro DEQUANT_2x4_DC 1
  949. %ifidn %1, dconly
  950. DECLARE_REG_TMP 6,3,2
  951. %define %%args dct, dmf, qp
  952. %else
  953. DECLARE_REG_TMP 6,4,3
  954. %define %%args dct, dct4x4, dmf, qp
  955. %endif
  956. %if ARCH_X86_64 == 0
  957. DECLARE_REG_TMP 2,0,1
  958. %endif
  959. cglobal idct_dequant_2x4_%1, 0,3,5, %%args
  960. movifnidn t2d, qpm
  961. imul t0d, t2d, 0x2b
  962. shr t0d, 8 ; qp / 6
  963. lea t1d, [t0*5]
  964. sub t2d, t0d
  965. sub t2d, t1d ; qp % 6
  966. shl t2d, 6 ; 16 * sizeof(int)
  967. %if ARCH_X86_64
  968. imul t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
  969. %else
  970. mov dctq, dctmp
  971. add t2, dmfmp
  972. imul t2d, [t2], -0xffff
  973. %endif
  974. %if HIGH_BIT_DEPTH
  975. mova m0, [dctq]
  976. mova m1, [dctq+16]
  977. SUMSUB_BA d, 1, 0, 2 ; 16-bit intermediate precision is enough for the first two sumsub steps,
  978. packssdw m1, m0 ; and by packing to words we can use pmaddwd instead of pmulld later.
  979. %else
  980. movq m0, [dctq]
  981. movq m1, [dctq+8]
  982. SUMSUB_BA w, 1, 0, 2
  983. punpcklqdq m1, m0 ; a0 a1 a2 a3 a4 a5 a6 a7
  984. %endif
  985. pshufd m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
  986. movd m3, t2d
  987. pshuflw m3, m3, q1000 ; + + + -
  988. SUMSUB_BA w, 0, 1, 2
  989. punpcklqdq m3, m3 ; + + + - + + + -
  990. pshufd m1, m1, q0022
  991. sub t0d, 6
  992. jl .rshift
  993. movd m2, t0d
  994. psllw m3, m2
  995. pmaddwd m0, m3
  996. pmaddwd m1, m3
  997. jmp .end
  998. .rshift:
  999. neg t0d
  1000. movd m2, t0d
  1001. pcmpeqd m4, m4
  1002. pmaddwd m0, m3
  1003. pmaddwd m1, m3
  1004. pslld m4, m2
  1005. psrad m4, 1
  1006. psubd m0, m4 ; + 1 << (qp/6-1)
  1007. psubd m1, m4
  1008. psrad m0, m2
  1009. psrad m1, m2
  1010. .end:
  1011. %ifidn %1, dconly
  1012. %if HIGH_BIT_DEPTH
  1013. mova [dctq], m0
  1014. mova [dctq+16], m1
  1015. %else
  1016. packssdw m0, m1
  1017. mova [dctq], m0
  1018. %endif
  1019. %else
  1020. movifnidn dct4x4q, dct4x4mp
  1021. %if HIGH_BIT_DEPTH
  1022. movd [dct4x4q+0*64], m0
  1023. %if cpuflag(sse4)
  1024. pextrd [dct4x4q+1*64], m0, 1
  1025. add dct4x4q, 4*64
  1026. pextrd [dct4x4q-2*64], m0, 2
  1027. pextrd [dct4x4q-1*64], m0, 3
  1028. movd [dct4x4q+0*64], m1
  1029. pextrd [dct4x4q+1*64], m1, 1
  1030. pextrd [dct4x4q+2*64], m1, 2
  1031. pextrd [dct4x4q+3*64], m1, 3
  1032. %else
  1033. MOVHL m2, m0
  1034. psrlq m0, 32
  1035. movd [dct4x4q+1*64], m0
  1036. add dct4x4q, 4*64
  1037. movd [dct4x4q-2*64], m2
  1038. psrlq m2, 32
  1039. movd [dct4x4q-1*64], m2
  1040. movd [dct4x4q+0*64], m1
  1041. MOVHL m2, m1
  1042. psrlq m1, 32
  1043. movd [dct4x4q+1*64], m1
  1044. movd [dct4x4q+2*64], m2
  1045. psrlq m2, 32
  1046. movd [dct4x4q+3*64], m2
  1047. %endif
  1048. %else
  1049. PEXTRW [dct4x4q+0*32], m0, 0, eax
  1050. PEXTRW [dct4x4q+1*32], m0, 2, eax
  1051. PEXTRW [dct4x4q+2*32], m0, 4, eax
  1052. PEXTRW [dct4x4q+3*32], m0, 6, eax
  1053. add dct4x4q, 4*32
  1054. PEXTRW [dct4x4q+0*32], m1, 0, eax
  1055. PEXTRW [dct4x4q+1*32], m1, 2, eax
  1056. PEXTRW [dct4x4q+2*32], m1, 4, eax
  1057. PEXTRW [dct4x4q+3*32], m1, 6, eax
  1058. %endif
  1059. %endif
  1060. RET
  1061. %endmacro
  1062. ; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
  1063. INIT_XMM sse2
  1064. DEQUANT_2x4_DC dc
  1065. DEQUANT_2x4_DC dconly
  1066. INIT_XMM avx
  1067. DEQUANT_2x4_DC dc
  1068. DEQUANT_2x4_DC dconly
  1069. ; t4 is eax for return value.
  1070. %if ARCH_X86_64
  1071. DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
  1072. %else
  1073. DECLARE_REG_TMP 4,1,2,3,0,5
  1074. %endif
  1075. ;-----------------------------------------------------------------------------
  1076. ; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
  1077. ;-----------------------------------------------------------------------------
  1078. %macro OPTIMIZE_CHROMA_2x2_DC 0
  1079. cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7
  1080. movifnidn t0, r0mp
  1081. movd m2, r1m
  1082. movq m1, [t0]
  1083. %if cpuflag(sse4)
  1084. pcmpeqb m4, m4
  1085. pslld m4, 11
  1086. %else
  1087. pxor m4, m4
  1088. %endif
  1089. %if cpuflag(ssse3)
  1090. mova m3, [chroma_dc_dct_mask]
  1091. mova m5, [chroma_dc_dmf_mask]
  1092. %else
  1093. mova m3, [chroma_dc_dct_mask_mmx]
  1094. mova m5, [chroma_dc_dmf_mask_mmx]
  1095. %endif
  1096. pshuflw m2, m2, 0
  1097. pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
  1098. punpcklqdq m2, m2
  1099. punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
  1100. mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
  1101. PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
  1102. PSIGNW m2, m5 ; + - - + - - + +
  1103. paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
  1104. pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
  1105. punpcklwd m1, m1
  1106. psrad m2, 16 ; + - - +
  1107. mov t1d, 3
  1108. paddd m0, m6
  1109. xor t4d, t4d
  1110. %if notcpuflag(ssse3)
  1111. psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
  1112. %endif
  1113. %if cpuflag(sse4)
  1114. ptest m0, m4
  1115. %else
  1116. mova m6, m0
  1117. SWAP 0, 6
  1118. psrad m6, 11
  1119. pcmpeqd m6, m4
  1120. pmovmskb t5d, m6
  1121. cmp t5d, 0xffff
  1122. %endif
  1123. jz .ret ; if the DC coefficients already round to zero, terminate early
  1124. mova m3, m0
  1125. .outer_loop:
  1126. movsx t3d, word [t0+2*t1] ; dct[coeff]
  1127. pshufd m6, m1, q3333
  1128. pshufd m1, m1, q2100 ; move the next element to high dword
  1129. PSIGND m5, m2, m6
  1130. test t3d, t3d
  1131. jz .loop_end
  1132. .outer_loop_0:
  1133. mov t2d, t3d
  1134. sar t3d, 31
  1135. or t3d, 1
  1136. .inner_loop:
  1137. psubd m3, m5 ; coeff -= sign
  1138. pxor m6, m0, m3
  1139. %if cpuflag(sse4)
  1140. ptest m6, m4
  1141. %else
  1142. psrad m6, 11
  1143. pcmpeqd m6, m4
  1144. pmovmskb t5d, m6
  1145. cmp t5d, 0xffff
  1146. %endif
  1147. jz .round_coeff
  1148. paddd m3, m5 ; coeff += sign
  1149. mov t4d, 1
  1150. .loop_end:
  1151. dec t1d
  1152. jz .last_coeff
  1153. pshufd m2, m2, q1320 ; - + - + / - - + +
  1154. jg .outer_loop
  1155. .ret:
  1156. REP_RET
  1157. .round_coeff:
  1158. sub t2d, t3d
  1159. mov [t0+2*t1], t2w
  1160. jnz .inner_loop
  1161. jmp .loop_end
  1162. .last_coeff:
  1163. movsx t3d, word [t0]
  1164. punpcklqdq m2, m2 ; + + + +
  1165. PSIGND m5, m2, m1
  1166. test t3d, t3d
  1167. jnz .outer_loop_0
  1168. RET
  1169. %endmacro
  1170. %if HIGH_BIT_DEPTH == 0
  1171. INIT_XMM sse2
  1172. OPTIMIZE_CHROMA_2x2_DC
  1173. INIT_XMM ssse3
  1174. OPTIMIZE_CHROMA_2x2_DC
  1175. INIT_XMM sse4
  1176. OPTIMIZE_CHROMA_2x2_DC
  1177. INIT_XMM avx
  1178. OPTIMIZE_CHROMA_2x2_DC
  1179. %endif ; !HIGH_BIT_DEPTH
  1180. %if HIGH_BIT_DEPTH
  1181. ;-----------------------------------------------------------------------------
  1182. ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
  1183. ;-----------------------------------------------------------------------------
  1184. %macro DENOISE_DCT 0
  1185. cglobal denoise_dct, 4,4,6
  1186. pxor m5, m5
  1187. movsxdifnidn r3, r3d
  1188. .loop:
  1189. mova m2, [r0+r3*4-2*mmsize]
  1190. mova m3, [r0+r3*4-1*mmsize]
  1191. ABSD m0, m2
  1192. ABSD m1, m3
  1193. paddd m4, m0, [r1+r3*4-2*mmsize]
  1194. psubd m0, [r2+r3*4-2*mmsize]
  1195. mova [r1+r3*4-2*mmsize], m4
  1196. paddd m4, m1, [r1+r3*4-1*mmsize]
  1197. psubd m1, [r2+r3*4-1*mmsize]
  1198. mova [r1+r3*4-1*mmsize], m4
  1199. pcmpgtd m4, m0, m5
  1200. pand m0, m4
  1201. pcmpgtd m4, m1, m5
  1202. pand m1, m4
  1203. PSIGND m0, m2
  1204. PSIGND m1, m3
  1205. mova [r0+r3*4-2*mmsize], m0
  1206. mova [r0+r3*4-1*mmsize], m1
  1207. sub r3d, mmsize/2
  1208. jg .loop
  1209. RET
  1210. %endmacro
  1211. %if ARCH_X86_64 == 0
  1212. INIT_MMX mmx
  1213. DENOISE_DCT
  1214. %endif
  1215. INIT_XMM sse2
  1216. DENOISE_DCT
  1217. INIT_XMM ssse3
  1218. DENOISE_DCT
  1219. INIT_XMM avx
  1220. DENOISE_DCT
  1221. INIT_YMM avx2
  1222. DENOISE_DCT
  1223. %else ; !HIGH_BIT_DEPTH
  1224. ;-----------------------------------------------------------------------------
  1225. ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
  1226. ;-----------------------------------------------------------------------------
  1227. %macro DENOISE_DCT 0
  1228. cglobal denoise_dct, 4,4,7
  1229. pxor m6, m6
  1230. movsxdifnidn r3, r3d
  1231. .loop:
  1232. mova m2, [r0+r3*2-2*mmsize]
  1233. mova m3, [r0+r3*2-1*mmsize]
  1234. ABSW m0, m2, sign
  1235. ABSW m1, m3, sign
  1236. psubusw m4, m0, [r2+r3*2-2*mmsize]
  1237. psubusw m5, m1, [r2+r3*2-1*mmsize]
  1238. PSIGNW m4, m2
  1239. PSIGNW m5, m3
  1240. mova [r0+r3*2-2*mmsize], m4
  1241. mova [r0+r3*2-1*mmsize], m5
  1242. punpcklwd m2, m0, m6
  1243. punpcklwd m3, m1, m6
  1244. punpckhwd m0, m6
  1245. punpckhwd m1, m6
  1246. paddd m2, [r1+r3*4-4*mmsize]
  1247. paddd m0, [r1+r3*4-3*mmsize]
  1248. paddd m3, [r1+r3*4-2*mmsize]
  1249. paddd m1, [r1+r3*4-1*mmsize]
  1250. mova [r1+r3*4-4*mmsize], m2
  1251. mova [r1+r3*4-3*mmsize], m0
  1252. mova [r1+r3*4-2*mmsize], m3
  1253. mova [r1+r3*4-1*mmsize], m1
  1254. sub r3, mmsize
  1255. jg .loop
  1256. RET
  1257. %endmacro
  1258. %if ARCH_X86_64 == 0
  1259. INIT_MMX mmx
  1260. DENOISE_DCT
  1261. %endif
  1262. INIT_XMM sse2
  1263. DENOISE_DCT
  1264. INIT_XMM ssse3
  1265. DENOISE_DCT
  1266. INIT_XMM avx
  1267. DENOISE_DCT
  1268. INIT_YMM avx2
  1269. cglobal denoise_dct, 4,4,4
  1270. pxor m3, m3
  1271. movsxdifnidn r3, r3d
  1272. .loop:
  1273. mova m1, [r0+r3*2-mmsize]
  1274. pabsw m0, m1
  1275. psubusw m2, m0, [r2+r3*2-mmsize]
  1276. vpermq m0, m0, q3120
  1277. psignw m2, m1
  1278. mova [r0+r3*2-mmsize], m2
  1279. punpcklwd m1, m0, m3
  1280. punpckhwd m0, m3
  1281. paddd m1, [r1+r3*4-2*mmsize]
  1282. paddd m0, [r1+r3*4-1*mmsize]
  1283. mova [r1+r3*4-2*mmsize], m1
  1284. mova [r1+r3*4-1*mmsize], m0
  1285. sub r3, mmsize/2
  1286. jg .loop
  1287. RET
  1288. %endif ; !HIGH_BIT_DEPTH
  1289. ;-----------------------------------------------------------------------------
  1290. ; int decimate_score( dctcoef *dct )
  1291. ;-----------------------------------------------------------------------------
  1292. %macro DECIMATE_MASK 4
  1293. %if HIGH_BIT_DEPTH
  1294. mova m0, [%3+0*16]
  1295. packssdw m0, [%3+1*16]
  1296. mova m1, [%3+2*16]
  1297. packssdw m1, [%3+3*16]
  1298. ABSW2 m0, m1, m0, m1, m3, m4
  1299. %else
  1300. ABSW m0, [%3+ 0], m3
  1301. ABSW m1, [%3+16], m4
  1302. %endif
  1303. packsswb m0, m1
  1304. pxor m2, m2
  1305. pcmpeqb m2, m0
  1306. pcmpgtb m0, %4
  1307. pmovmskb %1, m2
  1308. pmovmskb %2, m0
  1309. %endmacro
  1310. %macro DECIMATE_MASK16_AVX512 0
  1311. mova m0, [r0]
  1312. %if HIGH_BIT_DEPTH
  1313. vptestmd k0, m0, m0
  1314. pabsd m0, m0
  1315. vpcmpud k1, m0, [pd_1] {1to16}, 6
  1316. %else
  1317. vptestmw k0, m0, m0
  1318. pabsw m0, m0
  1319. vpcmpuw k1, m0, [pw_1], 6
  1320. %endif
  1321. %endmacro
  1322. %macro SHRX 2
  1323. %if cpuflag(bmi2)
  1324. shrx %1, %1, %2
  1325. %else
  1326. shr %1, %2b ; %2 has to be rcx/ecx
  1327. %endif
  1328. %endmacro
  1329. %macro BLSR 2
  1330. %if cpuflag(bmi1)
  1331. blsr %1, %2
  1332. %else
  1333. lea %1, [%2-1]
  1334. and %1, %2
  1335. %endif
  1336. %endmacro
  1337. cextern_common decimate_table4
  1338. cextern_common decimate_table8
  1339. %macro DECIMATE4x4 1
  1340. cglobal decimate_score%1, 1,3
  1341. %if cpuflag(avx512)
  1342. DECIMATE_MASK16_AVX512
  1343. xor eax, eax
  1344. kmovw edx, k0
  1345. %if %1 == 15
  1346. shr edx, 1
  1347. %else
  1348. test edx, edx
  1349. %endif
  1350. jz .ret
  1351. ktestw k1, k1
  1352. jnz .ret9
  1353. %else
  1354. DECIMATE_MASK edx, eax, r0, [pb_1]
  1355. xor edx, 0xffff
  1356. jz .ret
  1357. test eax, eax
  1358. jnz .ret9
  1359. %if %1 == 15
  1360. shr edx, 1
  1361. %endif
  1362. %endif
  1363. %ifdef PIC
  1364. lea r4, [decimate_mask_table4]
  1365. %define mask_table r4
  1366. %else
  1367. %define mask_table decimate_mask_table4
  1368. %endif
  1369. movzx ecx, dl
  1370. movzx eax, byte [mask_table + rcx]
  1371. %if ARCH_X86_64
  1372. xor edx, ecx
  1373. jz .ret
  1374. %if cpuflag(lzcnt)
  1375. lzcnt ecx, ecx
  1376. lea r5, [decimate_table4-32]
  1377. add r5, rcx
  1378. %else
  1379. bsr ecx, ecx
  1380. lea r5, [decimate_table4-1]
  1381. sub r5, rcx
  1382. %endif
  1383. %define table r5
  1384. %else
  1385. cmp edx, ecx
  1386. jz .ret
  1387. bsr ecx, ecx
  1388. shr edx, 1
  1389. SHRX edx, ecx
  1390. %define table decimate_table4
  1391. %endif
  1392. tzcnt ecx, edx
  1393. shr edx, 1
  1394. SHRX edx, ecx
  1395. add al, byte [table + rcx]
  1396. add al, byte [mask_table + rdx]
  1397. .ret:
  1398. REP_RET
  1399. .ret9:
  1400. mov eax, 9
  1401. RET
  1402. %endmacro
  1403. %macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high
  1404. mova m0, [r0+0*32]
  1405. packsswb m0, [r0+1*32]
  1406. mova m1, [r0+2*32]
  1407. packsswb m1, [r0+3*32]
  1408. mova m4, [pb_1]
  1409. pabsb m2, m0
  1410. pabsb m3, m1
  1411. por m2, m3 ; the > 1 checks don't care about order, so
  1412. ptest m4, m2 ; we can save latency by doing them here
  1413. jnc .ret9
  1414. vpermq m0, m0, q3120
  1415. vpermq m1, m1, q3120
  1416. pxor m4, m4
  1417. pcmpeqb m0, m4
  1418. pcmpeqb m1, m4
  1419. pmovmskb %1, m0
  1420. pmovmskb %2, m1
  1421. %endmacro
  1422. %macro DECIMATE_MASK64_AVX512 0
  1423. mova m0, [r0]
  1424. %if HIGH_BIT_DEPTH
  1425. packssdw m0, [r0+1*64]
  1426. mova m1, [r0+2*64]
  1427. packssdw m1, [r0+3*64]
  1428. packsswb m0, m1
  1429. vbroadcasti32x4 m1, [pb_1]
  1430. pabsb m2, m0
  1431. vpcmpub k0, m2, m1, 6
  1432. ktestq k0, k0
  1433. jnz .ret9
  1434. mova m1, [decimate_shuf_avx512]
  1435. vpermd m0, m1, m0
  1436. vptestmb k1, m0, m0
  1437. %else
  1438. mova m1, [r0+64]
  1439. vbroadcasti32x4 m3, [pb_1]
  1440. packsswb m2, m0, m1
  1441. pabsb m2, m2
  1442. vpcmpub k0, m2, m3, 6
  1443. ktestq k0, k0
  1444. jnz .ret9
  1445. vptestmw k1, m0, m0
  1446. vptestmw k2, m1, m1
  1447. %endif
  1448. %endmacro
  1449. %macro DECIMATE8x8 0
  1450. %if ARCH_X86_64
  1451. cglobal decimate_score64, 1,5
  1452. %if mmsize == 64
  1453. DECIMATE_MASK64_AVX512
  1454. xor eax, eax
  1455. %if HIGH_BIT_DEPTH
  1456. kmovq r1, k1
  1457. test r1, r1
  1458. jz .ret
  1459. %else
  1460. kortestd k1, k2
  1461. jz .ret
  1462. kunpckdq k1, k2, k1
  1463. kmovq r1, k1
  1464. %endif
  1465. %elif mmsize == 32
  1466. DECIMATE_MASK64_AVX2 r1d, eax
  1467. not r1
  1468. shl rax, 32
  1469. xor r1, rax
  1470. jz .ret
  1471. %else
  1472. mova m5, [pb_1]
  1473. DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5
  1474. test eax, eax
  1475. jnz .ret9
  1476. DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5
  1477. shl r2d, 16
  1478. or r1d, r2d
  1479. DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5
  1480. shl r2, 32
  1481. or eax, r3d
  1482. or r1, r2
  1483. DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5
  1484. not r1
  1485. shl r2, 48
  1486. xor r1, r2
  1487. jz .ret
  1488. add eax, r3d
  1489. jnz .ret9
  1490. %endif
  1491. %ifdef PIC
  1492. lea r4, [decimate_table8]
  1493. %define table r4
  1494. %else
  1495. %define table decimate_table8
  1496. %endif
  1497. mov al, -6
  1498. .loop:
  1499. tzcnt rcx, r1
  1500. add al, byte [table + rcx]
  1501. jge .ret9
  1502. shr r1, 1
  1503. SHRX r1, rcx
  1504. %if cpuflag(bmi2)
  1505. test r1, r1
  1506. %endif
  1507. jnz .loop
  1508. add al, 6
  1509. .ret:
  1510. REP_RET
  1511. .ret9:
  1512. mov eax, 9
  1513. RET
  1514. %else ; ARCH
  1515. cglobal decimate_score64, 1,4
  1516. %if mmsize == 64
  1517. DECIMATE_MASK64_AVX512
  1518. xor eax, eax
  1519. %if HIGH_BIT_DEPTH
  1520. kshiftrq k2, k1, 32
  1521. %endif
  1522. kmovd r2, k1
  1523. kmovd r3, k2
  1524. test r2, r2
  1525. jz .tryret
  1526. %elif mmsize == 32
  1527. DECIMATE_MASK64_AVX2 r2, r3
  1528. xor eax, eax
  1529. not r3
  1530. xor r2, -1
  1531. jz .tryret
  1532. %else
  1533. mova m5, [pb_1]
  1534. DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5
  1535. test r1, r1
  1536. jnz .ret9
  1537. DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5
  1538. not r2
  1539. shl r3, 16
  1540. xor r2, r3
  1541. mov r0m, r2
  1542. DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5
  1543. or r2, r1
  1544. DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5
  1545. add r0, r2
  1546. jnz .ret9
  1547. mov r2, r0m
  1548. not r3
  1549. shl r1, 16
  1550. xor r3, r1
  1551. test r2, r2
  1552. jz .tryret
  1553. %endif
  1554. mov al, -6
  1555. .loop:
  1556. tzcnt ecx, r2
  1557. add al, byte [decimate_table8 + ecx]
  1558. jge .ret9
  1559. sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well
  1560. jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31
  1561. shrd r2, r3, cl
  1562. SHRX r3, ecx
  1563. %if notcpuflag(bmi2)
  1564. test r2, r2
  1565. %endif
  1566. jnz .loop
  1567. BLSR r2, r3
  1568. jz .end
  1569. .largerun:
  1570. tzcnt ecx, r3
  1571. shr r3, 1
  1572. SHRX r3, ecx
  1573. .loop2:
  1574. tzcnt ecx, r3
  1575. add al, byte [decimate_table8 + ecx]
  1576. jge .ret9
  1577. shr r3, 1
  1578. SHRX r3, ecx
  1579. .run31:
  1580. test r3, r3
  1581. jnz .loop2
  1582. .end:
  1583. add al, 6
  1584. RET
  1585. .tryret:
  1586. BLSR r2, r3
  1587. jz .ret
  1588. mov al, -6
  1589. jmp .largerun
  1590. .ret9:
  1591. mov eax, 9
  1592. .ret:
  1593. REP_RET
  1594. %endif ; ARCH
  1595. %endmacro
  1596. INIT_XMM sse2
  1597. DECIMATE4x4 15
  1598. DECIMATE4x4 16
  1599. DECIMATE8x8
  1600. INIT_XMM ssse3
  1601. DECIMATE4x4 15
  1602. DECIMATE4x4 16
  1603. DECIMATE8x8
  1604. %if HIGH_BIT_DEPTH
  1605. INIT_ZMM avx512
  1606. %else
  1607. INIT_YMM avx2
  1608. DECIMATE8x8
  1609. INIT_YMM avx512
  1610. %endif
  1611. DECIMATE4x4 15
  1612. DECIMATE4x4 16
  1613. INIT_ZMM avx512
  1614. DECIMATE8x8
  1615. ;-----------------------------------------------------------------------------
  1616. ; int coeff_last( dctcoef *dct )
  1617. ;-----------------------------------------------------------------------------
  1618. %macro BSR 3
  1619. %if cpuflag(lzcnt)
  1620. lzcnt %1, %2
  1621. xor %1, %3
  1622. %else
  1623. bsr %1, %2
  1624. %endif
  1625. %endmacro
  1626. %macro LZCOUNT 3
  1627. %if cpuflag(lzcnt)
  1628. lzcnt %1, %2
  1629. %else
  1630. bsr %1, %2
  1631. xor %1, %3
  1632. %endif
  1633. %endmacro
  1634. %if HIGH_BIT_DEPTH
  1635. %macro LAST_MASK 3-4
  1636. %if %1 == 4
  1637. movq mm0, [%3]
  1638. packssdw mm0, [%3+8]
  1639. packsswb mm0, mm0
  1640. pcmpeqb mm0, mm2
  1641. pmovmskb %2, mm0
  1642. %elif mmsize == 16
  1643. movdqa xmm0, [%3+ 0]
  1644. %if %1 == 8
  1645. packssdw xmm0, [%3+16]
  1646. packsswb xmm0, xmm0
  1647. %else
  1648. movdqa xmm1, [%3+32]
  1649. packssdw xmm0, [%3+16]
  1650. packssdw xmm1, [%3+48]
  1651. packsswb xmm0, xmm1
  1652. %endif
  1653. pcmpeqb xmm0, xmm2
  1654. pmovmskb %2, xmm0
  1655. %elif %1 == 8
  1656. movq mm0, [%3+ 0]
  1657. movq mm1, [%3+16]
  1658. packssdw mm0, [%3+ 8]
  1659. packssdw mm1, [%3+24]
  1660. packsswb mm0, mm1
  1661. pcmpeqb mm0, mm2
  1662. pmovmskb %2, mm0
  1663. %else
  1664. movq mm0, [%3+ 0]
  1665. movq mm1, [%3+16]
  1666. packssdw mm0, [%3+ 8]
  1667. packssdw mm1, [%3+24]
  1668. movq mm3, [%3+32]
  1669. movq mm4, [%3+48]
  1670. packssdw mm3, [%3+40]
  1671. packssdw mm4, [%3+56]
  1672. packsswb mm0, mm1
  1673. packsswb mm3, mm4
  1674. pcmpeqb mm0, mm2
  1675. pcmpeqb mm3, mm2
  1676. pmovmskb %2, mm0
  1677. pmovmskb %4, mm3
  1678. shl %4, 8
  1679. or %2, %4
  1680. %endif
  1681. %endmacro
  1682. %macro COEFF_LAST4 0
  1683. cglobal coeff_last4, 1,3
  1684. pxor mm2, mm2
  1685. LAST_MASK 4, r1d, r0
  1686. xor r1d, 0xff
  1687. shr r1d, 4
  1688. BSR eax, r1d, 0x1f
  1689. RET
  1690. %endmacro
  1691. INIT_MMX mmx2
  1692. COEFF_LAST4
  1693. INIT_MMX lzcnt
  1694. COEFF_LAST4
  1695. %macro COEFF_LAST8 0
  1696. cglobal coeff_last8, 1,3
  1697. pxor m2, m2
  1698. LAST_MASK 8, r1d, r0
  1699. %if mmsize == 16
  1700. xor r1d, 0xffff
  1701. shr r1d, 8
  1702. %else
  1703. xor r1d, 0xff
  1704. %endif
  1705. BSR eax, r1d, 0x1f
  1706. RET
  1707. %endmacro
  1708. %if ARCH_X86_64 == 0
  1709. INIT_MMX mmx2
  1710. COEFF_LAST8
  1711. %endif
  1712. INIT_XMM sse2
  1713. COEFF_LAST8
  1714. INIT_XMM lzcnt
  1715. COEFF_LAST8
  1716. %else ; !HIGH_BIT_DEPTH
  1717. %macro LAST_MASK 3-4
  1718. %if %1 <= 8
  1719. movq mm0, [%3+ 0]
  1720. %if %1 == 4
  1721. packsswb mm0, mm0
  1722. %else
  1723. packsswb mm0, [%3+ 8]
  1724. %endif
  1725. pcmpeqb mm0, mm2
  1726. pmovmskb %2, mm0
  1727. %elif mmsize == 16
  1728. movdqa xmm0, [%3+ 0]
  1729. packsswb xmm0, [%3+16]
  1730. pcmpeqb xmm0, xmm2
  1731. pmovmskb %2, xmm0
  1732. %else
  1733. movq mm0, [%3+ 0]
  1734. movq mm1, [%3+16]
  1735. packsswb mm0, [%3+ 8]
  1736. packsswb mm1, [%3+24]
  1737. pcmpeqb mm0, mm2
  1738. pcmpeqb mm1, mm2
  1739. pmovmskb %2, mm0
  1740. pmovmskb %4, mm1
  1741. shl %4, 8
  1742. or %2, %4
  1743. %endif
  1744. %endmacro
  1745. %macro COEFF_LAST48 0
  1746. %if ARCH_X86_64
  1747. cglobal coeff_last4, 1,1
  1748. BSR rax, [r0], 0x3f
  1749. shr eax, 4
  1750. RET
  1751. %else
  1752. cglobal coeff_last4, 0,3
  1753. mov edx, r0mp
  1754. mov eax, [edx+4]
  1755. xor ecx, ecx
  1756. test eax, eax
  1757. cmovz eax, [edx]
  1758. setnz cl
  1759. BSR eax, eax, 0x1f
  1760. shr eax, 4
  1761. lea eax, [eax+ecx*2]
  1762. RET
  1763. %endif
  1764. cglobal coeff_last8, 1,3
  1765. pxor m2, m2
  1766. LAST_MASK 8, r1d, r0, r2d
  1767. xor r1d, 0xff
  1768. BSR eax, r1d, 0x1f
  1769. RET
  1770. %endmacro
  1771. INIT_MMX mmx2
  1772. COEFF_LAST48
  1773. INIT_MMX lzcnt
  1774. COEFF_LAST48
  1775. %endif ; HIGH_BIT_DEPTH
%macro COEFF_LAST 0
cglobal coeff_last15, 1,3
    pxor m2, m2
    LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
    xor r1d, 0xffff
    BSR eax, r1d, 0x1f
    dec eax
    RET
cglobal coeff_last16, 1,3
    pxor m2, m2
    LAST_MASK 16, r1d, r0, r2d
    xor r1d, 0xffff
    BSR eax, r1d, 0x1f
    RET
%if ARCH_X86_64 == 0
cglobal coeff_last64, 1, 4-mmsize/16
    pxor m2, m2
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
    shl r2d, 16
    or r1d, r2d
    xor r1d, -1
    jne .secondhalf
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
    shl r2d, 16
    or r1d, r2d
    not r1d
    BSR eax, r1d, 0x1f
    RET
.secondhalf:
    BSR eax, r1d, 0x1f
    add eax, 32
    RET
%else
cglobal coeff_last64, 1,3
    pxor m2, m2
    LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
    shl r2d, 16
    or r1d, r2d
    LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
    LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
    shl r0d, 16
    or r2d, r0d
    shl r2, 32
    or r1, r2
    not r1
    BSR rax, r1, 0x3f
    RET
%endif
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST
%endif
INIT_XMM sse2
COEFF_LAST
INIT_XMM lzcnt
COEFF_LAST
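; AVX2: build a 32-coefficient zero mask in a single ymm register. The pack
; instructions operate within 128-bit lanes, so the packed bytes have to be
; put back into coefficient order (vpermd/vpermq) before pmovmskb.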
%macro LAST_MASK_AVX2 2
%if HIGH_BIT_DEPTH
    mova m0, [%2+ 0]
    packssdw m0, [%2+32]
    mova m1, [%2+64]
    packssdw m1, [%2+96]
    packsswb m0, m1
    mova m1, [deinterleave_shufd]
    vpermd m0, m1, m0
%else
    mova m0, [%2+ 0]
    packsswb m0, [%2+32]
    vpermq m0, m0, q3120
%endif
    pcmpeqb m0, m2
    pmovmskb %1, m0
%endmacro
%if ARCH_X86_64 == 0
INIT_YMM avx2
cglobal coeff_last64, 1,2
    pxor m2, m2
    LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
    xor r1d, -1
    jne .secondhalf
    LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
    not r1d
    BSR eax, r1d, 0x1f
    RET
.secondhalf:
    BSR eax, r1d, 0x1f
    add eax, 32
    RET
%else
INIT_YMM avx2
cglobal coeff_last64, 1,3
    pxor m2, m2
    LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
    LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
    shl r2, 32
    or r1, r2
    not r1
    BSR rax, r1, 0x3f
    RET
%endif
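; AVX512: vptestm (or vpcmp against zero) sets one opmask bit per nonzero
; coefficient directly, so the pack/pcmpeqb/invert sequence goes away;
; lzcnt of the mask then yields the last nonzero index.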
%macro COEFF_LAST_AVX512 2 ; num, w/d
cglobal coeff_last%1, 1,2
    mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
    vptestm%2 k0, m0, m0
%if %1 == 15
    mov eax, 30
    kmovw r1d, k0
    lzcnt r1d, r1d
    sub eax, r1d
%else
    kmovw eax, k0
    lzcnt eax, eax
    xor eax, 31
%endif
    RET
%endmacro
%macro COEFF_LAST64_AVX512 1 ; w/d
cglobal coeff_last64, 1,2
    pxor xm0, xm0
    vpcmp%1 k0, m0, [r0+0*64], 4
    vpcmp%1 k1, m0, [r0+1*64], 4
%if HIGH_BIT_DEPTH
    vpcmp%1 k2, m0, [r0+2*64], 4
    vpcmp%1 k3, m0, [r0+3*64], 4
    kunpckwd k0, k1, k0
    kunpckwd k1, k3, k2
%endif
%if ARCH_X86_64
    kunpckdq k0, k1, k0
    kmovq rax, k0
    lzcnt rax, rax
    xor eax, 63
%else
    kmovd r1d, k1
    kmovd eax, k0
    lzcnt r1d, r1d
    lzcnt eax, eax
    xor r1d, 32
    cmovnz eax, r1d
    xor eax, 31
%endif
    RET
%endmacro
%if HIGH_BIT_DEPTH
INIT_XMM avx512
COEFF_LAST_AVX512 4, d
INIT_YMM avx512
COEFF_LAST_AVX512 8, d
INIT_ZMM avx512
COEFF_LAST_AVX512 15, d
COEFF_LAST_AVX512 16, d
COEFF_LAST64_AVX512 d
%else ; !HIGH_BIT_DEPTH
INIT_XMM avx512
COEFF_LAST_AVX512 8, w
INIT_YMM avx512
COEFF_LAST_AVX512 15, w
COEFF_LAST_AVX512 16, w
INIT_ZMM avx512
COEFF_LAST64_AVX512 w
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
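; Scans from the last nonzero coefficient down towards index 0, storing each
; nonzero level in that order in runlevel->level along with the index of the
; last nonzero coefficient (.last) and the nonzero-coefficient bitmask
; (.mask); the count of stored levels is returned in eax (t6).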
struc levelrun
    .last: resd 1
    .mask: resd 1
    align 16, resb 1
    .level: resw 16
endstruc
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%if WIN64
    DECLARE_REG_TMP 3,1,2,0,4,5,6
%elif ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
    DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif
%macro COEFF_LEVELRUN 1
cglobal coeff_level_run%1,0,7
    movifnidn t0, r0mp
    movifnidn t1, r1mp
    pxor m2, m2
    xor t3d, t3d
    LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
%if %1==15
    shr t5d, 1
%elif %1==8
    and t5d, 0xff
%elif %1==4
    and t5d, 0xf
%endif
    xor t5d, (1<<%1)-1
    mov [t1+levelrun.mask], t5d
    shl t5d, 32-%1
    mov t4d, %1-1
    LZCOUNT t3d, t5d, 0x1f
    xor t6d, t6d
    add t5d, t5d
    sub t4d, t3d
    shl t5d, t3b
    mov [t1+levelrun.last], t4d
.loop:
    LZCOUNT t3d, t5d, 0x1f
%if HIGH_BIT_DEPTH
    mov t2d, [t0+t4*4]
%else
    mov t2w, [t0+t4*2]
%endif
    inc t3d
    shl t5d, t3b
%if HIGH_BIT_DEPTH
    mov [t1+t6*4+levelrun.level], t2d
%else
    mov [t1+t6*2+levelrun.level], t2w
%endif
    inc t6d
    sub t4d, t3d
    jge .loop
    RET
%endmacro
INIT_MMX mmx2
%if ARCH_X86_64 == 0
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
INIT_XMM sse2
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX lzcnt
COEFF_LEVELRUN 4
%if HIGH_BIT_DEPTH == 0
COEFF_LEVELRUN 8
%endif
INIT_XMM lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
; Similar to the one above, but saves the DCT
; coefficients in m0/m1 so we don't have to load
; them later.
%macro LAST_MASK_LUT 3
    pxor xm5, xm5
%if %1 <= 8
    mova m0, [%3]
    packsswb m2, m0, m0
%else
    mova xm0, [%3+ 0]
    mova xm1, [%3+16]
    packsswb xm2, xm0, xm1
%if mmsize==32
    vinserti128 m0, m0, xm1, 1
%endif
%endif
    pcmpeqb xm2, xm5
    pmovmskb %2, xm2
%endmacro
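; LUT variant: instead of the scalar bit-scan loop above, each byte of the
; nonzero mask indexes popcnt_table (number of levels that byte contributes)
; and dct_coef_shuffle (a pshufb control that gathers those levels), so the
; levels are written out with a couple of vector stores.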
%macro COEFF_LEVELRUN_LUT 1
cglobal coeff_level_run%1,2,4+(%1/9)
%ifdef PIC
    lea r5, [$$]
    %define GLOBAL +r5-$$
%else
    %define GLOBAL
%endif
    LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
%if %1==15
    shr eax, 1
%elif %1==8
    and eax, 0xff
%elif %1==4
    and eax, 0xf
%endif
    xor eax, (1<<%1)-1
    mov [r1+levelrun.mask], eax
%if %1==15
    add eax, eax
%endif
%if %1 > 8
%if ARCH_X86_64
    mov r4d, eax
    shr r4d, 8
%else
    movzx r4d, ah ; first 8 bits
%endif
%endif
    movzx r2d, al ; second 8 bits
    shl eax, 32-%1-(%1&1)
    LZCOUNT eax, eax, 0x1f
    mov r3d, %1-1
    sub r3d, eax
    mov [r1+levelrun.last], r3d
; Here we abuse pshufb, combined with a lookup table, to do a gather
; operation based on a bitmask. For example:
;
; dct 15-8 (input):  0  0  4  0  0 -2  1  0
; dct 7-0 (input):   0  0 -1  0  0  0  0 15
; bitmask 1:         0  0  1  0  0  1  1  0
; bitmask 2:         0  0  1  0  0  0  0  1
; gather 15-8:       4 -2  1 __ __ __ __ __
; gather 7-0:       -1 15 __ __ __ __ __ __
; levels (output):   4 -2  1 -1 15 __ __ __ __ __ __ __ __ __ __ __
;
; The overlapping, dependent stores almost surely cause a mess of
; forwarding issues, but it's still enormously faster.
%if %1 > 8
    movzx eax, byte [popcnt_table+r4 GLOBAL]
    movzx r3d, byte [popcnt_table+r2 GLOBAL]
%if mmsize==16
    movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
    movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
    mova m4, [pw_256]
; Storing 8 bytes of shuffle constant and converting it (unpack + or)
; is neutral to slightly faster in local speed measurements, but it
; cuts the table size in half, which is surely a big cache win.
    punpcklbw m3, m3
    punpcklbw m2, m2
    por m3, m4
    por m2, m4
    pshufb m1, m3
    pshufb m0, m2
    mova [r1+levelrun.level], m1
; This obnoxious unaligned store messes with store forwarding and
; stalls the CPU to no end, but merging the two registers before
; storing requires a variable 128-bit shift. Emulating this does
; work, but requires a lot of ops and the gain is tiny and
; inconsistent, so we'll err on the side of fewer instructions.
    movu [r1+rax*2+levelrun.level], m0
%else ; mmsize==32
    movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
    vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
    punpcklbw m2, m2
    por m2, [pw_256]
    pshufb m0, m2
    vextracti128 [r1+levelrun.level], m0, 1
    movu [r1+rax*2+levelrun.level], xm0
%endif
    add eax, r3d
%else
    movzx eax, byte [popcnt_table+r2 GLOBAL]
    movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
    punpcklbw m1, m1
    por m1, [pw_256]
    pshufb m0, m1
    mova [r1+levelrun.level], m0
%endif
    RET
%endmacro
%if HIGH_BIT_DEPTH==0
INIT_MMX ssse3
COEFF_LEVELRUN_LUT 4
INIT_XMM ssse3
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
INIT_MMX ssse3, lzcnt
COEFF_LEVELRUN_LUT 4
INIT_XMM ssse3, lzcnt
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
INIT_YMM avx2
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
%endif