;*****************************************************************************
;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2018 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 64

; AVX-512 permutation indices are bit-packed to save cache
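; (each dword/word packs several small index fields; consumers unpack a
; field with a plain psrld/psrlq, which suffices because vpermt2d and
; friends only read the low index bits of each element)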
%if HIGH_BIT_DEPTH
scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3:   4x4_frame
                   dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8:   8x8_frame1
                   dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13:  8x8_frame2
                   dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
                                                                     ; bits 19-23: 8x8_frame4
scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4:   8x8_field1
                   dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9:   8x8_field2
                   dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
                   dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:   interleave1
                   dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9:   interleave2
                   dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
                   dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
dct_avx512:        dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc     bits 5-9: dct8x8_fdec
                   dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec
                   dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2
                   dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
                   dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
                   dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
                   dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1
                   dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2
                   dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
                   dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1
                   dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2
                   dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
                   dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
%endif

pw_ppmmmmpp:    dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea:  SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb:  SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
pb_scan8framet1: SHUFFLE_MASK_W  0,  1,  6,  7,  8,  9, 13, 14
pb_scan8framet2: SHUFFLE_MASK_W  2,  3,  4,  7,  9, 15, 10, 14
pb_scan8framet3: SHUFFLE_MASK_W  0,  1,  5,  6,  8, 11, 12, 13
pb_scan8framet4: SHUFFLE_MASK_W  0,  3,  4,  5,  8, 11, 12, 15
pb_scan8framet5: SHUFFLE_MASK_W  1,  2,  6,  7,  9, 10, 13, 14
pb_scan8framet6: SHUFFLE_MASK_W  0,  3,  4,  5, 10, 11, 12, 15
pb_scan8framet7: SHUFFLE_MASK_W  1,  2,  6,  7,  8,  9, 14, 15
pb_scan8framet8: SHUFFLE_MASK_W  0,  1,  2,  7,  8, 10, 11, 14
pb_scan8framet9: SHUFFLE_MASK_W  1,  4,  5,  7,  8, 13, 14, 15
pb_scan8frame1:  SHUFFLE_MASK_W  0,  8,  1,  2,  9, 12,  4, 13
pb_scan8frame2:  SHUFFLE_MASK_W  4,  0,  1,  5,  8, 10, 12, 14
pb_scan8frame3:  SHUFFLE_MASK_W 12, 10,  8,  6,  2,  3,  7,  9
pb_scan8frame4:  SHUFFLE_MASK_W  0,  1,  8, 12,  4, 13,  9,  2
pb_scan8frame5:  SHUFFLE_MASK_W  5, 14, 10,  3, 11, 15,  6,  7
pb_scan8frame6:  SHUFFLE_MASK_W  6,  8, 12, 13,  9,  7,  5,  3
pb_scan8frame7:  SHUFFLE_MASK_W  1,  3,  5,  7, 10, 14, 15, 11
pb_scan8frame8:  SHUFFLE_MASK_W 10,  3, 11, 14,  5,  6, 15,  7
pb_scan8field1:  SHUFFLE_MASK_W    0,    1,    2,    8,    9,    3,    4,   10
pb_scan8field2a: SHUFFLE_MASK_W 0x80,   11,    5,    6,    7,   12, 0x80, 0x80
pb_scan8field2b: SHUFFLE_MASK_W    0, 0x80, 0x80, 0x80, 0x80, 0x80,    1,    8
pb_scan8field3a: SHUFFLE_MASK_W   10,    5,    6,    7,   11, 0x80, 0x80, 0x80
pb_scan8field3b: SHUFFLE_MASK_W 0x80, 0x80, 0x80, 0x80, 0x80,    1,    8,    2
pb_scan8field4a: SHUFFLE_MASK_W    4,    5,    6,    7,   11, 0x80, 0x80, 0x80
pb_scan8field6:  SHUFFLE_MASK_W    4,    5,    6,    7,   11, 0x80, 0x80,   12
pb_scan8field7:  SHUFFLE_MASK_W    5,    6,    7,   11, 0x80, 0x80,   12,   13

SECTION .text

cextern pw_32_0
cextern pw_32
cextern pw_512
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
cextern pb_1
cextern pw_1
cextern pd_1
cextern pd_32
cextern pw_ppppmmmm
cextern pw_pmpmpmpm
cextern deinterleave_shufd
cextern pb_unpackbd1
cextern pb_unpackbd2
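
; WALSH4_1D: one pass of a 4-point Walsh-Hadamard transform, done as two
; stages of paired sum/difference butterflies plus a SWAP to restore the
; output order. A rough scalar sketch of what one pass computes:
;     out = { a+b+c+d, a+b-c-d, a-b-c+d, a-b+c-d }  (up to lane order)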
%macro WALSH4_1D 6
    SUMSUB_BADC %1, %5, %4, %3, %2, %6
    SUMSUB_BADC %1, %5, %3, %4, %2, %6
    SWAP %2, %5, %4
%endmacro
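
; SUMSUB_17BIT returns the rounded halves (a+b+1)>>1 and (a-b+1)>>1
; without overflowing 16-bit intermediates. The trick: pavgw computes an
; unsigned (x+y+1)>>1, and xoring the inputs with 0x8000 first biases
; signed words into unsigned range, e.g. in scalar terms:
;     res = (((a^0x8000) + (b^0x8000) + 1) >> 1) ^ 0x8000  ; == (a+b+1)>>1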
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
    movq  m%3, m%4
    pxor  m%1, m%4
    psubw m%3, m%2
    pxor  m%2, m%4
    pavgw m%3, m%1
    pavgw m%2, m%1
    pxor  m%3, m%4
    pxor  m%2, m%4
    SWAP %1, %2, %3
%endmacro

%macro DCT_UNPACK 3
    punpcklwd %3, %1
    punpckhwd %2, %1
    psrad %3, 16
    psrad %2, 16
    SWAP %1, %3
%endmacro

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
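; (a 2-D 4x4 Walsh-Hadamard transform of the DC block, with each output
; rounded as (x+1)>>1; the pd_1 added before the second transform pass
; folds that rounding bias in, so a bare psrad by 1 suffices)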
%macro DCT4x4_DC 0
cglobal dct4x4dc, 1,1,5
    mova m0, [r0+ 0]
    mova m1, [r0+16]
    mova m2, [r0+32]
    mova m3, [r0+48]
    WALSH4_1D d, 0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_1]
    WALSH4_1D d, 0,1,2,3,4
    psrad m0, 1
    psrad m1, 1
    psrad m2, 1
    psrad m3, 1
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; DCT4x4_DC

INIT_XMM sse2
DCT4x4_DC
INIT_XMM avx
DCT4x4_DC
%else

INIT_MMX mmx2
cglobal dct4x4dc, 1,1
    movq m3, [r0+24]
    movq m2, [r0+16]
    movq m1, [r0+ 8]
    movq m0, [r0+ 0]
    movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
    WALSH4_1D w, 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 1, 0, 3, 2, 4
    SWAP 0, 1
    SWAP 2, 3
    SUMSUB_17BIT 0,2,4,7
    SUMSUB_17BIT 1,3,5,7
    movq [r0+ 0], m0
    movq [r0+ 8], m2
    movq [r0+16], m3
    movq [r0+24], m1
    RET
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
    mova m3, [r0+48]
    mova m2, [r0+32]
    mova m1, [r0+16]
    mova m0, [r0+ 0]
    WALSH4_1D d,0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    WALSH4_1D d,0,1,2,3,4
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; IDCT4x4DC

INIT_XMM sse2
IDCT4x4DC
INIT_XMM avx
IDCT4x4DC
%else
;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal idct4x4dc, 1,1
    movq m3, [r0+24]
    movq m2, [r0+16]
    movq m1, [r0+ 8]
    movq m0, [r0+ 0]
    WALSH4_1D w,0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D w,0,1,2,3,4
    movq [r0+ 0], m0
    movq [r0+ 8], m1
    movq [r0+16], m2
    movq [r0+24], m3
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
;-----------------------------------------------------------------------------
%if WIN64
DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
%else
DECLARE_REG_TMP 2
%endif
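; (gathers the DC coefficient of each of the eight 4x4 sub-blocks,
; zeroing it in the source as it goes: INSERT_COEFF writes back t0,
; which is cleared up front, then a 2x4 Hadamard is applied)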
%macro INSERT_COEFF 3 ; dst, src, imm
%if %3
%if HIGH_BIT_DEPTH
%if cpuflag(sse4)
    pinsrd %1, %2, %3
%elif %3 == 2
    movd m2, %2
%elif %3 == 1
    punpckldq %1, %2
%else
    punpckldq m2, %2
    punpcklqdq %1, m2
%endif
%else
%if %3 == 2
    punpckldq %1, %2
%else
    pinsrw %1, %2, %3
%endif
%endif
%else
    movd %1, %2
%endif
%if HIGH_BIT_DEPTH
    mov %2, t0d
%else
    mov %2, t0w
%endif
%endmacro

%macro DCT2x4DC 2
cglobal dct2x4dc, 2,3
    xor t0d, t0d
    INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0
    INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2
    add r1, 4*16*SIZEOF_DCTCOEF
    INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1
    INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3
    INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0
    INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2
    INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1
    INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3
    SUMSUB_BA %1, 1, 0, 2
    SBUTTERFLY %2, 1, 0, 2
    SUMSUB_BA %1, 0, 1, 2
    SBUTTERFLY %2, 0, 1, 2
    SUMSUB_BA %1, 1, 0, 2
    pshuf%1 m0, m0, q1032
    mova [r0], m1
    mova [r0+mmsize], m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
DCT2x4DC d, dq
INIT_XMM avx
DCT2x4DC d, dq
%else
INIT_MMX mmx2
DCT2x4DC w, wd
%endif

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
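; A rough C sketch of the per-line butterflies below (reference only,
; variable names are ad hoc):
;     int s03 = a0+a3, s12 = a1+a2, d03 = a0-a3, d12 = a1-a2;
;     out0 = s03+s12;  out1 = 2*d03+d12;
;     out2 = s03-s12;  out3 = d03-2*d12;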
INIT_MMX mmx
cglobal sub4x4_dct, 3,3
.skip_prologue:
    LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 3, 0, 2, 1
    SUMSUB_BA w, 2, 3, 4
    DCT_UNPACK m2, m4, m5
    DCT_UNPACK m3, m6, m7
    mova [r0+ 0], m2 ; s03 + s12
    mova [r0+ 8], m4
    mova [r0+32], m3 ; s03 - s12
    mova [r0+40], m6
    DCT_UNPACK m0, m2, m4
    DCT_UNPACK m1, m3, m5
    SUMSUB2_AB d, 0, 1, 4
    SUMSUB2_AB d, 2, 3, 5
    mova [r0+16], m0 ; d03*2 + d12
    mova [r0+24], m2
    mova [r0+48], m4 ; d03 - 2*d12
    mova [r0+56], m5
    RET
%else

%macro SUB_DCT4 0
cglobal sub4x4_dct, 3,3
.skip_prologue:
%if cpuflag(ssse3)
    mova m5, [hsub_mul]
%endif
    LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    DCT4_1D 0,1,2,3,4
    movq [r0+ 0], m0
    movq [r0+ 8], m1
    movq [r0+16], m2
    movq [r0+24], m3
    RET
%endmacro

INIT_MMX mmx
SUB_DCT4
INIT_MMX ssse3
SUB_DCT4
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
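; (IDCT4_1D applies the standard inverse butterflies with their >>1
; terms; the result is biased by pd_32, shifted down by 6, added to the
; prediction and clipped to pw_pixel_max in STORE_DIFFx2)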
%macro STORE_DIFFx2 6
    psrad %1, 6
    psrad %2, 6
    packssdw %1, %2
    movq %3, %5
    movhps %3, %6
    paddsw %1, %3
    CLIPW %1, %4, [pw_pixel_max]
    movq %5, %1
    movhps %6, %1
%endmacro

%macro ADD4x4_IDCT 0
cglobal add4x4_idct, 2,2,6
    add r0, 2*FDEC_STRIDEB
.skip_prologue:
    mova m1, [r1+16]
    mova m3, [r1+48]
    mova m2, [r1+32]
    mova m0, [r1+ 0]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
    STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
    RET
%endmacro

INIT_XMM sse2
ADD4x4_IDCT
INIT_XMM avx
ADD4x4_IDCT
%else ; !HIGH_BIT_DEPTH

INIT_MMX mmx
cglobal add4x4_idct, 2,2
    pxor m7, m7
.skip_prologue:
    movq m1, [r1+ 8]
    movq m3, [r1+24]
    movq m2, [r1+16]
    movq m0, [r1+ 0]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
    RET

%macro ADD4x4 0
cglobal add4x4_idct, 2,2,6
    mova m1, [r1+0x00] ; row1/row0
    mova m3, [r1+0x10] ; row3/row2
    psraw m0, m1, 1 ; row1>>1/...
    psraw m2, m3, 1 ; row3>>1/...
    movsd m0, m1 ; row1>>1/row0
    movsd m2, m3 ; row3>>1/row2
    psubw m0, m3 ; row1>>1-row3/row0-2
    paddw m2, m1 ; row3>>1+row1/row0+2
    SBUTTERFLY2 wd, 0, 2, 1
    SUMSUB_BA w, 2, 0, 1
    pshuflw m1, m2, q2301
    pshufhw m2, m2, q2301
    punpckldq m1, m0
    punpckhdq m2, m0
    SWAP 0, 1
    mova m1, [pw_32_0]
    paddw m1, m0 ; row1/row0 corrected
    psraw m0, 1 ; row1>>1/...
    psraw m3, m2, 1 ; row3>>1/...
    movsd m0, m1 ; row1>>1/row0
    movsd m3, m2 ; row3>>1/row2
    psubw m0, m2 ; row1>>1-row3/row0-2
    paddw m3, m1 ; row3>>1+row1/row0+2
    SBUTTERFLY2 qdq, 0, 3, 1
    SUMSUB_BA w, 3, 0, 1
    movd m4, [r0+FDEC_STRIDE*0]
    movd m1, [r0+FDEC_STRIDE*1]
    movd m2, [r0+FDEC_STRIDE*2]
    movd m5, [r0+FDEC_STRIDE*3]
    punpckldq m1, m4 ; row0/row1
    pxor m4, m4
    punpckldq m2, m5 ; row3/row2
    punpcklbw m1, m4
    psraw m3, 6
    punpcklbw m2, m4
    psraw m0, 6
    paddsw m3, m1
    paddsw m0, m2
    packuswb m0, m3 ; row0/row1/row3/row2
    pextrd [r0+FDEC_STRIDE*0], m0, 3
    pextrd [r0+FDEC_STRIDE*1], m0, 2
    movd   [r0+FDEC_STRIDE*2], m0
    pextrd [r0+FDEC_STRIDE*3], m0, 1
    RET
%endmacro ; ADD4x4

INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4

%macro STOREx2_AVX2 9
    movq xm%3, [r0+%5*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
    movq xm%4, [r0+%7*FDEC_STRIDE]
    vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
    punpcklbw m%3, m%9
    punpcklbw m%4, m%9
    psraw m%1, 6
    psraw m%2, 6
    paddsw m%1, m%3
    paddsw m%2, m%4
    packuswb m%1, m%2
    vextracti128 xm%2, m%1, 1
    movq   [r0+%5*FDEC_STRIDE], xm%1
    movq   [r0+%6*FDEC_STRIDE], xm%2
    movhps [r0+%7*FDEC_STRIDE], xm%1
    movhps [r0+%8*FDEC_STRIDE], xm%2
%endmacro

INIT_YMM avx2
cglobal add8x8_idct, 2,3,8
    add r0, 4*FDEC_STRIDE
    pxor m7, m7
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    ; TRANSPOSE4x4Q
    mova xm0, [r1+ 0]
    mova xm1, [r1+32]
    mova xm2, [r1+16]
    mova xm3, [r1+48]
    vinserti128 m0, m0, [r1+ 64], 1
    vinserti128 m1, m1, [r1+ 96], 1
    vinserti128 m2, m2, [r1+ 80], 1
    vinserti128 m3, m3, [r1+112], 1
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
    STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
    ret

; 2xdst, 2xtmp, 4xsrcrow, 1xzero
%macro LOAD_DIFF8x2_AVX2 9
    movq xm%1, [r1+%5*FENC_STRIDE]
    movq xm%2, [r1+%6*FENC_STRIDE]
    vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
    vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
    punpcklbw m%1, m%9
    punpcklbw m%2, m%9
    movq xm%3, [r2+(%5-4)*FDEC_STRIDE]
    movq xm%4, [r2+(%6-4)*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
    vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
    punpcklbw m%3, m%9
    punpcklbw m%4, m%9
    psubw m%1, m%3
    psubw m%2, m%4
%endmacro

; 4x src, 1x tmp
%macro STORE8_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+ 0], xm%1
    mova [r0+16], xm%3
    mova [r0+32], xm%2
    mova [r0+48], xm%4
    vextracti128 [r0+ 64], m%1, 1
    vextracti128 [r0+ 80], m%3, 1
    vextracti128 [r0+ 96], m%2, 1
    vextracti128 [r0+112], m%4, 1
%endmacro

%macro STORE16_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+ 0-128], xm%1
    mova [r0+16-128], xm%3
    mova [r0+32-128], xm%2
    mova [r0+48-128], xm%4
    vextracti128 [r0+ 0], m%1, 1
    vextracti128 [r0+16], m%3, 1
    vextracti128 [r0+32], m%2, 1
    vextracti128 [r0+48], m%4, 1
%endmacro

INIT_YMM avx2
cglobal sub8x8_dct, 3,3,7
    pxor m6, m6
    add r2, 4*FDEC_STRIDE
    LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
    LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE8_DCT_AVX2 0, 1, 2, 3, 4
    RET

INIT_YMM avx2
cglobal sub16x16_dct, 3,3,6
    add r0, 128
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 256-64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    RET
.sub16x4_dct:
    LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE16_DCT_AVX2 0, 1, 2, 3, 4
    ret

%macro DCT4x4_AVX512 0
    psubw m0, m2 ; 0 1
    psubw m1, m3 ; 3 2
    SUMSUB_BA w, 1, 0, 2
    SBUTTERFLY wd, 1, 0, 2
    paddw m2, m1, m0
    psubw m3, m1, m0
    vpaddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
    vpsubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
    shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
    punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1
    SUMSUB_BA w, 1, 2, 3
    shufps m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 c1-c2 d1-d2
    shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
    paddw m2, m1, m3
    psubw m0, m1, m3
    vpaddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
    vpsubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
%endmacro
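
; (DCT4x4_AVX512 transforms multiple 4x4 blocks per register; the
; merge-masked vpaddw/vpsubw apply one operand a second time in the
; lanes selected by k1/k2, which yields the +/-2 coefficients of the
; core transform without separate shift instructions)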
INIT_XMM avx512
cglobal sub4x4_dct
    mov eax, 0xf0aa
    kmovw k1, eax
    PROLOGUE 3,3
    movd m0, [r1+0*FENC_STRIDE]
    movd m2, [r2+0*FDEC_STRIDE]
    vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE]
    vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE]
    movd m1, [r1+3*FENC_STRIDE]
    movd m3, [r2+3*FDEC_STRIDE]
    vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE]
    vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE]
    kshiftrw k2, k1, 8
    pxor m4, m4
    punpcklbw m0, m4
    punpcklbw m2, m4
    punpcklbw m1, m4
    punpcklbw m3, m4
    DCT4x4_AVX512
    mova [r0], m2
    mova [r0+16], m0
    RET

INIT_ZMM avx512
cglobal dct4x4x4_internal
    punpcklbw m0, m1, m4
    punpcklbw m2, m3, m4
    punpckhbw m1, m4
    punpckhbw m3, m4
    DCT4x4_AVX512
    mova m1, m2
    vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0
    vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1
    ret

%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2
    movu %1, [r1+%3*FENC_STRIDE]
    vpermt2d %1, %2, [r1+%4*FENC_STRIDE]
%endmacro

%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2
    movu %1, [r2+(%4 )*FDEC_STRIDE]
    vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE]
    movu %3, [r2+(%5 )*FDEC_STRIDE]
    vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE]
    vpermt2d %1, %2, %3
%endmacro

cglobal sub8x8_dct, 3,3
    mova m0, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
    mov r1d, 0xaaaaaaaa
    kmovd k1, r1d
    psrld m0, 5
    DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4
    mov r1d, 0xf0f0f0f0
    kmovd k2, r1d
    pxor xm4, xm4
    knotw k3, k2
    call dct4x4x4_internal_avx512
    mova [r0], m0
    mova [r0+64], m1
    RET

%macro SUB4x16_DCT_AVX512 2 ; dst, src
    vpermd m1, m5, [r1+1*%2*64]
    mova m3, [r2+2*%2*64]
    vpermt2d m3, m6, [r2+2*%2*64+64]
    call dct4x4x4_internal_avx512
    mova [r0+%1*64    ], m0
    mova [r0+%1*64+128], m1
%endmacro

cglobal sub16x16_dct
    psrld m5, [dct_avx512], 10
    mov eax, 0xaaaaaaaa
    kmovd k1, eax
    mov eax, 0xf0f0f0f0
    kmovd k2, eax
    PROLOGUE 3,3
    pxor xm4, xm4
    knotw k3, k2
    psrld m6, m5, 4
    SUB4x16_DCT_AVX512 0, 0
    SUB4x16_DCT_AVX512 1, 1
    SUB4x16_DCT_AVX512 4, 2
    SUB4x16_DCT_AVX512 5, 3
    RET

cglobal sub8x8_dct_dc, 3,3
    mova m3, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
    mov r1d, 0xaa
    kmovb k1, r1d
    psrld m3, 5
    DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
    pxor xm3, xm3
    psadbw m0, m3
    psadbw m1, m3
    psubw m0, m1
    vpmovqw xmm0, m0
    vprold xmm1, xmm0, 16
    paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
    punpckhqdq xmm2, xmm0, xmm0
    psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
    paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
    punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
    punpcklqdq xmm1, xmm0, xmm0
    vpsubw xmm0 {k1}, xm3, xmm0
    paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
    movhps [r0], xmm0
    RET

cglobal sub8x16_dct_dc, 3,3
    mova m5, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5
    DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
    mov r1d, 0xaa
    kmovb k1, r1d
    psrld m5, 5
    DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8
    DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12
    pxor xm4, xm4
    psadbw m0, m4
    psadbw m1, m4
    psadbw m2, m4
    psadbw m3, m4
    psubw m0, m2
    psubw m1, m3
    SBUTTERFLY qdq, 0, 1, 2
    paddw m0, m1
    vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7
    psrlq xmm2, xmm0, 32
    psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
    paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7
    punpckhdq xmm2, xmm0, xmm1
    punpckldq xmm0, xmm1
    psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
    paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
    punpcklwd xmm0, xmm1
    psrlq xmm2, xmm0, 32
    psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
    paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
    shufps xmm0, xmm1, q0220
    mova [r0], xmm0
    RET

%macro SARSUMSUB 3 ; a, b, tmp
    mova m%3, m%1
    vpsraw m%1 {k1}, 1
    psubw m%1, m%2 ; 0-2 1>>1-3
    vpsraw m%2 {k1}, 1
    paddw m%2, m%3 ; 0+2 1+3>>1
%endmacro
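
; (masked form of the inverse-transform butterfly: vpsraw {k1} halves
; only the lanes selected by k1, so the row>>1 terms and the unshifted
; rows come out of a single add/sub pair)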
cglobal add8x8_idct, 2,2
    mova m1, [r1]
    mova m2, [r1+64]
    mova m3, [dct_avx512]
    vbroadcasti32x4 m4, [pw_32]
    mov r1d, 0xf0f0f0f0
    kxnorb k2, k2, k2
    kmovd k1, r1d
    kmovb k3, k2
    vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d
    vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f
    psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
    vpgatherqq m6 {k2}, [r0+m5]
    SARSUMSUB 0, 1, 2
    SBUTTERFLY wd, 1, 0, 2
    psrlq m7, m3, 28
    SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3
    vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1
    SBUTTERFLY dq, 0, 1, 2
    psrlq m3, 24
    SARSUMSUB 0, 1, 2
    vpermi2q m3, m1, m0
    vpermt2q m1, m7, m0
    paddw m3, m4 ; += 32
    SUMSUB_BA w, 1, 3, 0
    psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
    psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
    pxor xm0, xm0
    SBUTTERFLY bw, 6, 0, 2
    paddsw m1, m6
    paddsw m3, m0
    packuswb m1, m3
    vpscatterqq [r0+m5] {k3}, m1
    RET
%endif ; HIGH_BIT_DEPTH

INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 7
cglobal %1, 3,3,%7
%if HIGH_BIT_DEPTH == 0
%if mmsize == 8
    pxor m7, m7
%else
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul]
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
    call %2.skip_prologue
    add r0, %3
    add r1, %4-%5-%6*FENC_STRIDE
    add r2, %4-%5-%6*FDEC_STRIDE
    call %2.skip_prologue
    add r0, %3
    add r1, (%4-%6)*FENC_STRIDE-%5-%4
    add r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call %2.skip_prologue
    add r0, %3
    add r1, %4-%5-%6*FENC_STRIDE
    add r2, %4-%5-%6*FDEC_STRIDE
    TAIL_CALL %2.skip_prologue, 1
%endmacro

;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%if HIGH_BIT_DEPTH
cglobal %1, 2,2,%7
%if %3==256
    add r1, 128
%endif
%else
cglobal %1, 2,2,11
    pxor m7, m7
%endif
%if mmsize >= 16 && %3 != 256
    add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
    call %2.skip_prologue
    add r0, %4-%5-%6*FDEC_STRIDE
    add r1, %3
    call %2.skip_prologue
    add r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add r1, %3
    call %2.skip_prologue
    add r0, %4-%5-%6*FDEC_STRIDE
    add r1, %3
    TAIL_CALL %2.skip_prologue, 1
%endmacro

%if HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_sse4.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
%if ARCH_X86_64 == 0
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif
INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
cextern sub8x8_dct_xop.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
INIT_YMM
ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
%macro ADD_DC 2
    mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
    mova m1, [%1+FDEC_STRIDEB*1]
    mova m2, [%1+FDEC_STRIDEB*2]
    paddsw m0, %2
    paddsw m1, %2
    paddsw m2, %2
    paddsw %2, [%1+FDEC_STRIDEB*3]
    CLIPW m0, m5, m6
    CLIPW m1, m5, m6
    CLIPW m2, m5, m6
    CLIPW %2, m5, m6
    mova [%1+FDEC_STRIDEB*0], m0
    mova [%1+FDEC_STRIDEB*1], m1
    mova [%1+FDEC_STRIDEB*2], m2
    mova [%1+FDEC_STRIDEB*3], %2
%endmacro

%macro ADD_IDCT_DC 0
cglobal add8x8_idct_dc, 2,2,7
    mova m6, [pw_pixel_max]
    pxor m5, m5
    mova m3, [r1]
    paddd m3, [pd_32]
    psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
    pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
    pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
    pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+FDEC_STRIDEB*4, m3
    RET

cglobal add16x16_idct_dc, 2,3,8
    mov r2, 4
    mova m6, [pw_pixel_max]
    mova m7, [pd_32]
    pxor m5, m5
.loop:
    mova m3, [r1]
    paddd m3, m7
    psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
    pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
    pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
    pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+SIZEOF_PIXEL*8, m3
    add r1, 16
    add r0, 4*FDEC_STRIDEB
    dec r2
    jg .loop
    RET
%endmacro ; ADD_IDCT_DC

INIT_XMM sse2
ADD_IDCT_DC
INIT_XMM avx
ADD_IDCT_DC
%else ;!HIGH_BIT_DEPTH
%macro ADD_DC 3
    mova m4, [%3+FDEC_STRIDE*0]
    mova m5, [%3+FDEC_STRIDE*1]
    mova m6, [%3+FDEC_STRIDE*2]
    paddusb m4, %1
    paddusb m5, %1
    paddusb m6, %1
    paddusb %1, [%3+FDEC_STRIDE*3]
    psubusb m4, %2
    psubusb m5, %2
    psubusb m6, %2
    psubusb %1, %2
    mova [%3+FDEC_STRIDE*0], m4
    mova [%3+FDEC_STRIDE*1], m5
    mova [%3+FDEC_STRIDE*2], m6
    mova [%3+FDEC_STRIDE*3], %1
%endmacro

INIT_MMX mmx2
cglobal add8x8_idct_dc, 2,2
    mova m0, [r1]
    pxor m1, m1
    add r0, FDEC_STRIDE*4
    paddw m0, [pw_32]
    psraw m0, 6
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw m2, m0, q3322
    pshufw m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC m0, m1, r0-FDEC_STRIDE*4
    ADD_DC m2, m3, r0
    RET

INIT_XMM ssse3
cglobal add8x8_idct_dc, 2,2
    movh m0, [r1]
    pxor m1, m1
    add r0, FDEC_STRIDE*4
    pmulhrsw m0, [pw_512]
    psubw m1, m0
    mova m5, [pb_unpackbd1]
    packuswb m0, m0
    packuswb m1, m1
    pshufb m0, m5
    pshufb m1, m5
    movh m2, [r0+FDEC_STRIDE*-4]
    movh m3, [r0+FDEC_STRIDE*-3]
    movh m4, [r0+FDEC_STRIDE*-2]
    movh m5, [r0+FDEC_STRIDE*-1]
    movhps m2, [r0+FDEC_STRIDE* 0]
    movhps m3, [r0+FDEC_STRIDE* 1]
    movhps m4, [r0+FDEC_STRIDE* 2]
    movhps m5, [r0+FDEC_STRIDE* 3]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    movh [r0+FDEC_STRIDE*-4], m2
    movh [r0+FDEC_STRIDE*-3], m3
    movh [r0+FDEC_STRIDE*-2], m4
    movh [r0+FDEC_STRIDE*-1], m5
    movhps [r0+FDEC_STRIDE* 0], m2
    movhps [r0+FDEC_STRIDE* 1], m3
    movhps [r0+FDEC_STRIDE* 2], m4
    movhps [r0+FDEC_STRIDE* 3], m5
    RET

INIT_MMX mmx2
cglobal add16x16_idct_dc, 2,3
    mov r2, 4
.loop:
    mova m0, [r1]
    pxor m1, m1
    paddw m0, [pw_32]
    psraw m0, 6
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw m2, m0, q3322
    pshufw m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC m0, m1, r0
    ADD_DC m2, m3, r0+8
    add r1, 8
    add r0, FDEC_STRIDE*4
    dec r2
    jg .loop
    RET

INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add r0, FDEC_STRIDE*4
    movq m0, [r1+0]
    movq m2, [r1+8]
    add r1, 16
    punpcklwd m0, m0
    punpcklwd m2, m2
    pxor m3, m3
    paddw m0, [pw_32]
    paddw m2, [pw_32]
    psraw m0, 6
    psraw m2, 6
    psubw m1, m3, m0
    packuswb m0, m1
    psubw m3, m2
    punpckhbw m1, m0, m0
    packuswb m2, m3
    punpckhbw m3, m2, m2
    punpcklbw m0, m0
    punpcklbw m2, m2
    ADD_DC m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC m2, m3, r0
    ret

%macro ADD16x16 0
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add r0, FDEC_STRIDE*4
    mova m0, [r1]
    add r1, 16
    pxor m1, m1
    pmulhrsw m0, [pw_512]
    psubw m1, m0
    mova m5, [pb_unpackbd1]
    mova m6, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb m2, m0, m6
    pshufb m0, m5
    pshufb m3, m1, m6
    pshufb m1, m5
    ADD_DC m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC m2, m3, r0
    ret
%endmacro ; ADD16x16

INIT_XMM ssse3
ADD16x16
INIT_XMM avx
ADD16x16

%macro ADD_DC_AVX2 3
    mova xm4, [r0+FDEC_STRIDE*0+%3]
    mova xm5, [r0+FDEC_STRIDE*1+%3]
    vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
    vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
    paddusb m4, %1
    paddusb m5, %1
    psubusb m4, %2
    psubusb m5, %2
    mova [r0+FDEC_STRIDE*0+%3], xm4
    mova [r0+FDEC_STRIDE*1+%3], xm5
    vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
    vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
%endmacro

INIT_YMM avx2
cglobal add16x16_idct_dc, 2,3,6
    add r0, FDEC_STRIDE*4
    mova m0, [r1]
    pxor m1, m1
    pmulhrsw m0, [pw_512]
    psubw m1, m0
    mova m4, [pb_unpackbd1]
    mova m5, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb m2, m0, m4 ; row0, row2
    pshufb m3, m1, m4 ; row0, row2
    pshufb m0, m5 ; row1, row3
    pshufb m1, m5 ; row1, row3
    lea r2, [r0+FDEC_STRIDE*8]
    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4
    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2
    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0
    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
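; (sums each 4x4 quadrant of the fenc-fdec difference with psadbw
; against zero, then combines the four quadrant DCs with the 2x2
; Hadamard in DCT2x2 below)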
%macro DCTDC_2ROW_MMX 4
    mova %1, [r1+FENC_STRIDE*(0+%3)]
    mova m1, [r1+FENC_STRIDE*(1+%3)]
    mova m2, [r2+FDEC_STRIDE*(0+%4)]
    mova m3, [r2+FDEC_STRIDE*(1+%4)]
    mova %2, %1
    punpckldq %1, m1
    punpckhdq %2, m1
    mova m1, m2
    punpckldq m2, m3
    punpckhdq m1, m3
    pxor m3, m3
    psadbw %1, m3
    psadbw %2, m3
    psadbw m2, m3
    psadbw m1, m3
    psubw %1, m2
    psubw %2, m1
%endmacro

%macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
    PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0
    PSHUFLW m0, %2, q2301 ; s3 __ s2 __
    paddw m1, %2 ; s1 s13 s0 s02
    psubw m1, m0 ; d13 s13 d02 s02
    PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02
    psrlq m1, 32 ; __ __ d13 s13
    paddw m0, m1 ; d02 s02 d02+d13 s02+s13
    psllq m1, 32 ; d13 s13
    psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
%endmacro
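
; (with s0..s3 the four quadrant sums, the packed result holds the 2x2
; Hadamard combinations s0+s1+s2+s3, s0-s1+s2-s3, s0+s1-s2-s3 and
; s0-s1-s2+s3, as traced lane-by-lane in the comments above)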
%if HIGH_BIT_DEPTH == 0
INIT_MMX mmx2
cglobal sub8x8_dct_dc, 3,3
    DCTDC_2ROW_MMX m0, m4, 0, 0
    DCTDC_2ROW_MMX m5, m6, 2, 2
    paddw m0, m5
    paddw m4, m6
    punpckldq m0, m4
    add r2, FDEC_STRIDE*4
    DCTDC_2ROW_MMX m7, m4, 4, 0
    DCTDC_2ROW_MMX m5, m6, 6, 2
    paddw m7, m5
    paddw m4, m6
    punpckldq m7, m4
    DCT2x2 m0, m7
    mova [r0], m0
    ret

%macro DCTDC_2ROW_SSE2 4
    movh m1, [r1+FENC_STRIDE*(0+%1)]
    movh m2, [r1+FENC_STRIDE*(1+%1)]
    punpckldq m1, m2
    movh m2, [r2+FDEC_STRIDE*(0+%2)]
    punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
    psadbw m1, m0
    psadbw m2, m0
    ACCUM paddd, %4, 1, %3
    psubd m%4, m2
%endmacro

INIT_XMM sse2
cglobal sub8x8_dct_dc, 3,3
    pxor m0, m0
    DCTDC_2ROW_SSE2 0, 0, 0, 3
    DCTDC_2ROW_SSE2 2, 2, 1, 3
    add r2, FDEC_STRIDE*4
    DCTDC_2ROW_SSE2 4, 0, 0, 4
    DCTDC_2ROW_SSE2 6, 2, 1, 4
    packssdw m3, m3
    packssdw m4, m4
    DCT2x2 m3, m4
    movq [r0], m0
    RET

%macro SUB8x16_DCT_DC 0
cglobal sub8x16_dct_dc, 3,3
    pxor m0, m0
    DCTDC_2ROW_SSE2 0, 0, 0, 3
    DCTDC_2ROW_SSE2 2, 2, 1, 3
    add r1, FENC_STRIDE*8
    add r2, FDEC_STRIDE*8
    DCTDC_2ROW_SSE2 -4, -4, 0, 4
    DCTDC_2ROW_SSE2 -2, -2, 1, 4
    shufps m3, m4, q2020
    DCTDC_2ROW_SSE2 0, 0, 0, 5
    DCTDC_2ROW_SSE2 2, 2, 1, 5
    add r2, FDEC_STRIDE*4
    DCTDC_2ROW_SSE2 4, 0, 0, 4
    DCTDC_2ROW_SSE2 6, 2, 1, 4
    shufps m5, m4, q2020
%if cpuflag(ssse3)
%define %%sign psignw
%else
%define %%sign pmullw
%endif
    SUMSUB_BA d, 5, 3, 0
    packssdw m5, m3
    pshuflw m0, m5, q2301
    pshufhw m0, m0, q2301
    %%sign m5, [pw_pmpmpmpm]
    paddw m0, m5
    pshufd m1, m0, q1320
    pshufd m0, m0, q0231
    %%sign m1, [pw_ppppmmmm]
    paddw m0, m1
    mova [r0], m0
    RET
%endmacro ; SUB8x16_DCT_DC

INIT_XMM sse2
SUB8x16_DCT_DC
INIT_XMM ssse3
SUB8x16_DCT_DC
%endif ; !HIGH_BIT_DEPTH

%macro DCTDC_4ROW_SSE2 2
    mova %1, [r1+FENC_STRIDEB*%2]
    mova m0, [r2+FDEC_STRIDEB*%2]
%assign Y (%2+1)
%rep 3
    paddw %1, [r1+FENC_STRIDEB*Y]
    paddw m0, [r2+FDEC_STRIDEB*Y]
%assign Y (Y+1)
%endrep
    psubw %1, m0
    pshufd m0, %1, q2301
    paddw %1, m0
%endmacro

%if HIGH_BIT_DEPTH
%macro SUB8x8_DCT_DC_10 0
cglobal sub8x8_dct_dc, 3,3,3
    DCTDC_4ROW_SSE2 m1, 0
    DCTDC_4ROW_SSE2 m2, 4
    mova m0, [pw_ppmmmmpp]
    pmaddwd m1, m0
    pmaddwd m2, m0
    pshufd m0, m1, q2200 ; -1 -1 +0 +0
    pshufd m1, m1, q0033 ; +0 +0 +1 +1
    paddd m1, m0
    pshufd m0, m2, q1023 ; -2 +2 -3 +3
    paddd m1, m2
    paddd m1, m0
    mova [r0], m1
    RET
%endmacro

INIT_XMM sse2
SUB8x8_DCT_DC_10

%macro SUB8x16_DCT_DC_10 0
cglobal sub8x16_dct_dc, 3,3,6
    DCTDC_4ROW_SSE2 m1, 0
    DCTDC_4ROW_SSE2 m2, 4
    DCTDC_4ROW_SSE2 m3, 8
    DCTDC_4ROW_SSE2 m4, 12
    mova m0, [pw_ppmmmmpp]
    pmaddwd m1, m0
    pmaddwd m2, m0
    pshufd m5, m1, q2200 ; -1 -1 +0 +0
    pshufd m1, m1, q0033 ; +0 +0 +1 +1
    paddd m1, m5
    pshufd m5, m2, q1023 ; -2 +2 -3 +3
    paddd m1, m2
    paddd m1, m5 ; a6 a2 a4 a0
    pmaddwd m3, m0
    pmaddwd m4, m0
    pshufd m5, m3, q2200
    pshufd m3, m3, q0033
    paddd m3, m5
    pshufd m5, m4, q1023
    paddd m3, m4
    paddd m3, m5 ; a7 a3 a5 a1
    paddd m0, m1, m3
    psubd m1, m3
    pshufd m0, m0, q3120
    pshufd m1, m1, q3120
    punpcklqdq m2, m0, m1
    punpckhqdq m1, m0
    mova [r0+ 0], m2
    mova [r0+16], m1
    RET
%endmacro

INIT_XMM sse2
SUB8x16_DCT_DC_10
INIT_XMM avx
SUB8x16_DCT_DC_10
%endif

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
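; (the scan is assembled purely with register shuffles: PALIGNR rotates
; the rows so that the later punpck/pshuf steps can emit coefficients in
; zigzag order without any per-coefficient loads)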
  1324. %macro SCAN_8x8 0
  1325. cglobal zigzag_scan_8x8_frame, 2,2,8
  1326. movdqa xmm0, [r1]
  1327. movdqa xmm1, [r1+16]
  1328. movdq2q mm0, xmm0
  1329. PALIGNR xmm1, xmm1, 14, xmm2
  1330. movdq2q mm1, xmm1
  1331. movdqa xmm2, [r1+32]
  1332. movdqa xmm3, [r1+48]
  1333. PALIGNR xmm2, xmm2, 12, xmm4
  1334. movdq2q mm2, xmm2
  1335. PALIGNR xmm3, xmm3, 10, xmm4
  1336. movdq2q mm3, xmm3
  1337. punpckhwd xmm0, xmm1
  1338. punpckhwd xmm2, xmm3
  1339. movq mm4, mm1
  1340. movq mm5, mm1
  1341. movq mm6, mm2
  1342. movq mm7, mm3
  1343. punpckhwd mm1, mm0
  1344. psllq mm0, 16
  1345. psrlq mm3, 16
  1346. punpckhdq mm1, mm1
  1347. punpckhdq mm2, mm0
  1348. punpcklwd mm0, mm4
  1349. punpckhwd mm4, mm3
  1350. punpcklwd mm4, mm2
  1351. punpckhdq mm0, mm2
  1352. punpcklwd mm6, mm3
  1353. punpcklwd mm5, mm7
  1354. punpcklwd mm5, mm6
  1355. movdqa xmm4, [r1+64]
  1356. movdqa xmm5, [r1+80]
  1357. movdqa xmm6, [r1+96]
  1358. movdqa xmm7, [r1+112]
  1359. movq [r0+2*00], mm0
  1360. movq [r0+2*04], mm4
  1361. movd [r0+2*08], mm1
  1362. movq [r0+2*36], mm5
  1363. movq [r0+2*46], mm6
  1364. PALIGNR xmm4, xmm4, 14, xmm3
  1365. movdq2q mm4, xmm4
  1366. PALIGNR xmm5, xmm5, 12, xmm3
  1367. movdq2q mm5, xmm5
  1368. PALIGNR xmm6, xmm6, 10, xmm3
  1369. movdq2q mm6, xmm6
  1370. %if cpuflag(ssse3)
  1371. PALIGNR xmm7, xmm7, 8, xmm3
  1372. movdq2q mm7, xmm7
  1373. %else
  1374. movhlps xmm3, xmm7
  1375. punpcklqdq xmm7, xmm7
  1376. movdq2q mm7, xmm3
  1377. %endif
  1378. punpckhwd xmm4, xmm5
  1379. punpckhwd xmm6, xmm7
  1380. movq mm0, mm4
  1381. movq mm1, mm5
  1382. movq mm3, mm7
  1383. punpcklwd mm7, mm6
  1384. psrlq mm6, 16
  1385. punpcklwd mm4, mm6
  1386. punpcklwd mm5, mm4
  1387. punpckhdq mm4, mm3
  1388. punpcklwd mm3, mm6
  1389. punpckhwd mm3, mm4
  1390. punpckhwd mm0, mm1
  1391. punpckldq mm4, mm0
  1392. punpckhdq mm0, mm6
  1393. pshufw mm4, mm4, q1230
  1394. movq [r0+2*14], mm4
  1395. movq [r0+2*25], mm0
  1396. movd [r0+2*54], mm7
  1397. movq [r0+2*56], mm5
  1398. movq [r0+2*60], mm3
  1399. punpckhdq xmm3, xmm0, xmm2
  1400. punpckldq xmm0, xmm2
  1401. punpckhdq xmm7, xmm4, xmm6
  1402. punpckldq xmm4, xmm6
  1403. pshufhw xmm0, xmm0, q0123
  1404. pshuflw xmm4, xmm4, q0123
  1405. pshufhw xmm3, xmm3, q0123
  1406. pshuflw xmm7, xmm7, q0123
  1407. movlps [r0+2*10], xmm0
  1408. movhps [r0+2*17], xmm0
  1409. movlps [r0+2*21], xmm3
  1410. movlps [r0+2*28], xmm4
  1411. movhps [r0+2*32], xmm3
  1412. movhps [r0+2*39], xmm4
  1413. movlps [r0+2*43], xmm7
  1414. movhps [r0+2*50], xmm7
  1415. RET
  1416. %endmacro
  1417. %if HIGH_BIT_DEPTH == 0
  1418. INIT_XMM sse2
  1419. SCAN_8x8
  1420. INIT_XMM ssse3
  1421. SCAN_8x8
  1422. %endif
  1423. ;-----------------------------------------------------------------------------
  1424. ; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
  1425. ;-----------------------------------------------------------------------------
  1426. ; Output order:
  1427. ; 0 8 1 2 9 16 24 17
  1428. ; 10 3 4 11 18 25 32 40
  1429. ; 33 26 19 12 5 6 13 20
  1430. ; 27 34 41 48 56 49 42 35
  1431. ; 28 21 14 7 15 22 29 36
  1432. ; 43 50 57 58 51 44 37 30
  1433. ; 23 31 38 45 52 59 60 53
  1434. ; 46 39 47 54 61 62 55 63
%macro SCAN_8x8_FRAME 5
cglobal zigzag_scan_8x8_frame, 2,2,8
    mova       m0, [r1]
    mova       m1, [r1+ 8*SIZEOF_DCTCOEF]
    movu       m2, [r1+14*SIZEOF_DCTCOEF]
    movu       m3, [r1+21*SIZEOF_DCTCOEF]
    mova       m4, [r1+28*SIZEOF_DCTCOEF]
    punpckl%4  m5, m0, m1
    psrl%2     m0, %1
    punpckh%4  m6, m1, m0
    punpckl%3  m5, m0
    punpckl%3  m1, m1
    punpckh%4  m1, m3
    mova       m7, [r1+52*SIZEOF_DCTCOEF]
    mova       m0, [r1+60*SIZEOF_DCTCOEF]
    punpckh%4  m1, m2
    punpckl%4  m2, m4
    punpckh%4  m4, m3
    punpckl%3  m3, m3
    punpckh%4  m3, m2
    mova     [r0], m5
    mova [r0+ 4*SIZEOF_DCTCOEF], m1
    mova [r0+ 8*SIZEOF_DCTCOEF], m6
    punpckl%4  m6, m0
    punpckl%4  m6, m7
    mova       m1, [r1+32*SIZEOF_DCTCOEF]
    movu       m5, [r1+39*SIZEOF_DCTCOEF]
    movu       m2, [r1+46*SIZEOF_DCTCOEF]
    movu [r0+35*SIZEOF_DCTCOEF], m3
    movu [r0+47*SIZEOF_DCTCOEF], m4
    punpckh%4  m7, m0
    psll%2     m0, %1
    punpckh%3  m3, m5, m5
    punpckl%4  m5, m1
    punpckh%4  m1, m2
    mova [r0+52*SIZEOF_DCTCOEF], m6
    movu [r0+13*SIZEOF_DCTCOEF], m5
    movu       m4, [r1+11*SIZEOF_DCTCOEF]
    movu       m6, [r1+25*SIZEOF_DCTCOEF]
    punpckl%4  m5, m7
    punpckl%4  m1, m3
    punpckh%3  m0, m7
    mova       m3, [r1+ 4*SIZEOF_DCTCOEF]
    movu       m7, [r1+18*SIZEOF_DCTCOEF]
    punpckl%4  m2, m5
    movu [r0+25*SIZEOF_DCTCOEF], m1
    mova       m1, m4
    mova       m5, m6
    punpckl%4  m4, m3
    punpckl%4  m6, m7
    punpckh%4  m1, m3
    punpckh%4  m5, m7
    punpckh%3  m3, m6, m4
    punpckh%3  m7, m5, m1
    punpckl%3  m6, m4
    punpckl%3  m5, m1
    movu       m4, [r1+35*SIZEOF_DCTCOEF]
    movu       m1, [r1+49*SIZEOF_DCTCOEF]
    pshuf%5    m6, m6, q0123
    pshuf%5    m5, m5, q0123
    mova [r0+60*SIZEOF_DCTCOEF], m0
    mova [r0+56*SIZEOF_DCTCOEF], m2
    movu       m0, [r1+42*SIZEOF_DCTCOEF]
    mova       m2, [r1+56*SIZEOF_DCTCOEF]
    movu [r0+17*SIZEOF_DCTCOEF], m3
    mova [r0+32*SIZEOF_DCTCOEF], m7
    movu [r0+10*SIZEOF_DCTCOEF], m6
    movu [r0+21*SIZEOF_DCTCOEF], m5
    punpckh%4  m3, m0, m4
    punpckh%4  m7, m2, m1
    punpckl%4  m0, m4
    punpckl%4  m2, m1
    punpckl%3  m4, m2, m0
    punpckl%3  m1, m7, m3
    punpckh%3  m2, m0
    punpckh%3  m7, m3
    pshuf%5    m2, m2, q0123
    pshuf%5    m7, m7, q0123
    mova [r0+28*SIZEOF_DCTCOEF], m4
    movu [r0+43*SIZEOF_DCTCOEF], m1
    movu [r0+39*SIZEOF_DCTCOEF], m2
    movu [r0+50*SIZEOF_DCTCOEF], m7
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
INIT_XMM avx
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
%else
INIT_MMX mmx2
SCAN_8x8_FRAME 16, q , dq , wd, w
%endif
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
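;
; A rough C reference (illustrative only; the _ref name is hypothetical, and
; the scan table assumes the same flat coefficient layout as the 8x8 tables
; in this file, so treat it as a sketch rather than a normative definition):
;     static void zigzag_scan_4x4_frame_ref( dctcoef level[16], dctcoef dct[16] )
;     {
;         static const uint8_t scan4x4_frame[16] = {
;             0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15
;         };
;         for( int i = 0; i < 16; i++ )
;             level[i] = dct[scan4x4_frame[i]];
;     }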
%macro SCAN_4x4 4
cglobal zigzag_scan_4x4_frame, 2,2,6
    mova       m0, [r1+ 0*SIZEOF_DCTCOEF]
    mova       m1, [r1+ 4*SIZEOF_DCTCOEF]
    mova       m2, [r1+ 8*SIZEOF_DCTCOEF]
    mova       m3, [r1+12*SIZEOF_DCTCOEF]
    punpckl%4  m4, m0, m1
    psrl%2     m0, %1
    punpckl%3  m4, m0
    mova [r0+ 0*SIZEOF_DCTCOEF], m4
    punpckh%4  m0, m2
    punpckh%4  m4, m2, m3
    psll%2     m3, %1
    punpckl%3  m2, m2
    punpckl%4  m5, m1, m3
    punpckh%3  m1, m1
    punpckh%4  m5, m2
    punpckl%4  m1, m0
    punpckh%3  m3, m4
    mova [r0+ 4*SIZEOF_DCTCOEF], m5
    mova [r0+ 8*SIZEOF_DCTCOEF], m1
    mova [r0+12*SIZEOF_DCTCOEF], m3
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_4x4 4, dq, qdq, dq
INIT_XMM avx
SCAN_4x4 4, dq, qdq, dq
%else
INIT_MMX mmx
SCAN_4x4 16, q , dq , wd
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
%macro SCAN_4x4_FRAME 0
cglobal zigzag_scan_4x4_frame, 2,2
    mova       m1, [r1+16]
    mova       m0, [r1+ 0]
    pshufb     m1, [pb_scan4frameb]
    pshufb     m0, [pb_scan4framea]
    psrldq     m2, m1, 6
    palignr    m1, m0, 6
    pslldq     m0, 10
    palignr    m2, m0, 10
    mova  [r0+ 0], m1
    mova  [r0+16], m2
    RET
%endmacro

INIT_XMM ssse3
SCAN_4x4_FRAME
INIT_XMM avx
SCAN_4x4_FRAME

INIT_XMM xop
cglobal zigzag_scan_4x4_frame, 2,2
    mova       m0, [r1+ 0]
    mova       m1, [r1+16]
    vpperm     m2, m0, m1, [pb_scan4frame2a]
    vpperm     m1, m0, m1, [pb_scan4frame2b]
    mova  [r0+ 0], m2
    mova  [r0+16], m1
    RET
%endif ; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
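; The field scan is nearly raster order: only coefficients 2..5 move, and
; they are emitted as 4, 2, 3, 5 (this is what the q3102 shuffle below
; implements). A rough C reference, illustrative only (the _ref name is
; hypothetical; the int16_t version further down is identical apart from
; the coefficient type):
;     static void zigzag_scan_4x4_field_ref( int32_t level[16], int32_t dct[16] )
;     {
;         for( int i = 0; i < 16; i++ )
;             level[i] = dct[i];
;         level[2] = dct[4];
;         level[3] = dct[2];
;         level[4] = dct[3];
;     }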
INIT_XMM sse2
cglobal zigzag_scan_4x4_field, 2,2
    movu       m0, [r1+ 8]
    pshufd     m0, m0, q3102
    mova       m1, [r1+32]
    mova       m2, [r1+48]
    movu  [r0+ 8], m0
    mova  [r0+32], m1
    mova  [r0+48], m2
    movq      mm0, [r1]
    movq     [r0], mm0
    movq      mm0, [r1+24]
    movq  [r0+24], mm0
    RET
%else
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
INIT_XMM sse
cglobal zigzag_scan_4x4_field, 2,2
    mova       m0, [r1]
    mova       m1, [r1+16]
    pshufw    mm0, [r1+4], q3102
    mova     [r0], m0
    mova  [r0+16], m1
    movq  [r0+4], mm0
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
;  0  1  2  8  9  3  4 10
; 16 11  5  6  7 12 17 24
; 18 13 14 15 19 25 32 26
; 20 21 22 23 27 33 40 34
; 28 29 30 31 35 41 48 42
; 36 37 38 39 43 49 50 44
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
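;
; A rough C reference for the field scan (illustrative only; the _ref name
; is hypothetical, dct is addressed as a flat 64-coefficient array, and
; scan8x8_field[] is the output-order table listed above):
;     static void zigzag_scan_8x8_field_ref( int16_t level[64], int16_t dct[64] )
;     {
;         static const uint8_t scan8x8_field[64] = {
;              0,  1,  2,  8,  9,  3,  4, 10, 16, 11,  5,  6,  7, 12, 17, 24,
;             18, 13, 14, 15, 19, 25, 32, 26, 20, 21, 22, 23, 27, 33, 40, 34,
;             28, 29, 30, 31, 35, 41, 48, 42, 36, 37, 38, 39, 43, 49, 50, 44,
;             45, 46, 47, 51, 56, 57, 52, 53, 54, 55, 58, 59, 60, 61, 62, 63
;         };
;         for( int i = 0; i < 64; i++ )
;             level[i] = dct[scan8x8_field[i]];
;     }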
%undef SCAN_8x8
%macro SCAN_8x8 5
cglobal zigzag_scan_8x8_field, 2,3,8
    mova       m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
    mova       m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
    mova       m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
    pshuf%1    m3, m0, q3333              ; 03 03 03 03
    movd      r2d, m2                     ; 09 08
    pshuf%1    m2, m2, q0321              ; 08 11 10 09
    punpckl%2  m3, m1                     ; 05 03 04 03
    pinsr%1    m0, r2d, 3                 ; 08 02 01 00
    punpckl%2  m4, m2, m3                 ; 04 10 03 09
    pshuf%1    m4, m4, q2310              ; 10 04 03 09
    mova [r0+ 0*SIZEOF_DCTCOEF], m0       ; 08 02 01 00
    mova [r0+ 4*SIZEOF_DCTCOEF], m4       ; 10 04 03 09
    mova       m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
    mova       m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
    punpckl%3  m6, m5                     ; 17 16 XX XX
    psrl%4     m1, %5                     ; XX 07 06 05
    punpckh%2  m6, m2                     ; 08 17 11 16
    punpckl%3  m6, m1                     ; 06 05 11 16
    mova [r0+ 8*SIZEOF_DCTCOEF], m6       ; 06 05 11 16
    psrl%4     m1, %5                     ; XX XX 07 06
    punpckl%2  m1, m5                     ; 17 07 16 06
    mova       m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
    mova       m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
    punpckh%3  m1, m1                     ; 17 07 17 07
    punpckl%2  m6, m3, m2                 ; 25 13 24 12
    pextr%1   r2d, m5, 2
    mova [r0+24*SIZEOF_DCTCOEF], m0       ; 23 22 21 20
    punpckl%2  m1, m6                     ; 24 17 12 07
    mova [r0+12*SIZEOF_DCTCOEF], m1
    pinsr%1    m3, r2d, 0                 ; 15 14 13 18
    mova [r0+16*SIZEOF_DCTCOEF], m3       ; 15 14 13 18
    mova       m7, [r1+28*SIZEOF_DCTCOEF]
    mova       m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
    psrl%4     m5, %5*3                   ; XX XX XX 19
    pshuf%1    m1, m2, q3321              ; 27 27 26 25
    punpckl%2  m5, m0                     ; 33 XX 32 19
    psrl%4     m2, %5*3                   ; XX XX XX 27
    punpckl%2  m5, m1                     ; 26 32 25 19
    mova [r0+32*SIZEOF_DCTCOEF], m7
    mova [r0+20*SIZEOF_DCTCOEF], m5       ; 26 32 25 19
    mova       m7, [r1+36*SIZEOF_DCTCOEF]
    mova       m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
    pshuf%1    m3, m0, q3321              ; 35 35 34 33
    punpckl%2  m2, m1                     ; 41 XX 40 27
    mova [r0+40*SIZEOF_DCTCOEF], m7
    punpckl%2  m2, m3                     ; 34 40 33 27
    mova [r0+28*SIZEOF_DCTCOEF], m2
    mova       m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
    mova       m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
    psrl%4     m0, %5*3                   ; XX XX XX 35
    punpckl%2  m0, m2                     ; 49 XX 48 35
    pshuf%1    m3, m1, q3321              ; 43 43 42 41
    punpckl%2  m0, m3                     ; 42 48 41 35
    mova [r0+36*SIZEOF_DCTCOEF], m0
    pextr%1   r2d, m2, 3                  ; 51
    psrl%4     m1, %5*3                   ; XX XX XX 43
    punpckl%2  m1, m7                     ; 45 XX 44 43
    psrl%4     m2, %5                     ; XX 51 50 49
    punpckl%2  m1, m2                     ; 50 44 49 43
    pshuf%1    m1, m1, q2310              ; 44 50 49 43
    mova [r0+44*SIZEOF_DCTCOEF], m1
    psrl%4     m7, %5                     ; XX 47 46 45
    pinsr%1    m7, r2d, 3                 ; 51 47 46 45
    mova [r0+48*SIZEOF_DCTCOEF], m7
    mova       m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
    mova       m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
    mova       m7, [r1+60*SIZEOF_DCTCOEF]
    punpckl%3  m2, m0, m1                 ; 53 52 57 56
    punpckh%3  m1, m0                     ; 59 58 55 54
    mova [r0+52*SIZEOF_DCTCOEF], m2
    mova [r0+56*SIZEOF_DCTCOEF], m1
    mova [r0+60*SIZEOF_DCTCOEF], m7
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse4
SCAN_8x8 d, dq, qdq, dq, 4
INIT_XMM avx
SCAN_8x8 d, dq, qdq, dq, 4
%else
INIT_MMX mmx2
SCAN_8x8 w, wd, dq , q , 16
%endif
;-----------------------------------------------------------------------------
; int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
; int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *src,
;                             uint8_t *dst, int16_t *dc )
;-----------------------------------------------------------------------------
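;
; A rough C reference (illustrative only; _ref and zigzag4[] are hypothetical
; names, with zigzag4[] standing for the 4x4 frame or field scan table in the
; flat-index convention used above, and FENC_STRIDE/FDEC_STRIDE the usual
; x264 plane strides). The function zigzags the src-dst residual, copies src
; into dst, and returns whether any coefficient is nonzero; the "ac" variants
; additionally move the DC coefficient out into *dc and zero level[0]:
;     static int zigzag_sub_4x4_ref( int16_t level[16],
;                                    const uint8_t *src, uint8_t *dst )
;     {
;         int nz = 0;
;         for( int i = 0; i < 16; i++ )
;         {
;             int x = zigzag4[i] >> 2, y = zigzag4[i] & 3;
;             level[i] = src[x + y*FENC_STRIDE] - dst[x + y*FDEC_STRIDE];
;             nz |= level[i];
;         }
;         for( int y = 0; y < 4; y++ )
;             for( int x = 0; x < 4; x++ )
;                 dst[x + y*FDEC_STRIDE] = src[x + y*FENC_STRIDE];
;         return !!nz;
;     }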
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
cglobal zigzag_sub_4x4%1_%2, 4,4,8
%else
cglobal zigzag_sub_4x4%1_%2, 3,3,8
%endif
    movd       m0, [r1+0*FENC_STRIDE]
    movd       m1, [r1+1*FENC_STRIDE]
    movd       m2, [r1+2*FENC_STRIDE]
    movd       m3, [r1+3*FENC_STRIDE]
    movd       m4, [r2+0*FDEC_STRIDE]
    movd       m5, [r2+1*FDEC_STRIDE]
    movd       m6, [r2+2*FDEC_STRIDE]
    movd       m7, [r2+3*FDEC_STRIDE]
    movd [r2+0*FDEC_STRIDE], m0
    movd [r2+1*FDEC_STRIDE], m1
    movd [r2+2*FDEC_STRIDE], m2
    movd [r2+3*FDEC_STRIDE], m3
    punpckldq  m0, m1
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpckldq  m6, m7
    punpcklqdq m0, m2
    punpcklqdq m4, m6
    mova       m7, [pb_sub4%2]
    pshufb     m0, m7
    pshufb     m4, m7
    mova       m7, [hsub_mul]
    punpckhbw  m1, m0, m4
    punpcklbw  m0, m4
    pmaddubsw  m1, m7
    pmaddubsw  m0, m7
%ifidn %1, ac
    movd      r2d, m0
    pand       m0, [pb_subacmask]
%endif
    mova  [r0+ 0], m0
    por        m0, m1
    pxor       m2, m2
    mova  [r0+16], m1
    pcmpeqb    m0, m2
    pmovmskb  eax, m0
%ifidn %1, ac
    mov      [r3], r2w
%endif
    sub       eax, 0xffff
    shr       eax, 31
    RET
%endmacro

%if HIGH_BIT_DEPTH == 0
INIT_XMM ssse3
ZIGZAG_SUB_4x4   , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4   , field
ZIGZAG_SUB_4x4 ac, field
INIT_XMM avx
ZIGZAG_SUB_4x4   , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4   , field
ZIGZAG_SUB_4x4 ac, field
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
INIT_XMM xop
cglobal zigzag_scan_8x8_field, 2,3,7
    lea        r2, [pb_scan8field1]
%define off(m) (r2+m-pb_scan8field1)
    mova       m0, [r1+ 0]
    mova       m1, [r1+ 16]
    vpperm     m5, m0, m1, [off(pb_scan8field1)]
    mova [r0+ 0], m5
    vpperm     m0, m0, m1, [off(pb_scan8field2a)]
    mova       m2, [r1+ 32]
    mova       m3, [r1+ 48]
    vpperm     m5, m2, m3, [off(pb_scan8field2b)]
    por        m5, m0
    mova [r0+ 16], m5
    mova       m4, [off(pb_scan8field3b)]
    vpperm     m1, m1, m2, [off(pb_scan8field3a)]
    mova       m0, [r1+ 64]
    vpperm     m5, m3, m0, m4
    por        m5, m1
    mova [r0+ 32], m5
    ; 4b, 5b are the same as pb_scan8field3b.
    ; 5a is the same as pb_scan8field4a.
    mova       m5, [off(pb_scan8field4a)]
    vpperm     m2, m2, m3, m5
    mova       m1, [r1+ 80]
    vpperm     m6, m0, m1, m4
    por        m6, m2
    mova [r0+ 48], m6
    vpperm     m3, m3, m0, m5
    mova       m2, [r1+ 96]
    vpperm     m5, m1, m2, m4
    por        m5, m3
    mova [r0+ 64], m5
    vpperm     m5, m0, m1, [off(pb_scan8field6)]
    mova [r0+ 80], m5
    vpperm     m5, m1, m2, [off(pb_scan8field7)]
    mov       r2d, [r1+ 98]
    mov  [r0+ 90], r2d
    mova [r0+ 96], m5
    mova       m3, [r1+112]
    movd [r0+104], m3
    mov       r2d, [r1+108]
    mova [r0+112], m3
    mov  [r0+112], r2d
%undef off
    RET

cglobal zigzag_scan_8x8_frame, 2,3,8
    lea        r2, [pb_scan8frame1]
%define off(m) (r2+m-pb_scan8frame1)
    mova       m7, [r1+ 16]
    mova       m3, [r1+ 32]
    vpperm     m7, m7, m3, [off(pb_scan8framet1)] ;  8  9 14 15 16 17 21 22
    mova       m2, [r1+ 48]
    vpperm     m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
    mova       m1, [r1+ 80]
    mova       m4, [r1+ 64]
    vpperm     m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
    vpperm     m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
    vpperm     m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
    vpperm     m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
    mova       m4, [r1+ 96]
    vpperm     m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
    mova       m1, [r1+ 0]
    vpperm     m2, m1, m3, [off(pb_scan8framet8)] ;  0  1  2  7 24 28 29 36
    vpperm     m1, m2, m7, [off(pb_scan8frame1)]  ;  0  8  1  2  9 16 24 17
    mova [r0+ 0], m1
    movh       m0, [r1+ 6]
    movhps     m0, [r1+ 20]                       ;  3  4  5  6 10 11 12 13
    vpperm     m1, m0, m6, [off(pb_scan8frame2)]  ; 10  3  4 11 18 25 32 40
    mova [r0+ 16], m1
    vpperm     m1, m0, m5, [off(pb_scan8frame3)]  ; 33 26 19 12  5  6 13 20
    mova [r0+ 32], m1
    vpperm     m1, m2, m7, [off(pb_scan8frame5)]  ; 28 21 14  7 15 22 29 36
    mova [r0+ 64], m1
    movh       m0, [r1+100]
    movhps     m0, [r1+114]                       ; 50 51 52 53 57 58 59 60
    vpperm     m1, m5, m0, [off(pb_scan8frame6)]  ; 43 50 57 58 51 44 37 30
    mova [r0+ 80], m1
    vpperm     m1, m6, m0, [off(pb_scan8frame7)]  ; 23 31 38 45 52 59 60 53
    mova [r0+ 96], m1
    mova       m1, [r1+112]
    vpperm     m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
    vpperm     m1, m0, m4, [off(pb_scan8frame4)]  ; 27 34 41 48 56 49 42 35
    mova [r0+ 48], m1
    vpperm     m1, m0, m4, [off(pb_scan8frame8)]  ; 46 39 47 54 61 62 55 63
    mova [r0+112], m1
%undef off
    RET
%endif
;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
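;
; A rough C reference (illustrative only; the _ref name is hypothetical).
; src holds four 4x4 blocks interleaved coefficient-by-coefficient; this
; gathers each 4x4 block into dst and records a nonzero flag per block in
; the CAVLC nnz cache layout (indices 0, 1, 8, 9):
;     static void zigzag_interleave_8x8_cavlc_ref( int16_t *dst, int16_t *src,
;                                                  uint8_t *nnz )
;     {
;         for( int i = 0; i < 4; i++ )
;         {
;             int nz = 0;
;             for( int j = 0; j < 16; j++ )
;             {
;                 nz |= src[i + j*4];
;                 dst[i*16 + j] = src[i + j*4];
;             }
;             nnz[(i&1) + (i>>1)*8] = !!nz;
;         }
;     }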
%macro INTERLEAVE 2
    mova       m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
    mova       m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
    mova       m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
    mova       m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
    TRANSPOSE4x4%2 0,1,2,3,4
    mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
    mova [r0+(%1+32)*SIZEOF_PIXEL], m1
    mova [r0+(%1+64)*SIZEOF_PIXEL], m2
    mova [r0+(%1+96)*SIZEOF_PIXEL], m3
    packsswb   m0, m1
    ACCUM      por, 6, 2, %1
    ACCUM      por, 7, 3, %1
    ACCUM      por, 5, 0, %1
%endmacro

%macro ZIGZAG_8x8_CAVLC 1
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
    INTERLEAVE  0, %1
    INTERLEAVE  8, %1
    INTERLEAVE 16, %1
    INTERLEAVE 24, %1
    packsswb   m6, m7
    packsswb   m5, m6
    packsswb   m5, m5
    pxor       m0, m0
%if HIGH_BIT_DEPTH
    packsswb   m5, m5
%endif
    pcmpeqb    m5, m0
    paddb      m5, [pb_1]
    movd      r0d, m5
    mov    [r2+0], r0w
    shr       r0d, 16
    mov    [r2+8], r0w
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
ZIGZAG_8x8_CAVLC D
INIT_XMM avx
ZIGZAG_8x8_CAVLC D
%else
INIT_MMX mmx
ZIGZAG_8x8_CAVLC W
%endif

%macro INTERLEAVE_XMM 1
    mova       m0, [r1+%1*4+ 0]
    mova       m1, [r1+%1*4+16]
    mova       m4, [r1+%1*4+32]
    mova       m5, [r1+%1*4+48]
    SBUTTERFLY wd, 0, 1, 6
    SBUTTERFLY wd, 4, 5, 7
    SBUTTERFLY wd, 0, 1, 6
    SBUTTERFLY wd, 4, 5, 7
    movh   [r0+%1+  0], m0
    movhps [r0+%1+ 32], m0
    movh   [r0+%1+ 64], m1
    movhps [r0+%1+ 96], m1
    movh   [r0+%1+  8], m4
    movhps [r0+%1+ 40], m4
    movh   [r0+%1+ 72], m5
    movhps [r0+%1+104], m5
    ACCUM      por, 2, 0, %1
    ACCUM      por, 3, 1, %1
    por        m2, m4
    por        m3, m5
%endmacro

%if HIGH_BIT_DEPTH == 0
%macro ZIGZAG_8x8_CAVLC 0
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
    INTERLEAVE_XMM  0
    INTERLEAVE_XMM 16
    packsswb   m2, m3
    pxor       m5, m5
    packsswb   m2, m2
    packsswb   m2, m2
    pcmpeqb    m5, m2
    paddb      m5, [pb_1]
    movd      r0d, m5
    mov    [r2+0], r0w
    shr       r0d, 16
    mov    [r2+8], r0w
    RET
%endmacro

INIT_XMM sse2
ZIGZAG_8x8_CAVLC
INIT_XMM avx
ZIGZAG_8x8_CAVLC

INIT_YMM avx2
cglobal zigzag_interleave_8x8_cavlc, 3,3,6
    mova       m0, [r1+ 0]
    mova       m1, [r1+32]
    mova       m2, [r1+64]
    mova       m3, [r1+96]
    mova       m5, [deinterleave_shufd]
    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4
    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4
    vpermd     m0, m5, m0
    vpermd     m1, m5, m1
    vpermd     m2, m5, m2
    vpermd     m3, m5, m3
    mova [r0+  0], xm0
    mova [r0+ 16], xm2
    vextracti128 [r0+ 32], m0, 1
    vextracti128 [r0+ 48], m2, 1
    mova [r0+ 64], xm1
    mova [r0+ 80], xm3
    vextracti128 [r0+ 96], m1, 1
    vextracti128 [r0+112], m3, 1
    packsswb   m0, m2          ; nnz0, nnz1
    packsswb   m1, m3          ; nnz2, nnz3
    packsswb   m0, m1          ; {nnz0,nnz2}, {nnz1,nnz3}
    vpermq     m0, m0, q3120   ; {nnz0,nnz1}, {nnz2,nnz3}
    pxor       m5, m5
    pcmpeqq    m0, m5
    pmovmskb  r0d, m0
    not       r0d
    and       r0d, 0x01010101
    mov    [r2+0], r0w
    shr       r0d, 16
    mov    [r2+8], r0w
    RET
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
INIT_ZMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
    mova       m0, [scan_frame_avx512]
    vpermd     m0, m0, [r1]
    mova     [r0], m0
    RET

cglobal zigzag_scan_4x4_field, 2,2
    mova       m0, [r1]
    pshufd   xmm1, [r1+8], q3102
    mova     [r0], m0
    movu  [r0+8], xmm1
    RET

cglobal zigzag_scan_8x8_frame, 2,2
    psrld      m0, [scan_frame_avx512], 4
    mova       m1, [r1+0*64]
    mova       m2, [r1+1*64]
    mova       m3, [r1+2*64]
    mova       m4, [r1+3*64]
    mov       r1d, 0x01fe7f80
    kmovd      k1, r1d
    kshiftrd   k2, k1, 16
    vpermd     m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
    psrld      m6, m0, 5
    vpermi2d   m0, m1, m2 ;  0  8  1  2  9 16 24 17 10  3  4 11 18 25 __ __
    vmovdqa64  m0 {k1}, m5
    mova [r0+0*64], m0
    mova       m5, m1
    vpermt2d   m1, m6, m2 ; __ 26 19 12  5  6 13 20 27 __ __ __ __ __ __ __
    psrld      m0, m6, 5
    vpermi2d   m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
    vmovdqa32  m6 {k2}, m1
    mova [r0+1*64], m6
    vpermt2d   m5, m0, m2 ; 28 21 14  7 15 22 29 __ __ __ __ __ __ __ __ 30
    psrld      m1, m0, 5
    vpermi2d   m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
    vmovdqa32  m5 {k1}, m0
    mova [r0+2*64], m5
    vpermt2d   m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
    vpermd     m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
    vmovdqa64  m2 {k2}, m3
    mova [r0+3*64], m2
    RET

cglobal zigzag_scan_8x8_field, 2,2
    mova       m0, [scan_field_avx512]
    mova       m1, [r1+0*64]
    mova       m2, [r1+1*64]
    mova       m3, [r1+2*64]
    mova       m4, [r1+3*64]
    mov       r1d, 0x3f
    kmovb      k1, r1d
    psrld      m5, m0, 5
    vpermi2d   m0, m1, m2
    vmovdqa64  m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15
    vpermt2d   m1, m5, m2
    psrld      m5, 5
    vmovdqa64  m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31
    vpermt2d   m2, m5, m3
    psrld      m5, 5
    vpermt2d   m3, m5, m4
    mova [r0+0*64], m0
    mova [r0+1*64], m1
    mova [r0+2*64], m2
    mova [r0+3*64], m3
    RET

cglobal zigzag_interleave_8x8_cavlc, 3,3
    mova       m0, [cavlc_shuf_avx512]
    mova       m1, [r1+0*64]
    mova       m2, [r1+1*64]
    mova       m3, [r1+2*64]
    mova       m4, [r1+3*64]
    kxnorb     k1, k1, k1
    por        m7, m1, m2
    psrld      m5, m0, 5
    vpermi2d   m0, m1, m2        ; a0 a1 b0 b1
    vpternlogd m7, m3, m4, 0xfe  ; m1|m2|m3|m4
    psrld      m6, m5, 5
    vpermi2d   m5, m3, m4        ; b2 b3 a2 a3
    vptestmd   k0, m7, m7
    vpermt2d   m1, m6, m2        ; c0 c1 d0 d1
    psrld      m6, 5
    vpermt2d   m3, m6, m4        ; d2 d3 c2 c3
    vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3
    vmovdqa32  m5 {k1}, m0       ; a0 a1 a2 a3
    vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3
    vmovdqa32  m3 {k1}, m1       ; c0 c1 c2 c3
    mova [r0+0*64], m5
    mova [r0+1*64], m2
    mova [r0+2*64], m3
    mova [r0+3*64], m4
    kmovw     r1d, k0
    test      r1d, 0x1111
    setnz    [r2]
    test      r1d, 0x2222
    setnz    [r2+1]
    test      r1d, 0x4444
    setnz    [r2+8]
    test      r1d, 0x8888
    setnz    [r2+9]
    RET
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
    mova       m0, [scan_frame_avx512]
    vpermw     m0, m0, [r1]
    mova     [r0], m0
    RET

cglobal zigzag_scan_4x4_field, 2,2
    mova       m0, [r1]
    pshuflw  xmm1, [r1+4], q3102
    mova     [r0], m0
    movq  [r0+4], xmm1
    RET

INIT_ZMM avx512
cglobal zigzag_scan_8x8_frame, 2,2
    psrlw      m0, [scan_frame_avx512], 4
scan8_avx512:
    mova       m1, [r1]
    mova       m2, [r1+64]
    psrlw      m3, m0, 6
    vpermi2w   m0, m1, m2
    vpermt2w   m1, m3, m2
    mova     [r0], m0
    mova  [r0+64], m1
    RET

cglobal zigzag_scan_8x8_field, 2,2
    mova       m0, [scan_field_avx512]
    jmp scan8_avx512

cglobal zigzag_interleave_8x8_cavlc, 3,3
    mova       m0, [cavlc_shuf_avx512]
    mova       m1, [r1]
    mova       m2, [r1+64]
    psrlw      m3, m0, 6
    vpermi2w   m0, m1, m2
    vpermt2w   m1, m3, m2
    kxnorb     k2, k2, k2
    vptestmd   k0, m0, m0
    vptestmd   k1, m1, m1
    mova     [r0], m0
    mova  [r0+64], m1
    ktestw     k2, k0
    setnz    [r2]
    setnc    [r2+1]
    ktestw     k2, k1
    setnz    [r2+8]
    setnc    [r2+9]
    RET
%endif ; !HIGH_BIT_DEPTH