- ;*****************************************************************************
- ;* quant-a.asm: x86 quantization and level-run
- ;*****************************************************************************
- ;* Copyright (C) 2005-2018 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;* Fiona Glaser <fiona@x264.com>
- ;* Christian Heine <sennindemokrit@gmx.net>
- ;* Oskar Arvidsson <oskar@irock.se>
- ;* Henrik Gramner <henrik@gramner.com>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- ;*
- ;* This program is also available under a commercial proprietary license.
- ;* For more information, contact us at licensing@x264.com.
- ;*****************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA 64
- %if HIGH_BIT_DEPTH
- decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15
- %else
- dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30
- dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62
- %endif
- %macro DQM4 3
- dw %1, %2, %1, %2, %2, %3, %2, %3
- %endmacro
- %macro DQM8 6
- dw %1, %4, %5, %4, %1, %4, %5, %4
- dw %4, %2, %6, %2, %4, %2, %6, %2
- dw %5, %6, %3, %6, %5, %6, %3, %6
- dw %4, %2, %6, %2, %4, %2, %6, %2
- %endmacro
- dequant8_scale:
- DQM8 20, 18, 32, 19, 25, 24
- DQM8 22, 19, 35, 21, 28, 26
- DQM8 26, 23, 42, 24, 33, 31
- DQM8 28, 25, 45, 26, 35, 33
- DQM8 32, 28, 51, 30, 40, 38
- DQM8 36, 32, 58, 34, 46, 43
- dequant4_scale:
- DQM4 10, 13, 16
- DQM4 11, 14, 18
- DQM4 13, 16, 20
- DQM4 14, 18, 23
- DQM4 16, 20, 25
- DQM4 18, 23, 29
- decimate_mask_table4:
- db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
- db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
- db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
- db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
- db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
- db 6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
- db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
- db 9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
- db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24
- chroma_dc_dct_mask_mmx: dw 0, 0,-1,-1, 0, 0,-1,-1
- chroma_dc_dmf_mask_mmx: dw 0, 0,-1,-1, 0,-1,-1, 0
- chroma_dc_dct_mask: dw 1, 1,-1,-1, 1, 1,-1,-1
- chroma_dc_dmf_mask: dw 1, 1,-1,-1, 1,-1,-1, 1
- %if HIGH_BIT_DEPTH==0
- dct_coef_shuffle:
- %macro DCT_COEF_SHUFFLE 8
- %assign y x
- %rep 8
- %rep 7
- %rotate (~(y>>7))&1
- %assign y y<<((~(y>>7))&1)
- %endrep
- db %1*2
- %rotate 1
- %assign y y<<1
- %endrep
- %endmacro
- %assign x 0
- %rep 256
- DCT_COEF_SHUFFLE 7, 6, 5, 4, 3, 2, 1, 0
- %assign x x+1
- %endrep
- %endif
- SECTION .text
- cextern pb_1
- cextern pw_1
- cextern pw_2
- cextern pw_256
- cextern pd_1
- cextern pb_01
- cextern pd_1024
- cextern deinterleave_shufd
- cextern popcnt_table
- %macro QUANT_DC_START 2
- movd xm%1, r1m ; mf
- movd xm%2, r2m ; bias
- %if cpuflag(avx2)
- vpbroadcastdct m%1, xm%1
- vpbroadcastdct m%2, xm%2
- %elif HIGH_BIT_DEPTH
- SPLATD m%1, m%1
- SPLATD m%2, m%2
- %elif cpuflag(sse4) ; ssse3, but not faster on conroe
- mova m5, [pb_01]
- pshufb m%1, m5
- pshufb m%2, m5
- %else
- SPLATW m%1, m%1
- SPLATW m%2, m%2
- %endif
- %endmacro
- %macro QUANT_END 0
- xor eax, eax
- %if cpuflag(sse4)
- ptest m5, m5
- %else ; !sse4
- %if ARCH_X86_64
- %if mmsize == 16
- packsswb m5, m5
- %endif
- movq rcx, m5
- test rcx, rcx
- %else
- %if mmsize == 16
- pxor m4, m4
- pcmpeqb m5, m4
- pmovmskb ecx, m5
- cmp ecx, (1<<mmsize)-1
- %else
- packsswb m5, m5
- movd ecx, m5
- test ecx, ecx
- %endif
- %endif
- %endif ; cpuflag
- setne al
- %endmacro
- %if HIGH_BIT_DEPTH
- %macro QUANT_ONE_DC 4
- %if cpuflag(sse4)
- mova m0, [%1]
- ABSD m1, m0
- paddd m1, %3
- pmulld m1, %2
- psrad m1, 16
- %else ; !sse4
- mova m0, [%1]
- ABSD m1, m0
- paddd m1, %3
- mova m2, m1
- psrlq m2, 32
- pmuludq m1, %2
- pmuludq m2, %2
- psllq m2, 32
- paddd m1, m2
- psrld m1, 16
- %endif ; cpuflag
- PSIGND m1, m0
- mova [%1], m1
- ACCUM por, 5, 1, %4
- %endmacro
- %macro QUANT_TWO_DC 4
- %if cpuflag(sse4)
- mova m0, [%1 ]
- mova m1, [%1+mmsize]
- ABSD m2, m0
- ABSD m3, m1
- paddd m2, %3
- paddd m3, %3
- pmulld m2, %2
- pmulld m3, %2
- psrad m2, 16
- psrad m3, 16
- PSIGND m2, m0
- PSIGND m3, m1
- mova [%1 ], m2
- mova [%1+mmsize], m3
- ACCUM por, 5, 2, %4
- por m5, m3
- %else ; !sse4
- QUANT_ONE_DC %1, %2, %3, %4
- QUANT_ONE_DC %1+mmsize, %2, %3, %4+mmsize
- %endif ; cpuflag
- %endmacro
- %macro QUANT_ONE_AC_MMX 5
- mova m0, [%1]
- mova m2, [%2]
- ABSD m1, m0
- mova m4, m2
- paddd m1, [%3]
- mova m3, m1
- psrlq m4, 32
- psrlq m3, 32
- pmuludq m1, m2
- pmuludq m3, m4
- psllq m3, 32
- paddd m1, m3
- psrad m1, 16
- PSIGND m1, m0
- mova [%1], m1
- ACCUM por, %5, 1, %4
- %endmacro
- %macro QUANT_TWO_AC 5
- %if cpuflag(sse4)
- mova m0, [%1 ]
- mova m1, [%1+mmsize]
- ABSD m2, m0
- ABSD m3, m1
- paddd m2, [%3 ]
- paddd m3, [%3+mmsize]
- pmulld m2, [%2 ]
- pmulld m3, [%2+mmsize]
- psrad m2, 16
- psrad m3, 16
- PSIGND m2, m0
- PSIGND m3, m1
- mova [%1 ], m2
- mova [%1+mmsize], m3
- ACCUM por, %5, 2, %4
- por m%5, m3
- %else ; !sse4
- QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
- QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
- %endif ; cpuflag
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int quant_2x2_dc( int32_t dct[M*N], int mf, int bias )
- ;-----------------------------------------------------------------------------
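- ; A hedged C-level sketch of what the DC quant macros compute (illustrative
- ; only, not copied from the C reference in quant.c): for each coefficient,
- ;
- ;     if( dct[i] > 0 ) dct[i] =  (( dct[i] + bias) * mf) >> 16;
- ;     else             dct[i] = -((-dct[i] + bias) * mf) >> 16;
- ;     nz |= dct[i];
- ;
- ; with a single broadcast mf/bias pair, and QUANT_END turning the accumulated
- ; nonzero mask into the int return value (0 or 1).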
- %macro QUANT_DC 2
- cglobal quant_%1x%2_dc, 3,3,8
- QUANT_DC_START 6,7
- %if %1*%2 <= mmsize/4
- QUANT_ONE_DC r0, m6, m7, 0
- %else
- %assign x 0
- %rep %1*%2/(mmsize/2)
- QUANT_TWO_DC r0+x, m6, m7, x
- %assign x x+mmsize*2
- %endrep
- %endif
- QUANT_END
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
- ;-----------------------------------------------------------------------------
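- ; Same hedged sketch as the DC case above, except mf[] and bias[] are
- ; per-position arrays, so each step loads its own mf/bias vectors from memory:
- ;
- ;     if( dct[i] > 0 ) dct[i] =  (( dct[i] + bias[i]) * mf[i]) >> 16;
- ;     else             dct[i] = -((-dct[i] + bias[i]) * mf[i]) >> 16;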
- %macro QUANT_AC 2
- cglobal quant_%1x%2, 3,3,8
- %assign x 0
- %rep %1*%2/(mmsize/2)
- QUANT_TWO_AC r0+x, r1+x, r2+x, x, 5
- %assign x x+mmsize*2
- %endrep
- QUANT_END
- RET
- %endmacro
- %macro QUANT_4x4 2
- QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
- QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
- %endmacro
- %macro QUANT_4x4x4 0
- cglobal quant_4x4x4, 3,3,8
- QUANT_4x4 0, 5
- QUANT_4x4 64, 6
- add r0, 128
- packssdw m5, m6
- QUANT_4x4 0, 6
- QUANT_4x4 64, 7
- packssdw m6, m7
- packssdw m5, m6 ; AAAA BBBB CCCC DDDD
- pxor m4, m4
- pcmpeqd m5, m4
- movmskps eax, m5
- xor eax, 0xf
- RET
- %endmacro
- INIT_XMM sse2
- QUANT_DC 2, 2
- QUANT_DC 4, 4
- QUANT_AC 4, 4
- QUANT_AC 8, 8
- QUANT_4x4x4
- INIT_XMM ssse3
- QUANT_DC 2, 2
- QUANT_DC 4, 4
- QUANT_AC 4, 4
- QUANT_AC 8, 8
- QUANT_4x4x4
- INIT_XMM sse4
- QUANT_DC 2, 2
- QUANT_DC 4, 4
- QUANT_AC 4, 4
- QUANT_AC 8, 8
- QUANT_4x4x4
- INIT_YMM avx2
- QUANT_DC 4, 4
- QUANT_AC 4, 4
- QUANT_AC 8, 8
- INIT_YMM avx2
- cglobal quant_4x4x4, 3,3,6
- QUANT_TWO_AC r0, r1, r2, 0, 4
- QUANT_TWO_AC r0+64, r1, r2, 0, 5
- add r0, 128
- packssdw m4, m5
- QUANT_TWO_AC r0, r1, r2, 0, 5
- QUANT_TWO_AC r0+64, r1, r2, 0, 1
- packssdw m5, m1
- packssdw m4, m5
- pxor m3, m3
- pcmpeqd m4, m3
- movmskps eax, m4
- mov edx, eax
- shr eax, 4
- and eax, edx
- xor eax, 0xf
- RET
- %endif ; HIGH_BIT_DEPTH
- %if HIGH_BIT_DEPTH == 0
- %macro QUANT_ONE 5
- ;;; %1 (m64) dct[y][x]
- ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
- ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
- mova m1, %1 ; load dct coeffs
- ABSW m0, m1, sign
- paddusw m0, %3 ; round
- pmulhuw m0, %2 ; divide
- PSIGNW m0, m1 ; restore sign
- mova %1, m0 ; store
- ACCUM por, %5, 0, %4
- %endmacro
- %macro QUANT_TWO 8
- mova m1, %1
- mova m3, %2
- ABSW m0, m1, sign
- ABSW m2, m3, sign
- paddusw m0, %5
- paddusw m2, %6
- pmulhuw m0, %3
- pmulhuw m2, %4
- PSIGNW m0, m1
- PSIGNW m2, m3
- mova %1, m0
- mova %2, m2
- ACCUM por, %8, 0, %7
- ACCUM por, %8, 2, %7+mmsize
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int quant_4x4_dc( int16_t dct[16], int mf, int bias )
- ;-----------------------------------------------------------------------------
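- ; In the 8-bit-depth build dctcoef is int16_t, so the same quantization runs on
- ; words: paddusw forms abs(dct)+bias with unsigned saturation and pmulhuw keeps
- ; the high 16 bits of the unsigned product, i.e. roughly (hedged sketch)
- ;
- ;     dct[i] = sign(dct[i]) * (uint16_t)(((abs(dct[i]) + bias) * mf) >> 16);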
- %macro QUANT_DC 2-3 0
- cglobal %1, 1,1,%3
- %if %2==1
- QUANT_DC_START 2,3
- QUANT_ONE [r0], m2, m3, 0, 5
- %else
- QUANT_DC_START 4,6
- %assign x 0
- %rep %2/2
- QUANT_TWO [r0+x], [r0+x+mmsize], m4, m4, m6, m6, x, 5
- %assign x x+mmsize*2
- %endrep
- %endif
- QUANT_END
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
- ;-----------------------------------------------------------------------------
- %macro QUANT_AC 2
- cglobal %1, 3,3
- %if %2==1
- QUANT_ONE [r0], [r1], [r2], 0, 5
- %else
- %assign x 0
- %rep %2/2
- QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x, 5
- %assign x x+mmsize*2
- %endrep
- %endif
- QUANT_END
- RET
- %endmacro
- %macro QUANT_4x4 2
- %if UNIX64
- QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], m8, m9, m10, m11, mmsize*0, %2
- %else
- QUANT_TWO [r0+%1+mmsize*0], [r0+%1+mmsize*1], [r1+mmsize*0], [r1+mmsize*1], [r2+mmsize*0], [r2+mmsize*1], mmsize*0, %2
- %if mmsize==8
- QUANT_TWO [r0+%1+mmsize*2], [r0+%1+mmsize*3], [r1+mmsize*2], [r1+mmsize*3], [r2+mmsize*2], [r2+mmsize*3], mmsize*2, %2
- %endif
- %endif
- %endmacro
- %macro QUANT_4x4x4 0
- cglobal quant_4x4x4, 3,3,7
- %if UNIX64
- mova m8, [r1+mmsize*0]
- mova m9, [r1+mmsize*1]
- mova m10, [r2+mmsize*0]
- mova m11, [r2+mmsize*1]
- %endif
- QUANT_4x4 0, 4
- QUANT_4x4 32, 5
- packssdw m4, m5
- QUANT_4x4 64, 5
- QUANT_4x4 96, 6
- packssdw m5, m6
- packssdw m4, m5 ; AAAA BBBB CCCC DDDD
- pxor m3, m3
- pcmpeqd m4, m3
- movmskps eax, m4
- xor eax, 0xf
- RET
- %endmacro
- INIT_MMX mmx2
- QUANT_DC quant_2x2_dc, 1
- %if ARCH_X86_64 == 0 ; not needed because sse2 is faster
- QUANT_DC quant_4x4_dc, 4
- INIT_MMX mmx2
- QUANT_AC quant_4x4, 4
- QUANT_AC quant_8x8, 16
- %endif
- INIT_XMM sse2
- QUANT_DC quant_4x4_dc, 2, 7
- QUANT_AC quant_4x4, 2
- QUANT_AC quant_8x8, 8
- QUANT_4x4x4
- INIT_XMM ssse3
- QUANT_DC quant_4x4_dc, 2, 7
- QUANT_AC quant_4x4, 2
- QUANT_AC quant_8x8, 8
- QUANT_4x4x4
- INIT_MMX ssse3
- QUANT_DC quant_2x2_dc, 1
- INIT_XMM sse4
- ; Not faster on Conroe, so only used in SSE4 versions
- QUANT_DC quant_4x4_dc, 2, 7
- QUANT_AC quant_4x4, 2
- QUANT_AC quant_8x8, 8
- INIT_YMM avx2
- QUANT_AC quant_4x4, 1
- QUANT_AC quant_8x8, 4
- QUANT_DC quant_4x4_dc, 1, 6
- INIT_YMM avx2
- cglobal quant_4x4x4, 3,3,6
- mova m2, [r1]
- mova m3, [r2]
- QUANT_ONE [r0+ 0], m2, m3, 0, 4
- QUANT_ONE [r0+32], m2, m3, 0, 5
- packssdw m4, m5
- QUANT_ONE [r0+64], m2, m3, 0, 5
- QUANT_ONE [r0+96], m2, m3, 0, 1
- packssdw m5, m1
- packssdw m4, m5
- pxor m3, m3
- pcmpeqd m4, m3
- movmskps eax, m4
- mov edx, eax
- shr eax, 4
- and eax, edx
- xor eax, 0xf
- RET
- %endif ; !HIGH_BIT_DEPTH
- ;=============================================================================
- ; dequant
- ;=============================================================================
- %macro DEQUANT16_L 4
- ;;; %1 dct[y][x]
- ;;; %2,%3 dequant_mf[i_mf][y][x]
- ;;; m2 i_qbits
- %if HIGH_BIT_DEPTH
- mova m0, %1
- mova m1, %4
- pmaddwd m0, %2
- pmaddwd m1, %3
- pslld m0, xm2
- pslld m1, xm2
- mova %1, m0
- mova %4, m1
- %else
- mova m0, %2
- packssdw m0, %3
- %if mmsize==32
- vpermq m0, m0, q3120
- %endif
- pmullw m0, %1
- psllw m0, xm2
- mova %1, m0
- %endif
- %endmacro
- %macro DEQUANT32_R 4
- ;;; %1 dct[y][x]
- ;;; %2,%3 dequant_mf[i_mf][y][x]
- ;;; m2 -i_qbits
- ;;; m3 f
- ;;; m4 0
- %if HIGH_BIT_DEPTH
- mova m0, %1
- mova m1, %4
- pmadcswd m0, m0, %2, m3
- pmadcswd m1, m1, %3, m3
- psrad m0, xm2
- psrad m1, xm2
- mova %1, m0
- mova %4, m1
- %else
- %if mmsize == 32
- pmovzxwd m0, %1
- pmovzxwd m1, %4
- %else
- mova m0, %1
- punpckhwd m1, m0, m4
- punpcklwd m0, m4
- %endif
- pmadcswd m0, m0, %2, m3
- pmadcswd m1, m1, %3, m3
- psrad m0, xm2
- psrad m1, xm2
- packssdw m0, m1
- %if mmsize == 32
- vpermq m0, m0, q3120
- %endif
- mova %1, m0
- %endif
- %endmacro
- %macro DEQUANT_LOOP 3
- %if 8*(%2-2*%3) > 0
- mov t0d, 8*(%2-2*%3)
- %%loop:
- %1 [r0+(t0 )*SIZEOF_PIXEL], [r1+t0*2 ], [r1+t0*2+ 8*%3], [r0+(t0+ 4*%3)*SIZEOF_PIXEL]
- %1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3], [r0+(t0+12*%3)*SIZEOF_PIXEL]
- sub t0d, 16*%3
- jge %%loop
- RET
- %else
- %if mmsize < 32
- %1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3], [r0+(12*%3)*SIZEOF_PIXEL]
- %endif
- %1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3], [r0+( 4*%3)*SIZEOF_PIXEL]
- RET
- %endif
- %endmacro
- %macro DEQUANT16_FLAT 2-5
- mova m0, %1
- psllw m0, m4
- %assign i %0-2
- %rep %0-1
- %if i
- mova m %+ i, [r0+%2]
- pmullw m %+ i, m0
- %else
- pmullw m0, [r0+%2]
- %endif
- mova [r0+%2], m %+ i
- %assign i i-1
- %rotate 1
- %endrep
- %endmacro
- %if ARCH_X86_64
- DECLARE_REG_TMP 6,3,2
- %else
- DECLARE_REG_TMP 2,0,1
- %endif
- %macro DEQUANT_START 2
- movifnidn t2d, r2m
- imul t0d, t2d, 0x2b
- shr t0d, 8 ; i_qbits = i_qp / 6
- lea t1d, [t0*5]
- sub t2d, t0d
- sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %1
- %if ARCH_X86_64
- add r1, t2 ; dequant_mf[i_mf]
- %else
- add r1, r1mp ; dequant_mf[i_mf]
- mov r0, r0mp ; dct
- %endif
- sub t0d, %2
- jl .rshift32 ; negative qbits => rightshift
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
- ;-----------------------------------------------------------------------------
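- ; Hedged C-level sketch of DEQUANT_START plus the two code paths below
- ; (illustrative; see the C reference for the exact version):
- ;
- ;     int i_mf    = i_qp % 6;              /* selects dequant_mf[i_mf]       */
- ;     int i_qbits = i_qp / 6 - shift;      /* shift = 4 for 4x4, 6 for 8x8   */
- ;     if( i_qbits >= 0 )                   /* .lshift                        */
- ;         dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits;
- ;     else                                 /* .rshift32                      */
- ;         dct[i] = (dct[i] * dequant_mf[i_mf][i] + (1 << (-i_qbits-1))) >> -i_qbits;
- ;
- ; i_qp/6 itself is done branchlessly as (i_qp*0x2b)>>8, which is exact for the
- ; QP range used here.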
- %macro DEQUANT 3
- cglobal dequant_%1x%1, 0,3,6
- .skip_prologue:
- DEQUANT_START %2+2, %2
- .lshift:
- movd xm2, t0d
- DEQUANT_LOOP DEQUANT16_L, %1*%1/4, %3
- .rshift32:
- neg t0d
- mova m3, [pd_1]
- movd xm2, t0d
- pslld m3, xm2
- pxor m4, m4
- psrld m3, 1
- DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
- %if HIGH_BIT_DEPTH == 0 && (notcpuflag(avx) || mmsize == 32)
- cglobal dequant_%1x%1_flat16, 0,3
- movifnidn t2d, r2m
- %if %1 == 8
- cmp t2d, 12
- jl dequant_%1x%1 %+ SUFFIX %+ .skip_prologue
- sub t2d, 12
- %endif
- imul t0d, t2d, 0x2b
- shr t0d, 8 ; i_qbits = i_qp / 6
- lea t1d, [t0*5]
- sub t2d, t0d
- sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %2
- %ifdef PIC
- lea r1, [dequant%1_scale]
- add r1, t2
- %else
- lea r1, [dequant%1_scale + t2]
- %endif
- movifnidn r0, r0mp
- movd xm4, t0d
- %if %1 == 4
- %if mmsize == 8
- DEQUANT16_FLAT [r1], 0, 16
- DEQUANT16_FLAT [r1+8], 8, 24
- %elif mmsize == 16
- DEQUANT16_FLAT [r1], 0, 16
- %else
- vbroadcasti128 m0, [r1]
- psllw m0, xm4
- pmullw m0, [r0]
- mova [r0], m0
- %endif
- %elif mmsize == 8
- DEQUANT16_FLAT [r1], 0, 8, 64, 72
- DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
- DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
- DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
- %elif mmsize == 16
- DEQUANT16_FLAT [r1], 0, 64
- DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
- DEQUANT16_FLAT [r1+32], 32, 96
- %else
- mova m1, [r1+ 0]
- mova m2, [r1+32]
- psllw m1, xm4
- psllw m2, xm4
- pmullw m0, m1, [r0+ 0]
- pmullw m3, m2, [r0+32]
- pmullw m4, m1, [r0+64]
- pmullw m5, m2, [r0+96]
- mova [r0+ 0], m0
- mova [r0+32], m3
- mova [r0+64], m4
- mova [r0+96], m5
- %endif
- RET
- %endif ; !HIGH_BIT_DEPTH && !AVX
- %endmacro ; DEQUANT
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- DEQUANT 4, 4, 2
- DEQUANT 8, 6, 2
- INIT_XMM xop
- DEQUANT 4, 4, 2
- DEQUANT 8, 6, 2
- INIT_YMM avx2
- DEQUANT 4, 4, 4
- DEQUANT 8, 6, 4
- %else
- %if ARCH_X86_64 == 0
- INIT_MMX mmx
- DEQUANT 4, 4, 1
- DEQUANT 8, 6, 1
- %endif
- INIT_XMM sse2
- DEQUANT 4, 4, 2
- DEQUANT 8, 6, 2
- INIT_XMM avx
- DEQUANT 4, 4, 2
- DEQUANT 8, 6, 2
- INIT_XMM xop
- DEQUANT 4, 4, 2
- DEQUANT 8, 6, 2
- INIT_YMM avx2
- DEQUANT 4, 4, 4
- DEQUANT 8, 6, 4
- %endif
- %macro DEQUANT_START_AVX512 1-2 0 ; shift, flat
- %if %2 == 0
- movifnidn t2d, r2m
- %endif
- imul t0d, t2d, 0x2b
- shr t0d, 8 ; i_qbits = i_qp / 6
- lea t1d, [t0*5]
- sub t2d, t0d
- sub t2d, t1d ; i_mf = i_qp % 6
- shl t2d, %1
- %if %2
- %ifdef PIC
- %define dmf r1+t2
- lea r1, [dequant8_scale]
- %else
- %define dmf t2+dequant8_scale
- %endif
- %elif ARCH_X86_64
- %define dmf r1+t2
- %else
- %define dmf r1
- add r1, r1mp ; dequant_mf[i_mf]
- %endif
- movifnidn r0, r0mp
- %endmacro
- INIT_ZMM avx512
- cglobal dequant_4x4, 0,3
- DEQUANT_START_AVX512 6
- mova m0, [dmf]
- %if HIGH_BIT_DEPTH
- pmaddwd m0, [r0]
- %endif
- sub t0d, 4
- jl .rshift
- %if HIGH_BIT_DEPTH
- vpbroadcastd m1, t0d
- vpsllvd m0, m1
- mova [r0], m0
- %else
- vpbroadcastw ym1, t0d
- vpmovsdw ym0, m0
- pmullw ym0, [r0]
- vpsllvw ym0, ym1
- mova [r0], ym0
- %endif
- RET
- .rshift:
- %if HIGH_BIT_DEPTH == 0
- pmovzxwd m1, [r0]
- pmaddwd m0, m1
- %endif
- mov r1d, 1<<31
- shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
- neg t0d
- vpbroadcastd m1, r1d
- vpbroadcastd m2, t0d
- paddd m0, m1
- vpsravd m0, m2
- %if HIGH_BIT_DEPTH
- mova [r0], m0
- %else
- vpmovsdw [r0], m0
- %endif
- RET
- cglobal dequant_8x8, 0,3
- DEQUANT_START_AVX512 8
- mova m0, [dmf+0*64]
- mova m1, [dmf+1*64]
- mova m2, [dmf+2*64]
- mova m3, [dmf+3*64]
- %if HIGH_BIT_DEPTH
- pmaddwd m0, [r0+0*64]
- pmaddwd m1, [r0+1*64]
- pmaddwd m2, [r0+2*64]
- pmaddwd m3, [r0+3*64]
- %else
- mova m6, [dequant_shuf_avx512]
- %endif
- sub t0d, 6
- jl .rshift
- %if HIGH_BIT_DEPTH
- vpbroadcastd m4, t0d
- vpsllvd m0, m4
- vpsllvd m1, m4
- vpsllvd m2, m4
- vpsllvd m3, m4
- jmp .end
- .rshift:
- %else
- vpbroadcastw m4, t0d
- vpermt2w m0, m6, m1
- vpermt2w m2, m6, m3
- pmullw m0, [r0]
- pmullw m2, [r0+64]
- vpsllvw m0, m4
- vpsllvw m2, m4
- mova [r0], m0
- mova [r0+64], m2
- RET
- .rshift:
- pmovzxwd m4, [r0+0*32]
- pmovzxwd m5, [r0+1*32]
- pmaddwd m0, m4
- pmaddwd m1, m5
- pmovzxwd m4, [r0+2*32]
- pmovzxwd m5, [r0+3*32]
- pmaddwd m2, m4
- pmaddwd m3, m5
- %endif
- mov r1d, 1<<31
- shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
- neg t0d
- vpbroadcastd m4, r1d
- vpbroadcastd m5, t0d
- paddd m0, m4
- paddd m1, m4
- vpsravd m0, m5
- vpsravd m1, m5
- paddd m2, m4
- paddd m3, m4
- vpsravd m2, m5
- vpsravd m3, m5
- %if HIGH_BIT_DEPTH
- .end:
- mova [r0+0*64], m0
- mova [r0+1*64], m1
- mova [r0+2*64], m2
- mova [r0+3*64], m3
- %else
- vpermt2w m0, m6, m1
- vpermt2w m2, m6, m3
- mova [r0], m0
- mova [r0+64], m2
- %endif
- RET
- %if HIGH_BIT_DEPTH == 0
- cglobal dequant_8x8_flat16, 0,3
- movifnidn t2d, r2m
- cmp t2d, 12
- jl dequant_8x8_avx512
- sub t2d, 12
- DEQUANT_START_AVX512 6, 1
- vpbroadcastw m0, t0d
- mova m1, [dmf]
- vpsllvw m1, m0
- pmullw m0, m1, [r0]
- pmullw m1, [r0+64]
- mova [r0], m0
- mova [r0+64], m1
- RET
- %endif
- %undef dmf
- %macro DEQUANT_DC 2
- cglobal dequant_4x4dc, 0,3,6
- DEQUANT_START 6, 6
- .lshift:
- %if cpuflag(avx2)
- vpbroadcastdct m3, [r1]
- %else
- movd xm3, [r1]
- SPLAT%1 m3, xm3
- %endif
- movd xm2, t0d
- pslld m3, xm2
- %assign %%x 0
- %rep SIZEOF_PIXEL*32/mmsize
- %2 m0, m3, [r0+%%x]
- mova [r0+%%x], m0
- %assign %%x %%x+mmsize
- %endrep
- RET
- .rshift32:
- neg t0d
- %if cpuflag(avx2)
- vpbroadcastdct m2, [r1]
- %else
- movd xm2, [r1]
- %endif
- mova m5, [p%1_1]
- movd xm3, t0d
- pslld m4, m5, xm3
- psrld m4, 1
- %if HIGH_BIT_DEPTH
- %if notcpuflag(avx2)
- pshufd m2, m2, 0
- %endif
- %assign %%x 0
- %rep SIZEOF_PIXEL*32/mmsize
- pmadcswd m0, m2, [r0+%%x], m4
- psrad m0, xm3
- mova [r0+%%x], m0
- %assign %%x %%x+mmsize
- %endrep
- %else ; !HIGH_BIT_DEPTH
- %if notcpuflag(avx2)
- PSHUFLW m2, m2, 0
- %endif
- punpcklwd m2, m4
- %assign %%x 0
- %rep SIZEOF_PIXEL*32/mmsize
- mova m0, [r0+%%x]
- punpckhwd m1, m0, m5
- punpcklwd m0, m5
- pmaddwd m0, m2
- pmaddwd m1, m2
- psrad m0, xm3
- psrad m1, xm3
- packssdw m0, m1
- mova [r0+%%x], m0
- %assign %%x %%x+mmsize
- %endrep
- %endif ; !HIGH_BIT_DEPTH
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- DEQUANT_DC d, pmaddwd
- INIT_XMM xop
- DEQUANT_DC d, pmaddwd
- INIT_YMM avx2
- DEQUANT_DC d, pmaddwd
- %else
- %if ARCH_X86_64 == 0
- INIT_MMX mmx2
- DEQUANT_DC w, pmullw
- %endif
- INIT_XMM sse2
- DEQUANT_DC w, pmullw
- INIT_XMM avx
- DEQUANT_DC w, pmullw
- INIT_YMM avx2
- DEQUANT_DC w, pmullw
- %endif
- %macro PEXTRW 4
- %if cpuflag(sse4)
- pextrw %1, %2, %3
- %else
- ; pextrw with a memory destination requires SSE4.1, go through a GPR as a fallback
- %if %3
- pextrw %4d, %2, %3
- %else
- movd %4d, %2
- %endif
- mov %1, %4w
- %endif
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
- ; void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
- ;-----------------------------------------------------------------------------
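- ; Rough outline of the routines below (hedged; consult the C reference for the
- ; exact butterfly order): the eight chroma DC values go through sum/difference
- ; (Hadamard-style) butterflies, each result is dequantized with the DC entry of
- ; dequant_mf,
- ;
- ;     i_qbits = i_qp/6 - 6;
- ;     out = i_qbits >= 0 ? (sum * dmf) << i_qbits
- ;                        : (sum * dmf + (1 << (-i_qbits-1))) >> -i_qbits;
- ;
- ; and the "dc" variant scatters the results into the DC slot of each of the
- ; eight 4x4 blocks in dct4x4[][], while "dconly" writes them back to dct[].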
- %macro DEQUANT_2x4_DC 1
- %ifidn %1, dconly
- DECLARE_REG_TMP 6,3,2
- %define %%args dct, dmf, qp
- %else
- DECLARE_REG_TMP 6,4,3
- %define %%args dct, dct4x4, dmf, qp
- %endif
- %if ARCH_X86_64 == 0
- DECLARE_REG_TMP 2,0,1
- %endif
- cglobal idct_dequant_2x4_%1, 0,3,5, %%args
- movifnidn t2d, qpm
- imul t0d, t2d, 0x2b
- shr t0d, 8 ; qp / 6
- lea t1d, [t0*5]
- sub t2d, t0d
- sub t2d, t1d ; qp % 6
- shl t2d, 6 ; 16 * sizeof(int)
- %if ARCH_X86_64
- imul t2d, [dmfq+t2], -0xffff ; (-dmf) << 16 | dmf
- %else
- mov dctq, dctmp
- add t2, dmfmp
- imul t2d, [t2], -0xffff
- %endif
- %if HIGH_BIT_DEPTH
- mova m0, [dctq]
- mova m1, [dctq+16]
- SUMSUB_BA d, 1, 0, 2 ; 16-bit intermediate precision is enough for the first two sumsub steps,
- packssdw m1, m0 ; and by packing to words we can use pmaddwd instead of pmulld later.
- %else
- movq m0, [dctq]
- movq m1, [dctq+8]
- SUMSUB_BA w, 1, 0, 2
- punpcklqdq m1, m0 ; a0 a1 a2 a3 a4 a5 a6 a7
- %endif
- pshufd m0, m1, q2301 ; a2 a3 a0 a1 a6 a7 a4 a5
- movd m3, t2d
- pshuflw m3, m3, q1000 ; + + + -
- SUMSUB_BA w, 0, 1, 2
- punpcklqdq m3, m3 ; + + + - + + + -
- pshufd m1, m1, q0022
- sub t0d, 6
- jl .rshift
- movd m2, t0d
- psllw m3, m2
- pmaddwd m0, m3
- pmaddwd m1, m3
- jmp .end
- .rshift:
- neg t0d
- movd m2, t0d
- pcmpeqd m4, m4
- pmaddwd m0, m3
- pmaddwd m1, m3
- pslld m4, m2
- psrad m4, 1
- psubd m0, m4 ; + 1 << (qp/6-1)
- psubd m1, m4
- psrad m0, m2
- psrad m1, m2
- .end:
- %ifidn %1, dconly
- %if HIGH_BIT_DEPTH
- mova [dctq], m0
- mova [dctq+16], m1
- %else
- packssdw m0, m1
- mova [dctq], m0
- %endif
- %else
- movifnidn dct4x4q, dct4x4mp
- %if HIGH_BIT_DEPTH
- movd [dct4x4q+0*64], m0
- %if cpuflag(sse4)
- pextrd [dct4x4q+1*64], m0, 1
- add dct4x4q, 4*64
- pextrd [dct4x4q-2*64], m0, 2
- pextrd [dct4x4q-1*64], m0, 3
- movd [dct4x4q+0*64], m1
- pextrd [dct4x4q+1*64], m1, 1
- pextrd [dct4x4q+2*64], m1, 2
- pextrd [dct4x4q+3*64], m1, 3
- %else
- MOVHL m2, m0
- psrlq m0, 32
- movd [dct4x4q+1*64], m0
- add dct4x4q, 4*64
- movd [dct4x4q-2*64], m2
- psrlq m2, 32
- movd [dct4x4q-1*64], m2
- movd [dct4x4q+0*64], m1
- MOVHL m2, m1
- psrlq m1, 32
- movd [dct4x4q+1*64], m1
- movd [dct4x4q+2*64], m2
- psrlq m2, 32
- movd [dct4x4q+3*64], m2
- %endif
- %else
- PEXTRW [dct4x4q+0*32], m0, 0, eax
- PEXTRW [dct4x4q+1*32], m0, 2, eax
- PEXTRW [dct4x4q+2*32], m0, 4, eax
- PEXTRW [dct4x4q+3*32], m0, 6, eax
- add dct4x4q, 4*32
- PEXTRW [dct4x4q+0*32], m1, 0, eax
- PEXTRW [dct4x4q+1*32], m1, 2, eax
- PEXTRW [dct4x4q+2*32], m1, 4, eax
- PEXTRW [dct4x4q+3*32], m1, 6, eax
- %endif
- %endif
- RET
- %endmacro
- ; sse4 reduces code size compared to sse2 but isn't any faster, so just go with sse2+avx
- INIT_XMM sse2
- DEQUANT_2x4_DC dc
- DEQUANT_2x4_DC dconly
- INIT_XMM avx
- DEQUANT_2x4_DC dc
- DEQUANT_2x4_DC dconly
- ; t4 is eax for return value.
- %if ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
- %else
- DECLARE_REG_TMP 4,1,2,3,0,5
- %endif
- ;-----------------------------------------------------------------------------
- ; int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
- ;-----------------------------------------------------------------------------
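- ; Conceptual summary (a hedged reading of the code, not the C reference
- ; verbatim): the four chroma DC coefficients are dequantized and run through
- ; the 2x2 inverse transform at extra precision.  If every reconstructed value
- ; already rounds to zero, return 0 so the caller can drop the block.  Otherwise
- ; walk the coefficients from last to first, shrinking each one toward zero a
- ; step at a time for as long as the rounded reconstruction stays unchanged, and
- ; return 1 if any coefficient has to stay nonzero.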
- %macro OPTIMIZE_CHROMA_2x2_DC 0
- cglobal optimize_chroma_2x2_dc, 0,6-cpuflag(sse4),7
- movifnidn t0, r0mp
- movd m2, r1m
- movq m1, [t0]
- %if cpuflag(sse4)
- pcmpeqb m4, m4
- pslld m4, 11
- %else
- pxor m4, m4
- %endif
- %if cpuflag(ssse3)
- mova m3, [chroma_dc_dct_mask]
- mova m5, [chroma_dc_dmf_mask]
- %else
- mova m3, [chroma_dc_dct_mask_mmx]
- mova m5, [chroma_dc_dmf_mask_mmx]
- %endif
- pshuflw m2, m2, 0
- pshufd m0, m1, q0101 ; 1 0 3 2 1 0 3 2
- punpcklqdq m2, m2
- punpcklqdq m1, m1 ; 3 2 1 0 3 2 1 0
- mova m6, [pd_1024] ; 32<<5, elements are shifted 5 bits to the left
- PSIGNW m0, m3 ; -1 -0 3 2 -1 -0 3 2
- PSIGNW m2, m5 ; + - - + - - + +
- paddw m0, m1 ; -1+3 -0+2 1+3 0+2 -1+3 -0+2 1+3 0+2
- pmaddwd m0, m2 ; 0-1-2+3 0-1+2-3 0+1-2-3 0+1+2+3 * dmf
- punpcklwd m1, m1
- psrad m2, 16 ; + - - +
- mov t1d, 3
- paddd m0, m6
- xor t4d, t4d
- %if notcpuflag(ssse3)
- psrad m1, 31 ; has to be 0 or -1 in order for PSIGND_MMX to work correctly
- %endif
- %if cpuflag(sse4)
- ptest m0, m4
- %else
- mova m6, m0
- SWAP 0, 6
- psrad m6, 11
- pcmpeqd m6, m4
- pmovmskb t5d, m6
- cmp t5d, 0xffff
- %endif
- jz .ret ; if the DC coefficients already round to zero, terminate early
- mova m3, m0
- .outer_loop:
- movsx t3d, word [t0+2*t1] ; dct[coeff]
- pshufd m6, m1, q3333
- pshufd m1, m1, q2100 ; move the next element to high dword
- PSIGND m5, m2, m6
- test t3d, t3d
- jz .loop_end
- .outer_loop_0:
- mov t2d, t3d
- sar t3d, 31
- or t3d, 1
- .inner_loop:
- psubd m3, m5 ; coeff -= sign
- pxor m6, m0, m3
- %if cpuflag(sse4)
- ptest m6, m4
- %else
- psrad m6, 11
- pcmpeqd m6, m4
- pmovmskb t5d, m6
- cmp t5d, 0xffff
- %endif
- jz .round_coeff
- paddd m3, m5 ; coeff += sign
- mov t4d, 1
- .loop_end:
- dec t1d
- jz .last_coeff
- pshufd m2, m2, q1320 ; - + - + / - - + +
- jg .outer_loop
- .ret:
- REP_RET
- .round_coeff:
- sub t2d, t3d
- mov [t0+2*t1], t2w
- jnz .inner_loop
- jmp .loop_end
- .last_coeff:
- movsx t3d, word [t0]
- punpcklqdq m2, m2 ; + + + +
- PSIGND m5, m2, m1
- test t3d, t3d
- jnz .outer_loop_0
- RET
- %endmacro
- %if HIGH_BIT_DEPTH == 0
- INIT_XMM sse2
- OPTIMIZE_CHROMA_2x2_DC
- INIT_XMM ssse3
- OPTIMIZE_CHROMA_2x2_DC
- INIT_XMM sse4
- OPTIMIZE_CHROMA_2x2_DC
- INIT_XMM avx
- OPTIMIZE_CHROMA_2x2_DC
- %endif ; !HIGH_BIT_DEPTH
- %if HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
- ;-----------------------------------------------------------------------------
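- ; Hedged C-level sketch (close to, but not verbatim, the reference in quant.c):
- ;
- ;     for( int i = 0; i < size; i++ )
- ;     {
- ;         int level = abs( dct[i] );
- ;         sum[i]   += level;                       /* accumulate noise stats */
- ;         level    -= offset[i];
- ;         dct[i]    = level < 0 ? 0 : (dct[i] < 0 ? -level : level);
- ;     }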
- %macro DENOISE_DCT 0
- cglobal denoise_dct, 4,4,6
- pxor m5, m5
- movsxdifnidn r3, r3d
- .loop:
- mova m2, [r0+r3*4-2*mmsize]
- mova m3, [r0+r3*4-1*mmsize]
- ABSD m0, m2
- ABSD m1, m3
- paddd m4, m0, [r1+r3*4-2*mmsize]
- psubd m0, [r2+r3*4-2*mmsize]
- mova [r1+r3*4-2*mmsize], m4
- paddd m4, m1, [r1+r3*4-1*mmsize]
- psubd m1, [r2+r3*4-1*mmsize]
- mova [r1+r3*4-1*mmsize], m4
- pcmpgtd m4, m0, m5
- pand m0, m4
- pcmpgtd m4, m1, m5
- pand m1, m4
- PSIGND m0, m2
- PSIGND m1, m3
- mova [r0+r3*4-2*mmsize], m0
- mova [r0+r3*4-1*mmsize], m1
- sub r3d, mmsize/2
- jg .loop
- RET
- %endmacro
- %if ARCH_X86_64 == 0
- INIT_MMX mmx
- DENOISE_DCT
- %endif
- INIT_XMM sse2
- DENOISE_DCT
- INIT_XMM ssse3
- DENOISE_DCT
- INIT_XMM avx
- DENOISE_DCT
- INIT_YMM avx2
- DENOISE_DCT
- %else ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
- ;-----------------------------------------------------------------------------
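- ; Same operation as the high-bit-depth version above, just on int16_t
- ; coefficients: psubusw performs the "subtract offset, clamp at zero" step and
- ; the absolute values are widened to 32 bits before being added to sum[].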
- %macro DENOISE_DCT 0
- cglobal denoise_dct, 4,4,7
- pxor m6, m6
- movsxdifnidn r3, r3d
- .loop:
- mova m2, [r0+r3*2-2*mmsize]
- mova m3, [r0+r3*2-1*mmsize]
- ABSW m0, m2, sign
- ABSW m1, m3, sign
- psubusw m4, m0, [r2+r3*2-2*mmsize]
- psubusw m5, m1, [r2+r3*2-1*mmsize]
- PSIGNW m4, m2
- PSIGNW m5, m3
- mova [r0+r3*2-2*mmsize], m4
- mova [r0+r3*2-1*mmsize], m5
- punpcklwd m2, m0, m6
- punpcklwd m3, m1, m6
- punpckhwd m0, m6
- punpckhwd m1, m6
- paddd m2, [r1+r3*4-4*mmsize]
- paddd m0, [r1+r3*4-3*mmsize]
- paddd m3, [r1+r3*4-2*mmsize]
- paddd m1, [r1+r3*4-1*mmsize]
- mova [r1+r3*4-4*mmsize], m2
- mova [r1+r3*4-3*mmsize], m0
- mova [r1+r3*4-2*mmsize], m3
- mova [r1+r3*4-1*mmsize], m1
- sub r3, mmsize
- jg .loop
- RET
- %endmacro
- %if ARCH_X86_64 == 0
- INIT_MMX mmx
- DENOISE_DCT
- %endif
- INIT_XMM sse2
- DENOISE_DCT
- INIT_XMM ssse3
- DENOISE_DCT
- INIT_XMM avx
- DENOISE_DCT
- INIT_YMM avx2
- cglobal denoise_dct, 4,4,4
- pxor m3, m3
- movsxdifnidn r3, r3d
- .loop:
- mova m1, [r0+r3*2-mmsize]
- pabsw m0, m1
- psubusw m2, m0, [r2+r3*2-mmsize]
- vpermq m0, m0, q3120
- psignw m2, m1
- mova [r0+r3*2-mmsize], m2
- punpcklwd m1, m0, m3
- punpckhwd m0, m3
- paddd m1, [r1+r3*4-2*mmsize]
- paddd m0, [r1+r3*4-1*mmsize]
- mova [r1+r3*4-2*mmsize], m1
- mova [r1+r3*4-1*mmsize], m0
- sub r3, mmsize/2
- jg .loop
- RET
- %endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; int decimate_score( dctcoef *dct )
- ;-----------------------------------------------------------------------------
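- ; Hedged C-level sketch of the scalar algorithm being vectorized here (the
- ; assembly derives the same score from bitmasks of the zero and |coeff|>1
- ; positions):
- ;
- ;     int score = 0, idx = i_max - 1;
- ;     while( idx >= 0 && dct[idx] == 0 )
- ;         idx--;
- ;     while( idx >= 0 )
- ;     {
- ;         if( (unsigned)(dct[idx--] + 1) > 2 )     /* any |coeff| > 1 */
- ;             return 9;
- ;         int run = 0;
- ;         while( idx >= 0 && dct[idx] == 0 )
- ;             idx--, run++;
- ;         score += decimate_table[run];            /* _table4 or _table8 */
- ;     }
- ;     return score;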
- %macro DECIMATE_MASK 4
- %if HIGH_BIT_DEPTH
- mova m0, [%3+0*16]
- packssdw m0, [%3+1*16]
- mova m1, [%3+2*16]
- packssdw m1, [%3+3*16]
- ABSW2 m0, m1, m0, m1, m3, m4
- %else
- ABSW m0, [%3+ 0], m3
- ABSW m1, [%3+16], m4
- %endif
- packsswb m0, m1
- pxor m2, m2
- pcmpeqb m2, m0
- pcmpgtb m0, %4
- pmovmskb %1, m2
- pmovmskb %2, m0
- %endmacro
- %macro DECIMATE_MASK16_AVX512 0
- mova m0, [r0]
- %if HIGH_BIT_DEPTH
- vptestmd k0, m0, m0
- pabsd m0, m0
- vpcmpud k1, m0, [pd_1] {1to16}, 6
- %else
- vptestmw k0, m0, m0
- pabsw m0, m0
- vpcmpuw k1, m0, [pw_1], 6
- %endif
- %endmacro
- %macro SHRX 2
- %if cpuflag(bmi2)
- shrx %1, %1, %2
- %else
- shr %1, %2b ; %2 has to be rcx/ecx
- %endif
- %endmacro
- %macro BLSR 2
- %if cpuflag(bmi1)
- blsr %1, %2
- %else
- lea %1, [%2-1]
- and %1, %2
- %endif
- %endmacro
- cextern_common decimate_table4
- cextern_common decimate_table8
- %macro DECIMATE4x4 1
- cglobal decimate_score%1, 1,3
- %if cpuflag(avx512)
- DECIMATE_MASK16_AVX512
- xor eax, eax
- kmovw edx, k0
- %if %1 == 15
- shr edx, 1
- %else
- test edx, edx
- %endif
- jz .ret
- ktestw k1, k1
- jnz .ret9
- %else
- DECIMATE_MASK edx, eax, r0, [pb_1]
- xor edx, 0xffff
- jz .ret
- test eax, eax
- jnz .ret9
- %if %1 == 15
- shr edx, 1
- %endif
- %endif
- %ifdef PIC
- lea r4, [decimate_mask_table4]
- %define mask_table r4
- %else
- %define mask_table decimate_mask_table4
- %endif
- movzx ecx, dl
- movzx eax, byte [mask_table + rcx]
- %if ARCH_X86_64
- xor edx, ecx
- jz .ret
- %if cpuflag(lzcnt)
- lzcnt ecx, ecx
- lea r5, [decimate_table4-32]
- add r5, rcx
- %else
- bsr ecx, ecx
- lea r5, [decimate_table4-1]
- sub r5, rcx
- %endif
- %define table r5
- %else
- cmp edx, ecx
- jz .ret
- bsr ecx, ecx
- shr edx, 1
- SHRX edx, ecx
- %define table decimate_table4
- %endif
- tzcnt ecx, edx
- shr edx, 1
- SHRX edx, ecx
- add al, byte [table + rcx]
- add al, byte [mask_table + rdx]
- .ret:
- REP_RET
- .ret9:
- mov eax, 9
- RET
- %endmacro
- %macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high
- mova m0, [r0+0*32]
- packsswb m0, [r0+1*32]
- mova m1, [r0+2*32]
- packsswb m1, [r0+3*32]
- mova m4, [pb_1]
- pabsb m2, m0
- pabsb m3, m1
- por m2, m3 ; the > 1 checks don't care about order, so
- ptest m4, m2 ; we can save latency by doing them here
- jnc .ret9
- vpermq m0, m0, q3120
- vpermq m1, m1, q3120
- pxor m4, m4
- pcmpeqb m0, m4
- pcmpeqb m1, m4
- pmovmskb %1, m0
- pmovmskb %2, m1
- %endmacro
- %macro DECIMATE_MASK64_AVX512 0
- mova m0, [r0]
- %if HIGH_BIT_DEPTH
- packssdw m0, [r0+1*64]
- mova m1, [r0+2*64]
- packssdw m1, [r0+3*64]
- packsswb m0, m1
- vbroadcasti32x4 m1, [pb_1]
- pabsb m2, m0
- vpcmpub k0, m2, m1, 6
- ktestq k0, k0
- jnz .ret9
- mova m1, [decimate_shuf_avx512]
- vpermd m0, m1, m0
- vptestmb k1, m0, m0
- %else
- mova m1, [r0+64]
- vbroadcasti32x4 m3, [pb_1]
- packsswb m2, m0, m1
- pabsb m2, m2
- vpcmpub k0, m2, m3, 6
- ktestq k0, k0
- jnz .ret9
- vptestmw k1, m0, m0
- vptestmw k2, m1, m1
- %endif
- %endmacro
- %macro DECIMATE8x8 0
- %if ARCH_X86_64
- cglobal decimate_score64, 1,5
- %if mmsize == 64
- DECIMATE_MASK64_AVX512
- xor eax, eax
- %if HIGH_BIT_DEPTH
- kmovq r1, k1
- test r1, r1
- jz .ret
- %else
- kortestd k1, k2
- jz .ret
- kunpckdq k1, k2, k1
- kmovq r1, k1
- %endif
- %elif mmsize == 32
- DECIMATE_MASK64_AVX2 r1d, eax
- not r1
- shl rax, 32
- xor r1, rax
- jz .ret
- %else
- mova m5, [pb_1]
- DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5
- test eax, eax
- jnz .ret9
- DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5
- shl r2d, 16
- or r1d, r2d
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5
- shl r2, 32
- or eax, r3d
- or r1, r2
- DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5
- not r1
- shl r2, 48
- xor r1, r2
- jz .ret
- add eax, r3d
- jnz .ret9
- %endif
- %ifdef PIC
- lea r4, [decimate_table8]
- %define table r4
- %else
- %define table decimate_table8
- %endif
- mov al, -6
- .loop:
- tzcnt rcx, r1
- add al, byte [table + rcx]
- jge .ret9
- shr r1, 1
- SHRX r1, rcx
- %if cpuflag(bmi2)
- test r1, r1
- %endif
- jnz .loop
- add al, 6
- .ret:
- REP_RET
- .ret9:
- mov eax, 9
- RET
- %else ; ARCH
- cglobal decimate_score64, 1,4
- %if mmsize == 64
- DECIMATE_MASK64_AVX512
- xor eax, eax
- %if HIGH_BIT_DEPTH
- kshiftrq k2, k1, 32
- %endif
- kmovd r2, k1
- kmovd r3, k2
- test r2, r2
- jz .tryret
- %elif mmsize == 32
- DECIMATE_MASK64_AVX2 r2, r3
- xor eax, eax
- not r3
- xor r2, -1
- jz .tryret
- %else
- mova m5, [pb_1]
- DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5
- test r1, r1
- jnz .ret9
- DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5
- not r2
- shl r3, 16
- xor r2, r3
- mov r0m, r2
- DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5
- or r2, r1
- DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5
- add r0, r2
- jnz .ret9
- mov r2, r0m
- not r3
- shl r1, 16
- xor r3, r1
- test r2, r2
- jz .tryret
- %endif
- mov al, -6
- .loop:
- tzcnt ecx, r2
- add al, byte [decimate_table8 + ecx]
- jge .ret9
- sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well
- jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31
- shrd r2, r3, cl
- SHRX r3, ecx
- %if notcpuflag(bmi2)
- test r2, r2
- %endif
- jnz .loop
- BLSR r2, r3
- jz .end
- .largerun:
- tzcnt ecx, r3
- shr r3, 1
- SHRX r3, ecx
- .loop2:
- tzcnt ecx, r3
- add al, byte [decimate_table8 + ecx]
- jge .ret9
- shr r3, 1
- SHRX r3, ecx
- .run31:
- test r3, r3
- jnz .loop2
- .end:
- add al, 6
- RET
- .tryret:
- BLSR r2, r3
- jz .ret
- mov al, -6
- jmp .largerun
- .ret9:
- mov eax, 9
- .ret:
- REP_RET
- %endif ; ARCH
- %endmacro
- INIT_XMM sse2
- DECIMATE4x4 15
- DECIMATE4x4 16
- DECIMATE8x8
- INIT_XMM ssse3
- DECIMATE4x4 15
- DECIMATE4x4 16
- DECIMATE8x8
- %if HIGH_BIT_DEPTH
- INIT_ZMM avx512
- %else
- INIT_YMM avx2
- DECIMATE8x8
- INIT_YMM avx512
- %endif
- DECIMATE4x4 15
- DECIMATE4x4 16
- INIT_ZMM avx512
- DECIMATE8x8
- ;-----------------------------------------------------------------------------
- ; int coeff_last( dctcoef *dct )
- ;-----------------------------------------------------------------------------
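- ; Hedged C-level sketch: coeff_lastN returns the index of the last nonzero
- ; coefficient, which the assembly finds via a zero-compare bitmask plus
- ; bsr/lzcnt instead of a scalar loop:
- ;
- ;     int i = i_count - 1;
- ;     while( i >= 0 && dct[i] == 0 )
- ;         i--;
- ;     return i;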
- %macro BSR 3
- %if cpuflag(lzcnt)
- lzcnt %1, %2
- xor %1, %3
- %else
- bsr %1, %2
- %endif
- %endmacro
- %macro LZCOUNT 3
- %if cpuflag(lzcnt)
- lzcnt %1, %2
- %else
- bsr %1, %2
- xor %1, %3
- %endif
- %endmacro
- %if HIGH_BIT_DEPTH
- %macro LAST_MASK 3-4
- %if %1 == 4
- movq mm0, [%3]
- packssdw mm0, [%3+8]
- packsswb mm0, mm0
- pcmpeqb mm0, mm2
- pmovmskb %2, mm0
- %elif mmsize == 16
- movdqa xmm0, [%3+ 0]
- %if %1 == 8
- packssdw xmm0, [%3+16]
- packsswb xmm0, xmm0
- %else
- movdqa xmm1, [%3+32]
- packssdw xmm0, [%3+16]
- packssdw xmm1, [%3+48]
- packsswb xmm0, xmm1
- %endif
- pcmpeqb xmm0, xmm2
- pmovmskb %2, xmm0
- %elif %1 == 8
- movq mm0, [%3+ 0]
- movq mm1, [%3+16]
- packssdw mm0, [%3+ 8]
- packssdw mm1, [%3+24]
- packsswb mm0, mm1
- pcmpeqb mm0, mm2
- pmovmskb %2, mm0
- %else
- movq mm0, [%3+ 0]
- movq mm1, [%3+16]
- packssdw mm0, [%3+ 8]
- packssdw mm1, [%3+24]
- movq mm3, [%3+32]
- movq mm4, [%3+48]
- packssdw mm3, [%3+40]
- packssdw mm4, [%3+56]
- packsswb mm0, mm1
- packsswb mm3, mm4
- pcmpeqb mm0, mm2
- pcmpeqb mm3, mm2
- pmovmskb %2, mm0
- pmovmskb %4, mm3
- shl %4, 8
- or %2, %4
- %endif
- %endmacro
- %macro COEFF_LAST4 0
- cglobal coeff_last4, 1,3
- pxor mm2, mm2
- LAST_MASK 4, r1d, r0
- xor r1d, 0xff
- shr r1d, 4
- BSR eax, r1d, 0x1f
- RET
- %endmacro
- INIT_MMX mmx2
- COEFF_LAST4
- INIT_MMX lzcnt
- COEFF_LAST4
- %macro COEFF_LAST8 0
- cglobal coeff_last8, 1,3
- pxor m2, m2
- LAST_MASK 8, r1d, r0
- %if mmsize == 16
- xor r1d, 0xffff
- shr r1d, 8
- %else
- xor r1d, 0xff
- %endif
- BSR eax, r1d, 0x1f
- RET
- %endmacro
- %if ARCH_X86_64 == 0
- INIT_MMX mmx2
- COEFF_LAST8
- %endif
- INIT_XMM sse2
- COEFF_LAST8
- INIT_XMM lzcnt
- COEFF_LAST8
- %else ; !HIGH_BIT_DEPTH
- %macro LAST_MASK 3-4
- %if %1 <= 8
- movq mm0, [%3+ 0]
- %if %1 == 4
- packsswb mm0, mm0
- %else
- packsswb mm0, [%3+ 8]
- %endif
- pcmpeqb mm0, mm2
- pmovmskb %2, mm0
- %elif mmsize == 16
- movdqa xmm0, [%3+ 0]
- packsswb xmm0, [%3+16]
- pcmpeqb xmm0, xmm2
- pmovmskb %2, xmm0
- %else
- movq mm0, [%3+ 0]
- movq mm1, [%3+16]
- packsswb mm0, [%3+ 8]
- packsswb mm1, [%3+24]
- pcmpeqb mm0, mm2
- pcmpeqb mm1, mm2
- pmovmskb %2, mm0
- pmovmskb %4, mm1
- shl %4, 8
- or %2, %4
- %endif
- %endmacro
- %macro COEFF_LAST48 0
- %if ARCH_X86_64
- cglobal coeff_last4, 1,1
- BSR rax, [r0], 0x3f
- shr eax, 4
- RET
- %else
- cglobal coeff_last4, 0,3
- mov edx, r0mp
- mov eax, [edx+4]
- xor ecx, ecx
- test eax, eax
- cmovz eax, [edx]
- setnz cl
- BSR eax, eax, 0x1f
- shr eax, 4
- lea eax, [eax+ecx*2]
- RET
- %endif
- cglobal coeff_last8, 1,3
- pxor m2, m2
- LAST_MASK 8, r1d, r0, r2d
- xor r1d, 0xff
- BSR eax, r1d, 0x1f
- RET
- %endmacro
- INIT_MMX mmx2
- COEFF_LAST48
- INIT_MMX lzcnt
- COEFF_LAST48
- %endif ; HIGH_BIT_DEPTH
- %macro COEFF_LAST 0
- cglobal coeff_last15, 1,3
- pxor m2, m2
- LAST_MASK 15, r1d, r0-SIZEOF_DCTCOEF, r2d
- xor r1d, 0xffff
- BSR eax, r1d, 0x1f
- dec eax
- RET
- cglobal coeff_last16, 1,3
- pxor m2, m2
- LAST_MASK 16, r1d, r0, r2d
- xor r1d, 0xffff
- BSR eax, r1d, 0x1f
- RET
- %if ARCH_X86_64 == 0
- cglobal coeff_last64, 1, 4-mmsize/16
- pxor m2, m2
- LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 32, r3d
- LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 48, r3d
- shl r2d, 16
- or r1d, r2d
- xor r1d, -1
- jne .secondhalf
- LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0, r3d
- LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16, r3d
- shl r2d, 16
- or r1d, r2d
- not r1d
- BSR eax, r1d, 0x1f
- RET
- .secondhalf:
- BSR eax, r1d, 0x1f
- add eax, 32
- RET
- %else
- cglobal coeff_last64, 1,3
- pxor m2, m2
- LAST_MASK 16, r1d, r0+SIZEOF_DCTCOEF* 0
- LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*16
- shl r2d, 16
- or r1d, r2d
- LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF*32
- LAST_MASK 16, r0d, r0+SIZEOF_DCTCOEF*48
- shl r0d, 16
- or r2d, r0d
- shl r2, 32
- or r1, r2
- not r1
- BSR rax, r1, 0x3f
- RET
- %endif
- %endmacro
- %if ARCH_X86_64 == 0
- INIT_MMX mmx2
- COEFF_LAST
- %endif
- INIT_XMM sse2
- COEFF_LAST
- INIT_XMM lzcnt
- COEFF_LAST
- %macro LAST_MASK_AVX2 2
- %if HIGH_BIT_DEPTH
- mova m0, [%2+ 0]
- packssdw m0, [%2+32]
- mova m1, [%2+64]
- packssdw m1, [%2+96]
- packsswb m0, m1
- mova m1, [deinterleave_shufd]
- vpermd m0, m1, m0
- %else
- mova m0, [%2+ 0]
- packsswb m0, [%2+32]
- vpermq m0, m0, q3120
- %endif
- pcmpeqb m0, m2
- pmovmskb %1, m0
- %endmacro
- %if ARCH_X86_64 == 0
- INIT_YMM avx2
- cglobal coeff_last64, 1,2
- pxor m2, m2
- LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
- xor r1d, -1
- jne .secondhalf
- LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
- not r1d
- BSR eax, r1d, 0x1f
- RET
- .secondhalf:
- BSR eax, r1d, 0x1f
- add eax, 32
- RET
- %else
- INIT_YMM avx2
- cglobal coeff_last64, 1,3
- pxor m2, m2
- LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
- LAST_MASK_AVX2 r2d, r0+SIZEOF_DCTCOEF*32
- shl r2, 32
- or r1, r2
- not r1
- BSR rax, r1, 0x3f
- RET
- %endif
- %macro COEFF_LAST_AVX512 2 ; num, w/d
- cglobal coeff_last%1, 1,2
- mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
- vptestm%2 k0, m0, m0
- %if %1 == 15
- mov eax, 30
- kmovw r1d, k0
- lzcnt r1d, r1d
- sub eax, r1d
- %else
- kmovw eax, k0
- lzcnt eax, eax
- xor eax, 31
- %endif
- RET
- %endmacro
- %macro COEFF_LAST64_AVX512 1 ; w/d
- cglobal coeff_last64, 1,2
- pxor xm0, xm0
- vpcmp%1 k0, m0, [r0+0*64], 4
- vpcmp%1 k1, m0, [r0+1*64], 4
- %if HIGH_BIT_DEPTH
- vpcmp%1 k2, m0, [r0+2*64], 4
- vpcmp%1 k3, m0, [r0+3*64], 4
- kunpckwd k0, k1, k0
- kunpckwd k1, k3, k2
- %endif
- %if ARCH_X86_64
- kunpckdq k0, k1, k0
- kmovq rax, k0
- lzcnt rax, rax
- xor eax, 63
- %else
- kmovd r1d, k1
- kmovd eax, k0
- lzcnt r1d, r1d
- lzcnt eax, eax
- xor r1d, 32
- cmovnz eax, r1d
- xor eax, 31
- %endif
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM avx512
- COEFF_LAST_AVX512 4, d
- INIT_YMM avx512
- COEFF_LAST_AVX512 8, d
- INIT_ZMM avx512
- COEFF_LAST_AVX512 15, d
- COEFF_LAST_AVX512 16, d
- COEFF_LAST64_AVX512 d
- %else ; !HIGH_BIT_DEPTH
- INIT_XMM avx512
- COEFF_LAST_AVX512 8, w
- INIT_YMM avx512
- COEFF_LAST_AVX512 15, w
- COEFF_LAST_AVX512 16, w
- INIT_ZMM avx512
- COEFF_LAST64_AVX512 w
- %endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
- ;-----------------------------------------------------------------------------
- struc levelrun
- .last: resd 1
- .mask: resd 1
- align 16, resb 1
- .level: resw 16
- endstruc
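- ; Hedged sketch of what ends up in the levelrun struct above (illustrative;
- ; details may differ slightly from the C reference):
- ;
- ;     int last  = coeff_last( dct );             /* index of last nonzero coeff */
- ;     int total = 0;
- ;     runlevel->last = last;
- ;     runlevel->mask = <bitmask of nonzero positions>;
- ;     for( int i = last; i >= 0; i-- )
- ;         if( dct[i] )
- ;             runlevel->level[total++] = dct[i]; /* levels stored last-to-first */
- ;     return total;                              /* number of nonzero coeffs    */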
- ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
- %if WIN64
- DECLARE_REG_TMP 3,1,2,0,4,5,6
- %elif ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,6
- %else
- DECLARE_REG_TMP 6,3,2,1,4,5,0
- %endif
- %macro COEFF_LEVELRUN 1
- cglobal coeff_level_run%1,0,7
- movifnidn t0, r0mp
- movifnidn t1, r1mp
- pxor m2, m2
- xor t3d, t3d
- LAST_MASK %1, t5d, t0-(%1&1)*SIZEOF_DCTCOEF, t4d
- %if %1==15
- shr t5d, 1
- %elif %1==8
- and t5d, 0xff
- %elif %1==4
- and t5d, 0xf
- %endif
- xor t5d, (1<<%1)-1
- mov [t1+levelrun.mask], t5d
- shl t5d, 32-%1
- mov t4d, %1-1
- LZCOUNT t3d, t5d, 0x1f
- xor t6d, t6d
- add t5d, t5d
- sub t4d, t3d
- shl t5d, t3b
- mov [t1+levelrun.last], t4d
- .loop:
- LZCOUNT t3d, t5d, 0x1f
- %if HIGH_BIT_DEPTH
- mov t2d, [t0+t4*4]
- %else
- mov t2w, [t0+t4*2]
- %endif
- inc t3d
- shl t5d, t3b
- %if HIGH_BIT_DEPTH
- mov [t1+t6*4+levelrun.level], t2d
- %else
- mov [t1+t6*2+levelrun.level], t2w
- %endif
- inc t6d
- sub t4d, t3d
- jge .loop
- RET
- %endmacro
- INIT_MMX mmx2
- %if ARCH_X86_64 == 0
- COEFF_LEVELRUN 15
- COEFF_LEVELRUN 16
- %endif
- COEFF_LEVELRUN 4
- COEFF_LEVELRUN 8
- INIT_XMM sse2
- %if HIGH_BIT_DEPTH
- COEFF_LEVELRUN 8
- %endif
- COEFF_LEVELRUN 15
- COEFF_LEVELRUN 16
- INIT_MMX lzcnt
- COEFF_LEVELRUN 4
- %if HIGH_BIT_DEPTH == 0
- COEFF_LEVELRUN 8
- %endif
- INIT_XMM lzcnt
- %if HIGH_BIT_DEPTH
- COEFF_LEVELRUN 8
- %endif
- COEFF_LEVELRUN 15
- COEFF_LEVELRUN 16
- ; Similar to the one above, but saves the DCT
- ; coefficients in m0/m1 so we don't have to load
- ; them later.
- %macro LAST_MASK_LUT 3
- pxor xm5, xm5
- %if %1 <= 8
- mova m0, [%3]
- packsswb m2, m0, m0
- %else
- mova xm0, [%3+ 0]
- mova xm1, [%3+16]
- packsswb xm2, xm0, xm1
- %if mmsize==32
- vinserti128 m0, m0, xm1, 1
- %endif
- %endif
- pcmpeqb xm2, xm5
- pmovmskb %2, xm2
- %endmacro
- %macro COEFF_LEVELRUN_LUT 1
- cglobal coeff_level_run%1,2,4+(%1/9)
- %ifdef PIC
- lea r5, [$$]
- %define GLOBAL +r5-$$
- %else
- %define GLOBAL
- %endif
- LAST_MASK_LUT %1, eax, r0-(%1&1)*SIZEOF_DCTCOEF
- %if %1==15
- shr eax, 1
- %elif %1==8
- and eax, 0xff
- %elif %1==4
- and eax, 0xf
- %endif
- xor eax, (1<<%1)-1
- mov [r1+levelrun.mask], eax
- %if %1==15
- add eax, eax
- %endif
- %if %1 > 8
- %if ARCH_X86_64
- mov r4d, eax
- shr r4d, 8
- %else
- movzx r4d, ah ; first 8 bits
- %endif
- %endif
- movzx r2d, al ; second 8 bits
- shl eax, 32-%1-(%1&1)
- LZCOUNT eax, eax, 0x1f
- mov r3d, %1-1
- sub r3d, eax
- mov [r1+levelrun.last], r3d
- ; Here we abuse pshufb, combined with a lookup table, to do a gather
- ; operation based on a bitmask. For example:
- ;
- ; dct 15-8 (input): 0 0 4 0 0 -2 1 0
- ; dct 7-0 (input): 0 0 -1 0 0 0 0 15
- ; bitmask 1: 0 0 1 0 0 1 1 0
- ; bitmask 2: 0 0 1 0 0 0 0 1
- ; gather 15-8: 4 -2 1 __ __ __ __ __
- ; gather 7-0: -1 15 __ __ __ __ __ __
- ; levels (output): 4 -2 1 -1 15 __ __ __ __ __ __ __ __ __ __ __
- ;
- ; The overlapping, dependent stores almost surely cause a mess of
- ; forwarding issues, but it's still enormously faster.
- %if %1 > 8
- movzx eax, byte [popcnt_table+r4 GLOBAL]
- movzx r3d, byte [popcnt_table+r2 GLOBAL]
- %if mmsize==16
- movh m3, [dct_coef_shuffle+r4*8 GLOBAL]
- movh m2, [dct_coef_shuffle+r2*8 GLOBAL]
- mova m4, [pw_256]
- ; Storing 8 bytes of shuffle constant and converting it (unpack + or)
- ; is neutral to slightly faster in local speed measurements, but it
- ; cuts the table size in half, which is surely a big cache win.
- punpcklbw m3, m3
- punpcklbw m2, m2
- por m3, m4
- por m2, m4
- pshufb m1, m3
- pshufb m0, m2
- mova [r1+levelrun.level], m1
- ; This obnoxious unaligned store messes with store forwarding and
- ; stalls the CPU to no end, but merging the two registers before
- ; storing requires a variable 128-bit shift. Emulating this does
- ; work, but requires a lot of ops and the gain is tiny and
- ; inconsistent, so we'll err on the side of fewer instructions.
- movu [r1+rax*2+levelrun.level], m0
- %else ; mmsize==32
- movq xm2, [dct_coef_shuffle+r4*8 GLOBAL]
- vinserti128 m2, m2, [dct_coef_shuffle+r2*8 GLOBAL], 1
- punpcklbw m2, m2
- por m2, [pw_256]
- pshufb m0, m2
- vextracti128 [r1+levelrun.level], m0, 1
- movu [r1+rax*2+levelrun.level], xm0
- %endif
- add eax, r3d
- %else
- movzx eax, byte [popcnt_table+r2 GLOBAL]
- movh m1, [dct_coef_shuffle+r2*8 GLOBAL]
- punpcklbw m1, m1
- por m1, [pw_256]
- pshufb m0, m1
- mova [r1+levelrun.level], m0
- %endif
- RET
- %endmacro
- %if HIGH_BIT_DEPTH==0
- INIT_MMX ssse3
- COEFF_LEVELRUN_LUT 4
- INIT_XMM ssse3
- COEFF_LEVELRUN_LUT 8
- COEFF_LEVELRUN_LUT 15
- COEFF_LEVELRUN_LUT 16
- INIT_MMX ssse3, lzcnt
- COEFF_LEVELRUN_LUT 4
- INIT_XMM ssse3, lzcnt
- COEFF_LEVELRUN_LUT 8
- COEFF_LEVELRUN_LUT 15
- COEFF_LEVELRUN_LUT 16
- INIT_XMM avx2
- COEFF_LEVELRUN_LUT 15
- COEFF_LEVELRUN_LUT 16
- %endif
|