- ;*****************************************************************************
- ;* mc-a.asm: x86 motion compensation
- ;*****************************************************************************
- ;* Copyright (C) 2003-2018 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;* Fiona Glaser <fiona@x264.com>
- ;* Laurent Aimar <fenrir@via.ecp.fr>
- ;* Dylan Yudaken <dyudaken@gmail.com>
- ;* Holger Lubitz <holger@lubitz.org>
- ;* Min Chen <chenm001@163.com>
- ;* Oskar Arvidsson <oskar@irock.se>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- ;*
- ;* This program is also available under a commercial proprietary license.
- ;* For more information, contact us at licensing@x264.com.
- ;*****************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA 32
- ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
- ch_shuf_adj: times 8 db 0
- times 8 db 2
- times 8 db 4
- times 8 db 6
- sq_1: times 1 dq 1
- SECTION .text
- cextern pb_0
- cextern pw_1
- cextern pw_4
- cextern pw_8
- cextern pw_32
- cextern pw_64
- cextern pw_512
- cextern pw_00ff
- cextern pw_pixel_max
- cextern sw_64
- cextern pd_32
- cextern deinterleave_shufd
- ;=============================================================================
- ; implicit weighted biprediction
- ;=============================================================================
- ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
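- ; Scalar reference for the biweight kernels below (illustrative sketch only,
- ; not part of the build; the pixel type and x264_clip_pixel() are assumed to
- ; come from the C side):
- ;
- ;   static void avg_weight_ref( pixel *dst,  intptr_t i_dst,
- ;                               pixel *src1, intptr_t i_src1,
- ;                               pixel *src2, intptr_t i_src2,
- ;                               int width, int height, int i_weight )
- ;   {
- ;       int w1 = i_weight, w2 = 64 - i_weight;   /* log2_denom = 5 */
- ;       for( int y = 0; y < height; y++ )
- ;       {
- ;           for( int x = 0; x < width; x++ )
- ;               dst[x] = x264_clip_pixel( (src1[x]*w1 + src2[x]*w2 + 32) >> 6 );
- ;           dst += i_dst; src1 += i_src1; src2 += i_src2;
- ;       }
- ;   }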
- %if WIN64
- DECLARE_REG_TMP 0,1,2,3,4,5,4,5
- %macro AVG_START 0-1 0
- PROLOGUE 6,7,%1
- %endmacro
- %elif UNIX64
- DECLARE_REG_TMP 0,1,2,3,4,5,7,8
- %macro AVG_START 0-1 0
- PROLOGUE 6,9,%1
- %endmacro
- %else
- DECLARE_REG_TMP 1,2,3,4,5,6,1,2
- %macro AVG_START 0-1 0
- PROLOGUE 0,7,%1
- mov t0, r0m
- mov t1, r1m
- mov t2, r2m
- mov t3, r3m
- mov t4, r4m
- mov t5, r5m
- %endmacro
- %endif
- %macro AVG_END 0-1 2 ; rows
- lea t2, [t2+t3*2*SIZEOF_PIXEL]
- lea t4, [t4+t5*2*SIZEOF_PIXEL]
- lea t0, [t0+t1*2*SIZEOF_PIXEL]
- sub eax, %1
- jg .height_loop
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- %macro BIWEIGHT_MMX 2
- movh m0, %1
- movh m1, %2
- punpcklwd m0, m1
- pmaddwd m0, m3
- paddd m0, m4
- psrad m0, 6
- %endmacro
- %macro BIWEIGHT_START_MMX 0
- movzx t6d, word r6m
- mov t7d, 64
- sub t7d, t6d
- shl t7d, 16
- add t6d, t7d
- movd m3, t6d
- SPLATD m3, m3
- mova m4, [pd_32]
- pxor m5, m5
- %endmacro
- %else ;!HIGH_BIT_DEPTH
- %macro BIWEIGHT_MMX 2
- movh m0, %1
- movh m1, %2
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, m2
- pmullw m1, m3
- paddw m0, m1
- paddw m0, m4
- psraw m0, 6
- %endmacro
- %macro BIWEIGHT_START_MMX 0
- movd m2, r6m
- SPLATW m2, m2 ; weight_dst
- mova m3, [pw_64]
- psubw m3, m2 ; weight_src
- mova m4, [pw_32] ; rounding
- pxor m5, m5
- %endmacro
- %endif ;HIGH_BIT_DEPTH
- %macro BIWEIGHT_SSSE3 2
- movh m0, %1
- movh m1, %2
- punpcklbw m0, m1
- pmaddubsw m0, m3
- pmulhrsw m0, m4
- %endmacro
- %macro BIWEIGHT_START_SSSE3 0
- movzx t6d, byte r6m ; FIXME x86_64
- %if mmsize > 16
- vbroadcasti128 m4, [pw_512]
- %else
- mova m4, [pw_512]
- %endif
- lea t7d, [t6+(64<<8)]
- shl t6d, 8
- sub t7d, t6d
- %if cpuflag(avx512)
- vpbroadcastw m3, t7d
- %else
- movd xm3, t7d
- %if cpuflag(avx2)
- vpbroadcastw m3, xm3
- %else
- SPLATW m3, m3 ; weight_dst,src
- %endif
- %endif
- %endmacro
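- ; With m4 = 512, pmulhrsw computes (x*512*2 + 0x8000) >> 16 == (x + 32) >> 6,
- ; i.e. the same rounded shift as the MMX path, so the byte-packed weight pair
- ; (w, 64-w) fed to pmaddubsw needs no separate +32 rounding term.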
- %if HIGH_BIT_DEPTH
- %macro BIWEIGHT_ROW 4
- BIWEIGHT [%2], [%3]
- %if %4==mmsize/4
- packssdw m0, m0
- CLIPW m0, m5, m7
- movh [%1], m0
- %else
- SWAP 0, 6
- BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
- packssdw m6, m0
- CLIPW m6, m5, m7
- mova [%1], m6
- %endif
- %endmacro
- %else ;!HIGH_BIT_DEPTH
- %macro BIWEIGHT_ROW 4
- BIWEIGHT [%2], [%3]
- %if %4==mmsize/2
- packuswb m0, m0
- movh [%1], m0
- %else
- SWAP 0, 6
- BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
- packuswb m6, m0
- mova [%1], m6
- %endif
- %endmacro
- %endif ;HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
- ;-----------------------------------------------------------------------------
- %macro AVG_WEIGHT 1-2 0
- cglobal pixel_avg_weight_w%1
- BIWEIGHT_START
- AVG_START %2
- %if HIGH_BIT_DEPTH
- mova m7, [pw_pixel_max]
- %endif
- .height_loop:
- %if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
- BIWEIGHT [t2], [t4]
- SWAP 0, 6
- BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
- %if HIGH_BIT_DEPTH
- packssdw m6, m0
- CLIPW m6, m5, m7
- %else ;!HIGH_BIT_DEPTH
- packuswb m6, m0
- %endif ;HIGH_BIT_DEPTH
- movlps [t0], m6
- movhps [t0+SIZEOF_PIXEL*t1], m6
- %else
- %assign x 0
- %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
- BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
- BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
- %assign x x+mmsize
- %endrep
- %endif
- AVG_END
- %endmacro
- %define BIWEIGHT BIWEIGHT_MMX
- %define BIWEIGHT_START BIWEIGHT_START_MMX
- INIT_MMX mmx2
- AVG_WEIGHT 4
- AVG_WEIGHT 8
- AVG_WEIGHT 16
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- AVG_WEIGHT 4, 8
- AVG_WEIGHT 8, 8
- AVG_WEIGHT 16, 8
- %else ;!HIGH_BIT_DEPTH
- INIT_XMM sse2
- AVG_WEIGHT 8, 7
- AVG_WEIGHT 16, 7
- %define BIWEIGHT BIWEIGHT_SSSE3
- %define BIWEIGHT_START BIWEIGHT_START_SSSE3
- INIT_MMX ssse3
- AVG_WEIGHT 4
- INIT_XMM ssse3
- AVG_WEIGHT 8, 7
- AVG_WEIGHT 16, 7
- INIT_YMM avx2
- cglobal pixel_avg_weight_w16
- BIWEIGHT_START
- AVG_START 5
- .height_loop:
- movu xm0, [t2]
- movu xm1, [t4]
- vinserti128 m0, m0, [t2+t3], 1
- vinserti128 m1, m1, [t4+t5], 1
- SBUTTERFLY bw, 0, 1, 2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- pmulhrsw m0, m4
- pmulhrsw m1, m4
- packuswb m0, m1
- mova [t0], xm0
- vextracti128 [t0+t1], m0, 1
- AVG_END
- INIT_YMM avx512
- cglobal pixel_avg_weight_w8
- BIWEIGHT_START
- kxnorb k1, k1, k1
- kaddb k1, k1, k1
- AVG_START 5
- .height_loop:
- movq xm0, [t2]
- movq xm2, [t4]
- movq xm1, [t2+t3]
- movq xm5, [t4+t5]
- lea t2, [t2+t3*2]
- lea t4, [t4+t5*2]
- vpbroadcastq m0 {k1}, [t2]
- vpbroadcastq m2 {k1}, [t4]
- vpbroadcastq m1 {k1}, [t2+t3]
- vpbroadcastq m5 {k1}, [t4+t5]
- punpcklbw m0, m2
- punpcklbw m1, m5
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- pmulhrsw m0, m4
- pmulhrsw m1, m4
- packuswb m0, m1
- vextracti128 xmm1, m0, 1
- movq [t0], xm0
- movhps [t0+t1], xm0
- lea t0, [t0+t1*2]
- movq [t0], xmm1
- movhps [t0+t1], xmm1
- AVG_END 4
- INIT_ZMM avx512
- cglobal pixel_avg_weight_w16
- BIWEIGHT_START
- AVG_START 5
- .height_loop:
- movu xm0, [t2]
- movu xm1, [t4]
- vinserti128 ym0, [t2+t3], 1
- vinserti128 ym1, [t4+t5], 1
- lea t2, [t2+t3*2]
- lea t4, [t4+t5*2]
- vinserti32x4 m0, [t2], 2
- vinserti32x4 m1, [t4], 2
- vinserti32x4 m0, [t2+t3], 3
- vinserti32x4 m1, [t4+t5], 3
- SBUTTERFLY bw, 0, 1, 2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- pmulhrsw m0, m4
- pmulhrsw m1, m4
- packuswb m0, m1
- mova [t0], xm0
- vextracti128 [t0+t1], ym0, 1
- lea t0, [t0+t1*2]
- vextracti32x4 [t0], m0, 2
- vextracti32x4 [t0+t1], m0, 3
- AVG_END 4
- %endif ;HIGH_BIT_DEPTH
- ;=============================================================================
- ; P frame explicit weighted prediction
- ;=============================================================================
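- ; These kernels apply H.264 explicit weighted prediction.  Scalar reference
- ; (illustrative sketch only; the packed constants loaded from r4 are assumed
- ; to be precomputed from the same scale/denom/offset by the C setup code):
- ;
- ;   static void mc_weight_ref( pixel *dst, intptr_t i_dst,
- ;                              pixel *src, intptr_t i_src,
- ;                              int scale, int denom, int offset,
- ;                              int width, int height )
- ;   {
- ;       for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
- ;           for( int x = 0; x < width; x++ )
- ;           {
- ;               int v = denom ? (src[x]*scale + (1<<(denom-1))) >> denom
- ;                             : src[x]*scale;
- ;               dst[x] = x264_clip_pixel( v + offset );
- ;           }
- ;   }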
- %if HIGH_BIT_DEPTH
- ; width
- %macro WEIGHT_START 1
- mova m0, [r4+ 0] ; 1<<denom
- mova m3, [r4+16]
- movd m2, [r4+32] ; denom
- mova m4, [pw_pixel_max]
- paddw m2, [sq_1] ; denom+1
- %endmacro
- ; src1, src2
- %macro WEIGHT 2
- movh m5, [%1]
- movh m6, [%2]
- punpcklwd m5, m0
- punpcklwd m6, m0
- pmaddwd m5, m3
- pmaddwd m6, m3
- psrad m5, m2
- psrad m6, m2
- packssdw m5, m6
- %endmacro
- ; src, dst, width
- %macro WEIGHT_TWO_ROW 4
- %assign x 0
- %rep (%3+mmsize/2-1)/(mmsize/2)
- %if %3-x/2 <= 4 && mmsize == 16
- WEIGHT %1+x, %1+r3+x
- CLIPW m5, [pb_0], m4
- movh [%2+x], m5
- movhps [%2+r1+x], m5
- %else
- WEIGHT %1+x, %1+x+mmsize/2
- SWAP 5, 7
- WEIGHT %1+r3+x, %1+r3+x+mmsize/2
- CLIPW m5, [pb_0], m4
- CLIPW m7, [pb_0], m4
- mova [%2+x], m7
- mova [%2+r1+x], m5
- %endif
- %assign x x+mmsize
- %endrep
- %endmacro
- %else ; !HIGH_BIT_DEPTH
- %macro WEIGHT_START 1
- %if cpuflag(avx2)
- vbroadcasti128 m3, [r4]
- vbroadcasti128 m4, [r4+16]
- %else
- mova m3, [r4]
- mova m4, [r4+16]
- %if notcpuflag(ssse3)
- movd m5, [r4+32]
- %endif
- %endif
- pxor m2, m2
- %endmacro
- ; src1, src2, dst1, dst2, fast
- %macro WEIGHT_ROWx2 5
- movh m0, [%1 ]
- movh m1, [%1+mmsize/2]
- movh m6, [%2 ]
- movh m7, [%2+mmsize/2]
- punpcklbw m0, m2
- punpcklbw m1, m2
- punpcklbw m6, m2
- punpcklbw m7, m2
- %if cpuflag(ssse3)
- %if %5==0
- psllw m0, 7
- psllw m1, 7
- psllw m6, 7
- psllw m7, 7
- %endif
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- pmulhrsw m6, m3
- pmulhrsw m7, m3
- paddw m0, m4
- paddw m1, m4
- paddw m6, m4
- paddw m7, m4
- %else
- pmullw m0, m3
- pmullw m1, m3
- pmullw m6, m3
- pmullw m7, m3
- paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
- paddsw m1, m4
- paddsw m6, m4
- paddsw m7, m4
- psraw m0, m5
- psraw m1, m5
- psraw m6, m5
- psraw m7, m5
- %endif
- packuswb m0, m1
- packuswb m6, m7
- mova [%3], m0
- mova [%4], m6
- %endmacro
- ; src1, src2, dst1, dst2, width, fast
- %macro WEIGHT_COL 6
- %if cpuflag(avx2)
- %if %5==16
- movu xm0, [%1]
- vinserti128 m0, m0, [%2], 1
- punpckhbw m1, m0, m2
- punpcklbw m0, m0, m2
- %if %6==0
- psllw m0, 7
- psllw m1, 7
- %endif
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- paddw m0, m4
- paddw m1, m4
- packuswb m0, m1
- mova [%3], xm0
- vextracti128 [%4], m0, 1
- %else
- movq xm0, [%1]
- vinserti128 m0, m0, [%2], 1
- punpcklbw m0, m2
- %if %6==0
- psllw m0, 7
- %endif
- pmulhrsw m0, m3
- paddw m0, m4
- packuswb m0, m0
- vextracti128 xm1, m0, 1
- %if %5 == 8
- movq [%3], xm0
- movq [%4], xm1
- %else
- movd [%3], xm0
- movd [%4], xm1
- %endif
- %endif
- %else
- movh m0, [%1]
- movh m1, [%2]
- punpcklbw m0, m2
- punpcklbw m1, m2
- %if cpuflag(ssse3)
- %if %6==0
- psllw m0, 7
- psllw m1, 7
- %endif
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- paddw m0, m4
- paddw m1, m4
- %else
- pmullw m0, m3
- pmullw m1, m3
- paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
- paddsw m1, m4
- psraw m0, m5
- psraw m1, m5
- %endif
- %if %5 == 8
- packuswb m0, m1
- movh [%3], m0
- movhps [%4], m0
- %else
- packuswb m0, m0
- packuswb m1, m1
- movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
- movd [%4], m1
- %endif
- %endif
- %endmacro
- ; src, dst, width
- %macro WEIGHT_TWO_ROW 4
- %assign x 0
- %rep %3
- %if (%3-x) >= mmsize
- WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
- %assign x (x+mmsize)
- %else
- %assign w %3-x
- %if w == 20
- %assign w 16
- %endif
- WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
- %assign x (x+w)
- %endif
- %if x >= %3
- %exitrep
- %endif
- %endrep
- %endmacro
- %endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
- ;-----------------------------------------------------------------------------
- %macro WEIGHTER 1
- cglobal mc_weight_w%1, 6,6,8
- FIX_STRIDES r1, r3
- WEIGHT_START %1
- %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
- ; we can merge the shift step into the scale factor
- ; if (m3<<7) doesn't overflow an int16_t
- cmp byte [r4+1], 0
- jz .fast
- %endif
- .loop:
- WEIGHT_TWO_ROW r2, r0, %1, 0
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r5d, 2
- jg .loop
- RET
- %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
- .fast:
- psllw m3, 7
- .fastloop:
- WEIGHT_TWO_ROW r2, r0, %1, 1
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r5d, 2
- jg .fastloop
- RET
- %endif
- %endmacro
- INIT_MMX mmx2
- WEIGHTER 4
- WEIGHTER 8
- WEIGHTER 12
- WEIGHTER 16
- WEIGHTER 20
- INIT_XMM sse2
- WEIGHTER 8
- WEIGHTER 16
- WEIGHTER 20
- %if HIGH_BIT_DEPTH
- WEIGHTER 12
- %else
- INIT_MMX ssse3
- WEIGHTER 4
- INIT_XMM ssse3
- WEIGHTER 8
- WEIGHTER 16
- WEIGHTER 20
- INIT_YMM avx2
- WEIGHTER 8
- WEIGHTER 16
- WEIGHTER 20
- %endif
- %macro OFFSET_OP 7
- mov%6 m0, [%1]
- mov%6 m1, [%2]
- %if HIGH_BIT_DEPTH
- p%5usw m0, m2
- p%5usw m1, m2
- %ifidn %5,add
- pminsw m0, m3
- pminsw m1, m3
- %endif
- %else
- p%5usb m0, m2
- p%5usb m1, m2
- %endif
- mov%7 [%3], m0
- mov%7 [%4], m1
- %endmacro
- %macro OFFSET_TWO_ROW 4
- %assign x 0
- %rep %3
- %if (%3*SIZEOF_PIXEL-x) >= mmsize
- OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
- %assign x (x+mmsize)
- %else
- %if HIGH_BIT_DEPTH
- OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
- %else
- OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
- %endif
- %exitrep
- %endif
- %if x >= %3*SIZEOF_PIXEL
- %exitrep
- %endif
- %endrep
- %endmacro
- ;-----------------------------------------------------------------------------
- ;void mc_offset_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *w, int h )
- ;-----------------------------------------------------------------------------
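- ; Scalar reference (illustrative sketch only): the offset is applied with
- ; unsigned saturation, matching the paddus*/psubus* used below:
- ;
- ;   static void mc_offset_add_ref( pixel *dst, intptr_t i_dst,
- ;                                  pixel *src, intptr_t i_src,
- ;                                  int offset, int width, int height )
- ;   {
- ;       for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
- ;           for( int x = 0; x < width; x++ )
- ;               dst[x] = x264_clip_pixel( src[x] + offset );  /* offset >= 0 */
- ;   }
- ;
- ; mc_offset_sub is the same with src[x] - offset, saturating at 0.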
- %macro OFFSET 2
- cglobal mc_offset%2_w%1, 6,6
- FIX_STRIDES r1, r3
- mova m2, [r4]
- %if HIGH_BIT_DEPTH
- %ifidn %2,add
- mova m3, [pw_pixel_max]
- %endif
- %endif
- .loop:
- OFFSET_TWO_ROW r2, r0, %1, %2
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
- sub r5d, 2
- jg .loop
- RET
- %endmacro
- %macro OFFSETPN 1
- OFFSET %1, add
- OFFSET %1, sub
- %endmacro
- INIT_MMX mmx2
- OFFSETPN 4
- OFFSETPN 8
- OFFSETPN 12
- OFFSETPN 16
- OFFSETPN 20
- INIT_XMM sse2
- OFFSETPN 12
- OFFSETPN 16
- OFFSETPN 20
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- OFFSETPN 8
- %endif
- ;=============================================================================
- ; pixel avg
- ;=============================================================================
- ;-----------------------------------------------------------------------------
- ; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
- ; pixel *src2, intptr_t src2_stride, int weight );
- ;-----------------------------------------------------------------------------
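- ; Dispatch sketch (illustrative only): the AVGH stubs below use the rounding
- ; pavg* average for the common weight==32 case and tail-jump to the biweight
- ; kernels otherwise, which is equivalent to:
- ;
- ;   static void pixel_avg_ref( pixel *dst,  intptr_t i_dst,
- ;                              pixel *src1, intptr_t i_src1,
- ;                              pixel *src2, intptr_t i_src2,
- ;                              int width, int height, int weight )
- ;   {
- ;       for( int y = 0; y < height; y++ )
- ;       {
- ;           for( int x = 0; x < width; x++ )
- ;               dst[x] = weight == 32
- ;                      ? (src1[x] + src2[x] + 1) >> 1
- ;                      : x264_clip_pixel( (src1[x]*weight + src2[x]*(64-weight) + 32) >> 6 );
- ;           dst += i_dst; src1 += i_src1; src2 += i_src2;
- ;       }
- ;   }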
- %macro AVGH 2
- cglobal pixel_avg_%1x%2
- mov eax, %2
- cmp dword r6m, 32
- jne pixel_avg_weight_w%1 %+ SUFFIX
- %if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
- jmp pixel_avg_w%1_avx2
- %else
- %if mmsize == 16 && %1 == 16
- test dword r4m, 15
- jz pixel_avg_w%1_sse2
- %endif
- jmp pixel_avg_w%1_mmx2
- %endif
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
- ; pixel *src2, intptr_t src2_stride, int height, int weight );
- ;-----------------------------------------------------------------------------
- %macro AVG_FUNC 3
- cglobal pixel_avg_w%1
- AVG_START
- .height_loop:
- %assign x 0
- %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
- %2 m0, [t2+x]
- %2 m1, [t2+x+SIZEOF_PIXEL*t3]
- %if HIGH_BIT_DEPTH
- pavgw m0, [t4+x]
- pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
- %else ;!HIGH_BIT_DEPTH
- pavgb m0, [t4+x]
- pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
- %endif
- %3 [t0+x], m0
- %3 [t0+x+SIZEOF_PIXEL*t1], m1
- %assign x x+mmsize
- %endrep
- AVG_END
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_MMX mmx2
- AVG_FUNC 4, movq, movq
- AVGH 4, 16
- AVGH 4, 8
- AVGH 4, 4
- AVGH 4, 2
- AVG_FUNC 8, movq, movq
- AVGH 8, 16
- AVGH 8, 8
- AVGH 8, 4
- AVG_FUNC 16, movq, movq
- AVGH 16, 16
- AVGH 16, 8
- INIT_XMM sse2
- AVG_FUNC 4, movq, movq
- AVGH 4, 16
- AVGH 4, 8
- AVGH 4, 4
- AVGH 4, 2
- AVG_FUNC 8, movdqu, movdqa
- AVGH 8, 16
- AVGH 8, 8
- AVGH 8, 4
- AVG_FUNC 16, movdqu, movdqa
- AVGH 16, 16
- AVGH 16, 8
- %else ;!HIGH_BIT_DEPTH
- INIT_MMX mmx2
- AVG_FUNC 4, movd, movd
- AVGH 4, 16
- AVGH 4, 8
- AVGH 4, 4
- AVGH 4, 2
- AVG_FUNC 8, movq, movq
- AVGH 8, 16
- AVGH 8, 8
- AVGH 8, 4
- AVG_FUNC 16, movq, movq
- AVGH 16, 16
- AVGH 16, 8
- INIT_XMM sse2
- AVG_FUNC 16, movdqu, movdqa
- AVGH 16, 16
- AVGH 16, 8
- AVGH 8, 16
- AVGH 8, 8
- AVGH 8, 4
- INIT_XMM ssse3
- AVGH 16, 16
- AVGH 16, 8
- AVGH 8, 16
- AVGH 8, 8
- AVGH 8, 4
- INIT_MMX ssse3
- AVGH 4, 16
- AVGH 4, 8
- AVGH 4, 4
- AVGH 4, 2
- INIT_XMM avx2
- AVG_FUNC 16, movdqu, movdqa
- AVGH 16, 16
- AVGH 16, 8
- INIT_XMM avx512
- AVGH 16, 16
- AVGH 16, 8
- AVGH 8, 16
- AVGH 8, 8
- AVGH 8, 4
- %endif ;HIGH_BIT_DEPTH
- ;=============================================================================
- ; pixel avg2
- ;=============================================================================
- %if HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
- ; uint16_t *src1, intptr_t src_stride,
- ; uint16_t *src2, int height );
- ;-----------------------------------------------------------------------------
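- ; Scalar reference (illustrative sketch only): both sources share one stride
- ; and the result is the rounded average used to combine two halfpel planes:
- ;
- ;   static void pixel_avg2_ref( pixel *dst, intptr_t i_dst,
- ;                               pixel *src1, intptr_t i_src,
- ;                               pixel *src2, int width, int height )
- ;   {
- ;       for( int y = 0; y < height; y++, dst += i_dst, src1 += i_src, src2 += i_src )
- ;           for( int x = 0; x < width; x++ )
- ;               dst[x] = (src1[x] + src2[x] + 1) >> 1;
- ;   }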
- %macro AVG2_W_ONE 1
- cglobal pixel_avg2_w%1, 6,7,4
- sub r4, r2
- lea r6, [r4+r3*2]
- .height_loop:
- movu m0, [r2]
- movu m1, [r2+r3*2]
- %if cpuflag(avx) || mmsize == 8
- pavgw m0, [r2+r4]
- pavgw m1, [r2+r6]
- %else
- movu m2, [r2+r4]
- movu m3, [r2+r6]
- pavgw m0, m2
- pavgw m1, m3
- %endif
- mova [r0], m0
- mova [r0+r1*2], m1
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r5d, 2
- jg .height_loop
- RET
- %endmacro
- %macro AVG2_W_TWO 3
- cglobal pixel_avg2_w%1, 6,7,8
- sub r4, r2
- lea r6, [r4+r3*2]
- .height_loop:
- movu m0, [r2]
- %2 m1, [r2+mmsize]
- movu m2, [r2+r3*2]
- %2 m3, [r2+r3*2+mmsize]
- %if mmsize == 8
- pavgw m0, [r2+r4]
- pavgw m1, [r2+r4+mmsize]
- pavgw m2, [r2+r6]
- pavgw m3, [r2+r6+mmsize]
- %else
- movu m4, [r2+r4]
- %2 m5, [r2+r4+mmsize]
- movu m6, [r2+r6]
- %2 m7, [r2+r6+mmsize]
- pavgw m0, m4
- pavgw m1, m5
- pavgw m2, m6
- pavgw m3, m7
- %endif
- mova [r0], m0
- %3 [r0+mmsize], m1
- mova [r0+r1*2], m2
- %3 [r0+r1*2+mmsize], m3
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r5d, 2
- jg .height_loop
- RET
- %endmacro
- INIT_MMX mmx2
- AVG2_W_ONE 4
- AVG2_W_TWO 8, movu, mova
- INIT_XMM sse2
- AVG2_W_ONE 8
- AVG2_W_TWO 10, movd, movd
- AVG2_W_TWO 16, movu, mova
- INIT_YMM avx2
- AVG2_W_ONE 16
- INIT_MMX
- cglobal pixel_avg2_w10_mmx2, 6,7
- sub r4, r2
- lea r6, [r4+r3*2]
- .height_loop:
- movu m0, [r2+ 0]
- movu m1, [r2+ 8]
- movh m2, [r2+16]
- movu m3, [r2+r3*2+ 0]
- movu m4, [r2+r3*2+ 8]
- movh m5, [r2+r3*2+16]
- pavgw m0, [r2+r4+ 0]
- pavgw m1, [r2+r4+ 8]
- pavgw m2, [r2+r4+16]
- pavgw m3, [r2+r6+ 0]
- pavgw m4, [r2+r6+ 8]
- pavgw m5, [r2+r6+16]
- mova [r0+ 0], m0
- mova [r0+ 8], m1
- movh [r0+16], m2
- mova [r0+r1*2+ 0], m3
- mova [r0+r1*2+ 8], m4
- movh [r0+r1*2+16], m5
- lea r2, [r2+r3*2*2]
- lea r0, [r0+r1*2*2]
- sub r5d, 2
- jg .height_loop
- RET
- cglobal pixel_avg2_w16_mmx2, 6,7
- sub r4, r2
- lea r6, [r4+r3*2]
- .height_loop:
- movu m0, [r2+ 0]
- movu m1, [r2+ 8]
- movu m2, [r2+16]
- movu m3, [r2+24]
- movu m4, [r2+r3*2+ 0]
- movu m5, [r2+r3*2+ 8]
- movu m6, [r2+r3*2+16]
- movu m7, [r2+r3*2+24]
- pavgw m0, [r2+r4+ 0]
- pavgw m1, [r2+r4+ 8]
- pavgw m2, [r2+r4+16]
- pavgw m3, [r2+r4+24]
- pavgw m4, [r2+r6+ 0]
- pavgw m5, [r2+r6+ 8]
- pavgw m6, [r2+r6+16]
- pavgw m7, [r2+r6+24]
- mova [r0+ 0], m0
- mova [r0+ 8], m1
- mova [r0+16], m2
- mova [r0+24], m3
- mova [r0+r1*2+ 0], m4
- mova [r0+r1*2+ 8], m5
- mova [r0+r1*2+16], m6
- mova [r0+r1*2+24], m7
- lea r2, [r2+r3*2*2]
- lea r0, [r0+r1*2*2]
- sub r5d, 2
- jg .height_loop
- RET
- cglobal pixel_avg2_w18_mmx2, 6,7
- sub r4, r2
- .height_loop:
- movu m0, [r2+ 0]
- movu m1, [r2+ 8]
- movu m2, [r2+16]
- movu m3, [r2+24]
- movh m4, [r2+32]
- pavgw m0, [r2+r4+ 0]
- pavgw m1, [r2+r4+ 8]
- pavgw m2, [r2+r4+16]
- pavgw m3, [r2+r4+24]
- pavgw m4, [r2+r4+32]
- mova [r0+ 0], m0
- mova [r0+ 8], m1
- mova [r0+16], m2
- mova [r0+24], m3
- movh [r0+32], m4
- lea r2, [r2+r3*2]
- lea r0, [r0+r1*2]
- dec r5d
- jg .height_loop
- RET
- %macro PIXEL_AVG_W18 0
- cglobal pixel_avg2_w18, 6,7
- sub r4, r2
- .height_loop:
- movu m0, [r2+ 0]
- movd xm2, [r2+32]
- %if mmsize == 32
- pavgw m0, [r2+r4+ 0]
- movd xm1, [r2+r4+32]
- pavgw xm2, xm1
- %else
- movu m1, [r2+16]
- movu m3, [r2+r4+ 0]
- movu m4, [r2+r4+16]
- movd m5, [r2+r4+32]
- pavgw m0, m3
- pavgw m1, m4
- pavgw m2, m5
- mova [r0+16], m1
- %endif
- mova [r0+ 0], m0
- movd [r0+32], xm2
- lea r2, [r2+r3*2]
- lea r0, [r0+r1*2]
- dec r5d
- jg .height_loop
- RET
- %endmacro
- INIT_XMM sse2
- PIXEL_AVG_W18
- INIT_YMM avx2
- PIXEL_AVG_W18
- %endif ; HIGH_BIT_DEPTH
- %if HIGH_BIT_DEPTH == 0
- ;-----------------------------------------------------------------------------
- ; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
- ; uint8_t *src1, intptr_t src_stride,
- ; uint8_t *src2, int height );
- ;-----------------------------------------------------------------------------
- %macro AVG2_W8 2
- cglobal pixel_avg2_w%1_mmx2, 6,7
- sub r4, r2
- lea r6, [r4+r3]
- .height_loop:
- %2 mm0, [r2]
- %2 mm1, [r2+r3]
- pavgb mm0, [r2+r4]
- pavgb mm1, [r2+r6]
- lea r2, [r2+r3*2]
- %2 [r0], mm0
- %2 [r0+r1], mm1
- lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- RET
- %endmacro
- INIT_MMX
- AVG2_W8 4, movd
- AVG2_W8 8, movq
- %macro AVG2_W16 2
- cglobal pixel_avg2_w%1_mmx2, 6,7
- sub r2, r4
- lea r6, [r2+r3]
- .height_loop:
- movq mm0, [r4]
- %2 mm1, [r4+8]
- movq mm2, [r4+r3]
- %2 mm3, [r4+r3+8]
- pavgb mm0, [r4+r2]
- pavgb mm1, [r4+r2+8]
- pavgb mm2, [r4+r6]
- pavgb mm3, [r4+r6+8]
- lea r4, [r4+r3*2]
- movq [r0], mm0
- %2 [r0+8], mm1
- movq [r0+r1], mm2
- %2 [r0+r1+8], mm3
- lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- RET
- %endmacro
- AVG2_W16 12, movd
- AVG2_W16 16, movq
- cglobal pixel_avg2_w20_mmx2, 6,7
- sub r2, r4
- lea r6, [r2+r3]
- .height_loop:
- movq mm0, [r4]
- movq mm1, [r4+8]
- movd mm2, [r4+16]
- movq mm3, [r4+r3]
- movq mm4, [r4+r3+8]
- movd mm5, [r4+r3+16]
- pavgb mm0, [r4+r2]
- pavgb mm1, [r4+r2+8]
- pavgb mm2, [r4+r2+16]
- pavgb mm3, [r4+r6]
- pavgb mm4, [r4+r6+8]
- pavgb mm5, [r4+r6+16]
- lea r4, [r4+r3*2]
- movq [r0], mm0
- movq [r0+8], mm1
- movd [r0+16], mm2
- movq [r0+r1], mm3
- movq [r0+r1+8], mm4
- movd [r0+r1+16], mm5
- lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- RET
- INIT_XMM
- cglobal pixel_avg2_w16_sse2, 6,7
- sub r4, r2
- lea r6, [r4+r3]
- .height_loop:
- movu m0, [r2]
- movu m2, [r2+r3]
- movu m1, [r2+r4]
- movu m3, [r2+r6]
- lea r2, [r2+r3*2]
- pavgb m0, m1
- pavgb m2, m3
- mova [r0], m0
- mova [r0+r1], m2
- lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- RET
- cglobal pixel_avg2_w20_sse2, 6,7
- sub r2, r4
- lea r6, [r2+r3]
- .height_loop:
- movu m0, [r4]
- movu m2, [r4+r3]
- movu m1, [r4+r2]
- movu m3, [r4+r6]
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb m0, m1
- pavgb m2, m3
- pavgb mm4, [r4+r2+16]
- pavgb mm5, [r4+r6+16]
- lea r4, [r4+r3*2]
- mova [r0], m0
- mova [r0+r1], m2
- movd [r0+16], mm4
- movd [r0+r1+16], mm5
- lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- RET
- INIT_YMM avx2
- cglobal pixel_avg2_w20, 6,7
- sub r2, r4
- lea r6, [r2+r3]
- .height_loop:
- movu m0, [r4]
- movu m1, [r4+r3]
- pavgb m0, [r4+r2]
- pavgb m1, [r4+r6]
- lea r4, [r4+r3*2]
- mova [r0], m0
- mova [r0+r1], m1
- lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- RET
- ; Cacheline split code for processors with high latencies for loads
- ; split over cache lines. See sad-a.asm for a more detailed explanation.
- ; This particular instance is complicated by the fact that src1 and src2
- ; can have different alignments. For simplicity and code size, only the
- ; MMX cacheline workaround is used. As a result, in the case of SSE2
- ; pixel_avg, the cacheline check functions call the SSE2 version if there
- ; is no cacheline split, and the MMX workaround if there is.
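- ; Each AVG_CACHELINE_LOOP below rebuilds one unaligned qword per source from
- ; two aligned loads, using per-source shift counts set up by INIT_SHIFT.  In C
- ; terms (illustrative sketch only, little-endian, assuming <stdint.h>):
- ;
- ;   static uint64_t load8_split( const uint8_t *p ) /* may cross a cacheline */
- ;   {
- ;       const uint64_t *a = (const uint64_t*)((uintptr_t)p & ~(uintptr_t)7);
- ;       int shift = ((uintptr_t)p & 7) * 8;
- ;       uint64_t lo = a[0] >> shift;
- ;       uint64_t hi = shift ? a[1] << (64 - shift) : 0; /* psllq by 64 gives 0 */
- ;       return lo | hi;
- ;   }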
- %macro INIT_SHIFT 2
- and eax, 7
- shl eax, 3
- movd %1, [sw_64]
- movd %2, eax
- psubw %1, %2
- %endmacro
- %macro AVG_CACHELINE_START 0
- %assign stack_offset 0
- INIT_SHIFT mm6, mm7
- mov eax, r4m
- INIT_SHIFT mm4, mm5
- PROLOGUE 6,6
- and r2, ~7
- and r4, ~7
- sub r4, r2
- .height_loop:
- %endmacro
- %macro AVG_CACHELINE_LOOP 2
- movq mm1, [r2+%1]
- movq mm0, [r2+8+%1]
- movq mm3, [r2+r4+%1]
- movq mm2, [r2+r4+8+%1]
- psrlq mm1, mm7
- psllq mm0, mm6
- psrlq mm3, mm5
- psllq mm2, mm4
- por mm0, mm1
- por mm2, mm3
- pavgb mm2, mm0
- %2 [r0+%1], mm2
- %endmacro
- %macro AVG_CACHELINE_FUNC 2
- pixel_avg2_w%1_cache_mmx2:
- AVG_CACHELINE_START
- AVG_CACHELINE_LOOP 0, movq
- %if %1>8
- AVG_CACHELINE_LOOP 8, movq
- %if %1>16
- AVG_CACHELINE_LOOP 16, movd
- %endif
- %endif
- add r2, r3
- add r0, r1
- dec r5d
- jg .height_loop
- RET
- %endmacro
- %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
- %if %1 == 12
- ;w12 isn't needed because w16 is just as fast if there's no cacheline split
- %define cachesplit pixel_avg2_w16_cache_mmx2
- %else
- %define cachesplit pixel_avg2_w%1_cache_mmx2
- %endif
- cglobal pixel_avg2_w%1_cache%2_%3
- mov eax, r2m
- and eax, %2-1
- cmp eax, (%2-%1-(%1 % 8))
- %if %1==12||%1==20
- jbe pixel_avg2_w%1_%3
- %else
- jb pixel_avg2_w%1_%3
- %endif
- %if 0 ; or %1==8 - but the extra branch seems too expensive
- ja cachesplit
- %if ARCH_X86_64
- test r4b, 1
- %else
- test byte r4m, 1
- %endif
- jz pixel_avg2_w%1_%3
- %else
- or eax, r4m
- and eax, 7
- jz pixel_avg2_w%1_%3
- mov eax, r2m
- %endif
- %if mmsize==16 || (%1==8 && %2==64)
- AVG_CACHELINE_FUNC %1, %2
- %else
- jmp cachesplit
- %endif
- %endmacro
- INIT_MMX
- AVG_CACHELINE_CHECK 8, 64, mmx2
- AVG_CACHELINE_CHECK 12, 64, mmx2
- %if ARCH_X86_64 == 0
- AVG_CACHELINE_CHECK 16, 64, mmx2
- AVG_CACHELINE_CHECK 20, 64, mmx2
- AVG_CACHELINE_CHECK 8, 32, mmx2
- AVG_CACHELINE_CHECK 12, 32, mmx2
- AVG_CACHELINE_CHECK 16, 32, mmx2
- AVG_CACHELINE_CHECK 20, 32, mmx2
- %endif
- INIT_XMM
- AVG_CACHELINE_CHECK 16, 64, sse2
- AVG_CACHELINE_CHECK 20, 64, sse2
- ; computed jump assumes this loop is exactly 48 bytes
- %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
- ALIGN 16
- avg_w16_align%1_%2_ssse3:
- %if %1==0 && %2==0
- movdqa xmm1, [r2]
- pavgb xmm1, [r2+r4]
- add r2, r3
- %elif %1==0
- movdqa xmm1, [r2+r4+16]
- palignr xmm1, [r2+r4], %2
- pavgb xmm1, [r2]
- add r2, r3
- %elif %2&15==0
- movdqa xmm1, [r2+16]
- palignr xmm1, [r2], %1
- pavgb xmm1, [r2+r4]
- add r2, r3
- %else
- movdqa xmm1, [r2+16]
- movdqa xmm2, [r2+r4+16]
- palignr xmm1, [r2], %1
- palignr xmm2, [r2+r4], %2&15
- add r2, r3
- pavgb xmm1, xmm2
- %endif
- movdqa [r0], xmm1
- add r0, r1
- dec r5d
- jg avg_w16_align%1_%2_ssse3
- ret
- %if %1==0
- ; make sure the first ones don't end up short
- ALIGN 16
- times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
- %endif
- %endmacro
- cglobal pixel_avg2_w16_cache64_ssse3
- %if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
- mov eax, r2m
- and eax, 0x3f
- cmp eax, 0x30
- jb pixel_avg2_w16_sse2
- or eax, r4m
- and eax, 7
- jz pixel_avg2_w16_sse2
- %endif
- PROLOGUE 6, 8
- lea r6, [r4+r2]
- and r4, ~0xf
- and r6, 0x1f
- and r2, ~0xf
- lea r6, [r6*3] ;(offset + align*2)*3
- sub r4, r2
- shl r6, 4 ;jump = (offset + align*2)*48
- %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
- %ifdef PIC
- lea r7, [avg_w16_addr]
- add r6, r7
- %else
- lea r6, [avg_w16_addr + r6]
- %endif
- TAIL_CALL r6, 1
- %assign j 0
- %assign k 1
- %rep 16
- AVG16_CACHELINE_LOOP_SSSE3 j, j
- AVG16_CACHELINE_LOOP_SSSE3 j, k
- %assign j j+1
- %assign k k+1
- %endrep
- %endif ; !HIGH_BIT_DEPTH
- ;=============================================================================
- ; pixel copy
- ;=============================================================================
- %macro COPY1 2
- movu m0, [r2]
- movu m1, [r2+r3]
- movu m2, [r2+r3*2]
- movu m3, [r2+%2]
- mova [r0], m0
- mova [r0+r1], m1
- mova [r0+r1*2], m2
- mova [r0+%1], m3
- %endmacro
- %macro COPY2 2-4 0, 1
- movu m0, [r2+%3*mmsize]
- movu m1, [r2+%4*mmsize]
- movu m2, [r2+r3+%3*mmsize]
- movu m3, [r2+r3+%4*mmsize]
- mova [r0+%3*mmsize], m0
- mova [r0+%4*mmsize], m1
- mova [r0+r1+%3*mmsize], m2
- mova [r0+r1+%4*mmsize], m3
- movu m0, [r2+r3*2+%3*mmsize]
- movu m1, [r2+r3*2+%4*mmsize]
- movu m2, [r2+%2+%3*mmsize]
- movu m3, [r2+%2+%4*mmsize]
- mova [r0+r1*2+%3*mmsize], m0
- mova [r0+r1*2+%4*mmsize], m1
- mova [r0+%1+%3*mmsize], m2
- mova [r0+%1+%4*mmsize], m3
- %endmacro
- %macro COPY4 2
- COPY2 %1, %2, 0, 1
- COPY2 %1, %2, 2, 3
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
- ; uint8_t *src, intptr_t i_src_stride, int i_height )
- ;-----------------------------------------------------------------------------
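- ; Scalar reference (illustrative sketch only, assuming <string.h>):
- ;
- ;   static void mc_copy_ref( pixel *dst, intptr_t i_dst,
- ;                            pixel *src, intptr_t i_src, int width, int height )
- ;   {
- ;       for( int y = 0; y < height; y++, dst += i_dst, src += i_src )
- ;           memcpy( dst, src, width * sizeof(pixel) );
- ;   }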
- INIT_MMX
- cglobal mc_copy_w4_mmx, 4,6
- FIX_STRIDES r1, r3
- cmp dword r4m, 4
- lea r5, [r3*3]
- lea r4, [r1*3]
- je .end
- %if HIGH_BIT_DEPTH == 0
- %define mova movd
- %define movu movd
- %endif
- COPY1 r4, r5
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- .end:
- COPY1 r4, r5
- RET
- %macro MC_COPY 1
- %assign %%w %1*SIZEOF_PIXEL/mmsize
- %if %%w > 0
- cglobal mc_copy_w%1, 5,7
- FIX_STRIDES r1, r3
- lea r6, [r3*3]
- lea r5, [r1*3]
- .height_loop:
- COPY %+ %%w r5, r6
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- RET
- %endif
- %endmacro
- INIT_MMX mmx
- MC_COPY 8
- MC_COPY 16
- INIT_XMM sse
- MC_COPY 8
- MC_COPY 16
- INIT_XMM aligned, sse
- MC_COPY 16
- %if HIGH_BIT_DEPTH
- INIT_YMM avx
- MC_COPY 16
- INIT_YMM aligned, avx
- MC_COPY 16
- %endif
- ;=============================================================================
- ; prefetch
- ;=============================================================================
- ; assumes 64 byte cachelines
- ; FIXME doesn't cover all pixels in high depth and/or 4:4:4
- ;-----------------------------------------------------------------------------
- ; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
- ; pixel *pix_uv, intptr_t stride_uv, int mb_x )
- ;-----------------------------------------------------------------------------
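- ; In intrinsics terms the 420 variant amounts to the following (illustrative
- ; sketch only, assuming <xmmintrin.h>; strides are in pixels here):
- ;
- ;   static void prefetch_fenc_420_ref( pixel *pix_y, intptr_t stride_y,
- ;                                      pixel *pix_uv, intptr_t stride_uv, int mb_x )
- ;   {
- ;       pix_y  += (mb_x&3) * stride_y  * 4 + 64;
- ;       pix_uv += (mb_x&3) * stride_uv * 2 + 64;
- ;       for( int i = 0; i < 4; i++ )
- ;           _mm_prefetch( (const char*)&pix_y[i*stride_y], _MM_HINT_T0 );
- ;       for( int i = 0; i < 2; i++ )
- ;           _mm_prefetch( (const char*)&pix_uv[i*stride_uv], _MM_HINT_T0 );
- ;   }
- ;
- ; The 422 variant prefetches four chroma rows instead of two.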
- %macro PREFETCH_FENC 1
- %if ARCH_X86_64
- cglobal prefetch_fenc_%1, 5,5
- FIX_STRIDES r1, r3
- and r4d, 3
- mov eax, r4d
- imul r4d, r1d
- lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- lea r0, [r0+r1*2]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- imul eax, r3d
- lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
- prefetcht0 [r2]
- prefetcht0 [r2+r3]
- %ifidn %1, 422
- lea r2, [r2+r3*2]
- prefetcht0 [r2]
- prefetcht0 [r2+r3]
- %endif
- RET
- %else
- cglobal prefetch_fenc_%1, 0,3
- mov r2, r4m
- mov r1, r1m
- mov r0, r0m
- FIX_STRIDES r1
- and r2, 3
- imul r2, r1
- lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- lea r0, [r0+r1*2]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- mov r2, r4m
- mov r1, r3m
- mov r0, r2m
- FIX_STRIDES r1
- and r2, 3
- imul r2, r1
- lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- %ifidn %1, 422
- lea r0, [r0+r1*2]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- %endif
- ret
- %endif ; ARCH_X86_64
- %endmacro
- INIT_MMX mmx2
- PREFETCH_FENC 420
- PREFETCH_FENC 422
- %if ARCH_X86_64
- DECLARE_REG_TMP 4
- %else
- DECLARE_REG_TMP 2
- %endif
- cglobal prefetch_fenc_400, 2,3
- movifnidn t0d, r4m
- FIX_STRIDES r1
- and t0d, 3
- imul t0d, r1d
- lea r0, [r0+t0*4+64*SIZEOF_PIXEL]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- lea r0, [r0+r1*2]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- RET
- ;-----------------------------------------------------------------------------
- ; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
- ;-----------------------------------------------------------------------------
- INIT_MMX mmx2
- cglobal prefetch_ref, 3,3
- FIX_STRIDES r1
- dec r2d
- and r2d, r1d
- lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
- lea r2, [r1*3]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- prefetcht0 [r0+r1*2]
- prefetcht0 [r0+r2]
- lea r0, [r0+r1*4]
- prefetcht0 [r0]
- prefetcht0 [r0+r1]
- prefetcht0 [r0+r1*2]
- prefetcht0 [r0+r2]
- RET
- ;=============================================================================
- ; chroma MC
- ;=============================================================================
- %if ARCH_X86_64
- DECLARE_REG_TMP 6,7,8
- %else
- DECLARE_REG_TMP 0,1,2
- %endif
- %macro MC_CHROMA_START 1
- %if ARCH_X86_64
- PROLOGUE 0,9,%1
- %else
- PROLOGUE 0,6,%1
- %endif
- movifnidn r3, r3mp
- movifnidn r4d, r4m
- movifnidn r5d, r5m
- movifnidn t0d, r6m
- mov t2d, t0d
- mov t1d, r5d
- sar t0d, 3
- sar t1d, 3
- imul t0d, r4d
- lea t0d, [t0+t1*2]
- FIX_STRIDES t0d
- movsxdifnidn t0, t0d
- add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
- %endmacro
- %if HIGH_BIT_DEPTH
- %macro UNPACK_UNALIGNED 4
- movu %1, [%4+0]
- movu %2, [%4+4]
- punpckhwd %3, %1, %2
- punpcklwd %1, %2
- %if mmsize == 8
- mova %2, %1
- punpcklwd %1, %3
- punpckhwd %2, %3
- %else
- shufps %2, %1, %3, q3131
- shufps %1, %3, q2020
- %endif
- %endmacro
- %else ; !HIGH_BIT_DEPTH
- %macro UNPACK_UNALIGNED 3
- %if mmsize == 8
- punpcklwd %1, %3
- %else
- movh %2, %3
- punpcklwd %1, %2
- %endif
- %endmacro
- %endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
- ; uint8_t *src, intptr_t src_stride,
- ; int dx, int dy,
- ; int width, int height )
- ;-----------------------------------------------------------------------------
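- ; These kernels perform 1/8-pel bilinear interpolation on interleaved (NV12-
- ; style) U/V chroma.  Per output pixel (scalar sketch, illustrative only):
- ;
- ;   cA = (8-dx)*(8-dy);  cB = dx*(8-dy);  cC = (8-dx)*dy;  cD = dx*dy;
- ;   /* U plane; V is identical with every source offset +1 */
- ;   dstu[x] = ( cA*src[2*x]  + cB*src[2*x+2] +
- ;               cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
- ;
- ; where only the fractional parts dx&7, dy&7 are used here, the integer part
- ; of the MV has already been folded into src (see MC_CHROMA_START), and
- ; srcp = src + src_stride.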
- %macro MC_CHROMA 0
- cglobal mc_chroma
- MC_CHROMA_START 0
- FIX_STRIDES r4
- and r5d, 7
- %if ARCH_X86_64
- jz .mc1dy
- %endif
- and t2d, 7
- %if ARCH_X86_64
- jz .mc1dx
- %endif
- shl r5d, 16
- add t2d, r5d
- mov t0d, t2d
- shl t2d, 8
- sub t2d, t0d
- add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
- cmp dword r7m, 4
- %if mmsize==8
- .skip_prologue:
- %else
- jl mc_chroma_mmx2 %+ .skip_prologue
- WIN64_SPILL_XMM 9
- %endif
- movd m5, t2d
- movifnidn r0, r0mp
- movifnidn r1, r1mp
- movifnidn r2d, r2m
- movifnidn r5d, r8m
- pxor m6, m6
- punpcklbw m5, m6
- %if mmsize==8
- pshufw m7, m5, q3232
- pshufw m6, m5, q0000
- pshufw m5, m5, q1111
- jge .width4
- %else
- %if WIN64
- cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
- %endif
- pshufd m7, m5, q1111
- punpcklwd m5, m5
- pshufd m6, m5, q0000
- pshufd m5, m5, q1111
- jg .width8
- %endif
- %if HIGH_BIT_DEPTH
- add r2, r2
- UNPACK_UNALIGNED m0, m1, m2, r3
- %else
- movu m0, [r3]
- UNPACK_UNALIGNED m0, m1, [r3+2]
- mova m1, m0
- pand m0, [pw_00ff]
- psrlw m1, 8
- %endif ; HIGH_BIT_DEPTH
- pmaddwd m0, m7
- pmaddwd m1, m7
- packssdw m0, m1
- SWAP 3, 0
- ALIGN 4
- .loop2:
- %if HIGH_BIT_DEPTH
- UNPACK_UNALIGNED m0, m1, m2, r3+r4
- pmullw m3, m6
- %else ; !HIGH_BIT_DEPTH
- movu m0, [r3+r4]
- UNPACK_UNALIGNED m0, m1, [r3+r4+2]
- pmullw m3, m6
- mova m1, m0
- pand m0, [pw_00ff]
- psrlw m1, 8
- %endif ; HIGH_BIT_DEPTH
- pmaddwd m0, m7
- pmaddwd m1, m7
- mova m2, [pw_32]
- packssdw m0, m1
- paddw m2, m3
- mova m3, m0
- pmullw m0, m5
- paddw m0, m2
- psrlw m0, 6
- %if HIGH_BIT_DEPTH
- movh [r0], m0
- %if mmsize == 8
- psrlq m0, 32
- movh [r1], m0
- %else
- movhps [r1], m0
- %endif
- %else ; !HIGH_BIT_DEPTH
- packuswb m0, m0
- movd [r0], m0
- %if mmsize==8
- psrlq m0, 16
- %else
- psrldq m0, 4
- %endif
- movd [r1], m0
- %endif ; HIGH_BIT_DEPTH
- add r3, r4
- add r0, r2
- add r1, r2
- dec r5d
- jg .loop2
- RET
- %if mmsize==8
- .width4:
- %if ARCH_X86_64
- mov t0, r0
- mov t1, r1
- mov t2, r3
- %if WIN64
- %define multy0 r4m
- %else
- %define multy0 [rsp-8]
- %endif
- mova multy0, m5
- %else
- mov r3m, r3
- %define multy0 r4m
- mova multy0, m5
- %endif
- %else
- .width8:
- %if ARCH_X86_64
- %define multy0 m8
- SWAP 8, 5
- %else
- %define multy0 r0m
- mova multy0, m5
- %endif
- %endif
- FIX_STRIDES r2
- .loopx:
- %if HIGH_BIT_DEPTH
- UNPACK_UNALIGNED m0, m2, m4, r3
- UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
- %else
- movu m0, [r3]
- movu m1, [r3+mmsize/2]
- UNPACK_UNALIGNED m0, m2, [r3+2]
- UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- psrlw m2, m0, 8
- psrlw m3, m1, 8
- pand m0, [pw_00ff]
- pand m1, [pw_00ff]
- %endif
- pmaddwd m0, m7
- pmaddwd m2, m7
- pmaddwd m1, m7
- pmaddwd m3, m7
- packssdw m0, m2
- packssdw m1, m3
- SWAP 4, 0
- SWAP 5, 1
- add r3, r4
- ALIGN 4
- .loop4:
- %if HIGH_BIT_DEPTH
- UNPACK_UNALIGNED m0, m1, m2, r3
- pmaddwd m0, m7
- pmaddwd m1, m7
- packssdw m0, m1
- UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
- pmaddwd m1, m7
- pmaddwd m2, m7
- packssdw m1, m2
- %else ; !HIGH_BIT_DEPTH
- movu m0, [r3]
- movu m1, [r3+mmsize/2]
- UNPACK_UNALIGNED m0, m2, [r3+2]
- UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- psrlw m2, m0, 8
- psrlw m3, m1, 8
- pand m0, [pw_00ff]
- pand m1, [pw_00ff]
- pmaddwd m0, m7
- pmaddwd m2, m7
- pmaddwd m1, m7
- pmaddwd m3, m7
- packssdw m0, m2
- packssdw m1, m3
- %endif ; HIGH_BIT_DEPTH
- pmullw m4, m6
- pmullw m5, m6
- mova m2, [pw_32]
- paddw m3, m2, m5
- paddw m2, m4
- mova m4, m0
- mova m5, m1
- pmullw m0, multy0
- pmullw m1, multy0
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 6
- psrlw m1, 6
- %if HIGH_BIT_DEPTH
- movh [r0], m0
- movh [r0+mmsize/2], m1
- %if mmsize==8
- psrlq m0, 32
- psrlq m1, 32
- movh [r1], m0
- movh [r1+mmsize/2], m1
- %else
- movhps [r1], m0
- movhps [r1+mmsize/2], m1
- %endif
- %else ; !HIGH_BIT_DEPTH
- packuswb m0, m1
- %if mmsize==8
- pshufw m1, m0, q0020
- pshufw m0, m0, q0031
- movd [r0], m1
- movd [r1], m0
- %else
- pshufd m0, m0, q3120
- movq [r0], m0
- movhps [r1], m0
- %endif
- %endif ; HIGH_BIT_DEPTH
- add r3, r4
- add r0, r2
- add r1, r2
- dec r5d
- jg .loop4
- %if mmsize!=8
- RET
- %else
- sub dword r7m, 4
- jg .width8
- RET
- .width8:
- %if ARCH_X86_64
- lea r3, [t2+8*SIZEOF_PIXEL]
- lea r0, [t0+4*SIZEOF_PIXEL]
- lea r1, [t1+4*SIZEOF_PIXEL]
- %else
- mov r3, r3m
- mov r0, r0m
- mov r1, r1m
- add r3, 8*SIZEOF_PIXEL
- add r0, 4*SIZEOF_PIXEL
- add r1, 4*SIZEOF_PIXEL
- %endif
- mov r5d, r8m
- jmp .loopx
- %endif
- %if ARCH_X86_64 ; too many regs for x86_32
- RESET_MM_PERMUTATION
- %if WIN64
- %assign stack_offset stack_offset - stack_size_padded
- %assign stack_size_padded 0
- %assign xmm_regs_used 0
- %endif
- .mc1dy:
- and t2d, 7
- movd m5, t2d
- mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
- jmp .mc1d
- .mc1dx:
- movd m5, r5d
- mov r6d, 2*SIZEOF_PIXEL
- .mc1d:
- %if HIGH_BIT_DEPTH && mmsize == 16
- WIN64_SPILL_XMM 8
- %endif
- mova m4, [pw_8]
- SPLATW m5, m5
- psubw m4, m5
- movifnidn r0, r0mp
- movifnidn r1, r1mp
- movifnidn r2d, r2m
- FIX_STRIDES r2
- movifnidn r5d, r8m
- cmp dword r7m, 4
- jg .mc1d_w8
- mov r7, r2
- mov r8, r4
- %if mmsize!=8
- shr r5d, 1
- %endif
- .loop1d_w4:
- %if HIGH_BIT_DEPTH
- %if mmsize == 8
- movq m0, [r3+0]
- movq m2, [r3+8]
- movq m1, [r3+r6+0]
- movq m3, [r3+r6+8]
- %else
- movu m0, [r3]
- movu m1, [r3+r6]
- add r3, r8
- movu m2, [r3]
- movu m3, [r3+r6]
- %endif
- SBUTTERFLY wd, 0, 2, 6
- SBUTTERFLY wd, 1, 3, 7
- SBUTTERFLY wd, 0, 2, 6
- SBUTTERFLY wd, 1, 3, 7
- %if mmsize == 16
- SBUTTERFLY wd, 0, 2, 6
- SBUTTERFLY wd, 1, 3, 7
- %endif
- %else ; !HIGH_BIT_DEPTH
- movq m0, [r3]
- movq m1, [r3+r6]
- %if mmsize!=8
- add r3, r8
- movhps m0, [r3]
- movhps m1, [r3+r6]
- %endif
- psrlw m2, m0, 8
- psrlw m3, m1, 8
- pand m0, [pw_00ff]
- pand m1, [pw_00ff]
- %endif ; HIGH_BIT_DEPTH
- pmullw m0, m4
- pmullw m1, m5
- pmullw m2, m4
- pmullw m3, m5
- paddw m0, [pw_4]
- paddw m2, [pw_4]
- paddw m0, m1
- paddw m2, m3
- psrlw m0, 3
- psrlw m2, 3
- %if HIGH_BIT_DEPTH
- %if mmsize == 8
- xchg r4, r8
- xchg r2, r7
- %endif
- movq [r0], m0
- movq [r1], m2
- %if mmsize == 16
- add r0, r7
- add r1, r7
- movhps [r0], m0
- movhps [r1], m2
- %endif
- %else ; !HIGH_BIT_DEPTH
- packuswb m0, m2
- %if mmsize==8
- xchg r4, r8
- xchg r2, r7
- movd [r0], m0
- psrlq m0, 32
- movd [r1], m0
- %else
- movhlps m1, m0
- movd [r0], m0
- movd [r1], m1
- add r0, r7
- add r1, r7
- psrldq m0, 4
- psrldq m1, 4
- movd [r0], m0
- movd [r1], m1
- %endif
- %endif ; HIGH_BIT_DEPTH
- add r3, r4
- add r0, r2
- add r1, r2
- dec r5d
- jg .loop1d_w4
- RET
- .mc1d_w8:
- sub r2, 4*SIZEOF_PIXEL
- sub r4, 8*SIZEOF_PIXEL
- mov r7, 4*SIZEOF_PIXEL
- mov r8, 8*SIZEOF_PIXEL
- %if mmsize==8
- shl r5d, 1
- %endif
- jmp .loop1d_w4
- %endif ; ARCH_X86_64
- %endmacro ; MC_CHROMA
- %macro MC_CHROMA_SSSE3 0
- cglobal mc_chroma
- MC_CHROMA_START 10-cpuflag(avx2)
- and r5d, 7
- and t2d, 7
- mov t0d, r5d
- shl t0d, 8
- sub t0d, r5d
- mov r5d, 8
- add t0d, 8
- sub r5d, t2d
- imul t2d, t0d ; (x*255+8)*y
- imul r5d, t0d ; (x*255+8)*(8-y)
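- ; x*255+8 == (x<<8) + (8-x), so each 16-bit factor above packs both horizontal
- ; taps as bytes: low byte (8-x)*(8-y) or (8-x)*y, high byte x*(8-y) or x*y
- ; (each product is at most 64, so it fits in a byte with no carry).  pmaddubsw
- ; then applies both horizontal taps at once, and pmulhrsw against pw_512
- ; performs the final (v+32)>>6 since the four taps sum to 64.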
- movd xm6, t2d
- movd xm7, r5d
- %if cpuflag(cache64)
- mov t0d, r3d
- and t0d, 7
- %ifdef PIC
- lea t1, [ch_shuf_adj]
- movddup xm5, [t1 + t0*4]
- %else
- movddup xm5, [ch_shuf_adj + t0*4]
- %endif
- paddb xm5, [ch_shuf]
- and r3, ~7
- %else
- mova m5, [ch_shuf]
- %endif
- movifnidn r0, r0mp
- movifnidn r1, r1mp
- movifnidn r2d, r2m
- movifnidn r5d, r8m
- %if cpuflag(avx2)
- vpbroadcastw m6, xm6
- vpbroadcastw m7, xm7
- %else
- SPLATW m6, m6
- SPLATW m7, m7
- %endif
- %if ARCH_X86_64
- %define shiftround m8
- mova m8, [pw_512]
- %else
- %define shiftround [pw_512]
- %endif
- cmp dword r7m, 4
- jg .width8
- %if cpuflag(avx2)
- .loop4:
- movu xm0, [r3]
- movu xm1, [r3+r4]
- vinserti128 m0, m0, [r3+r4], 1
- vinserti128 m1, m1, [r3+r4*2], 1
- pshufb m0, m5
- pshufb m1, m5
- pmaddubsw m0, m7
- pmaddubsw m1, m6
- paddw m0, m1
- pmulhrsw m0, shiftround
- packuswb m0, m0
- vextracti128 xm1, m0, 1
- movd [r0], xm0
- movd [r0+r2], xm1
- psrldq xm0, 4
- psrldq xm1, 4
- movd [r1], xm0
- movd [r1+r2], xm1
- lea r3, [r3+r4*2]
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- sub r5d, 2
- jg .loop4
- RET
- .width8:
- movu xm0, [r3]
- vinserti128 m0, m0, [r3+8], 1
- pshufb m0, m5
- .loop8:
- movu xm3, [r3+r4]
- vinserti128 m3, m3, [r3+r4+8], 1
- pshufb m3, m5
- pmaddubsw m1, m0, m7
- pmaddubsw m2, m3, m6
- pmaddubsw m3, m3, m7
- movu xm0, [r3+r4*2]
- vinserti128 m0, m0, [r3+r4*2+8], 1
- pshufb m0, m5
- pmaddubsw m4, m0, m6
- paddw m1, m2
- paddw m3, m4
- pmulhrsw m1, shiftround
- pmulhrsw m3, shiftround
- packuswb m1, m3
- mova m2, [deinterleave_shufd]
- vpermd m1, m2, m1
- vextracti128 xm2, m1, 1
- movq [r0], xm1
- movhps [r1], xm1
- movq [r0+r2], xm2
- movhps [r1+r2], xm2
- %else
- movu m0, [r3]
- pshufb m0, m5
- .loop4:
- movu m1, [r3+r4]
- pshufb m1, m5
- movu m3, [r3+r4*2]
- pshufb m3, m5
- mova m4, m3
- pmaddubsw m0, m7
- pmaddubsw m2, m1, m7
- pmaddubsw m1, m6
- pmaddubsw m3, m6
- paddw m1, m0
- paddw m3, m2
- pmulhrsw m1, shiftround
- pmulhrsw m3, shiftround
- mova m0, m4
- packuswb m1, m3
- movd [r0], m1
- %if cpuflag(sse4)
- pextrd [r1], m1, 1
- pextrd [r0+r2], m1, 2
- pextrd [r1+r2], m1, 3
- %else
- movhlps m3, m1
- movd [r0+r2], m3
- psrldq m1, 4
- psrldq m3, 4
- movd [r1], m1
- movd [r1+r2], m3
- %endif
- lea r3, [r3+r4*2]
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- sub r5d, 2
- jg .loop4
- RET
- .width8:
- movu m0, [r3]
- pshufb m0, m5
- movu m1, [r3+8]
- pshufb m1, m5
- %if ARCH_X86_64
- SWAP 9, 6
- %define mult1 m9
- %else
- mova r0m, m6
- %define mult1 r0m
- %endif
- .loop8:
- movu m2, [r3+r4]
- pshufb m2, m5
- movu m3, [r3+r4+8]
- pshufb m3, m5
- mova m4, m2
- mova m6, m3
- pmaddubsw m0, m7
- pmaddubsw m1, m7
- pmaddubsw m2, mult1
- pmaddubsw m3, mult1
- paddw m0, m2
- paddw m1, m3
- pmulhrsw m0, shiftround ; x + 32 >> 6
- pmulhrsw m1, shiftround
- packuswb m0, m1
- pshufd m0, m0, q3120
- movq [r0], m0
- movhps [r1], m0
- movu m2, [r3+r4*2]
- pshufb m2, m5
- movu m3, [r3+r4*2+8]
- pshufb m3, m5
- mova m0, m2
- mova m1, m3
- pmaddubsw m4, m7
- pmaddubsw m6, m7
- pmaddubsw m2, mult1
- pmaddubsw m3, mult1
- paddw m2, m4
- paddw m3, m6
- pmulhrsw m2, shiftround
- pmulhrsw m3, shiftround
- packuswb m2, m3
- pshufd m2, m2, q3120
- movq [r0+r2], m2
- movhps [r1+r2], m2
- %endif
- lea r3, [r3+r4*2]
- lea r0, [r0+r2*2]
- lea r1, [r1+r2*2]
- sub r5d, 2
- jg .loop8
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_MMX mmx2
- MC_CHROMA
- INIT_XMM sse2
- MC_CHROMA
- INIT_XMM avx
- MC_CHROMA
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX mmx2
- MC_CHROMA
- INIT_XMM sse2
- MC_CHROMA
- INIT_XMM ssse3
- MC_CHROMA_SSSE3
- INIT_XMM cache64, ssse3
- MC_CHROMA_SSSE3
- INIT_XMM avx
- MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
- INIT_YMM avx2
- MC_CHROMA_SSSE3
- %endif ; HIGH_BIT_DEPTH