- ;*****************************************************************************
- ;* predict-a.asm: x86 intra prediction
- ;*****************************************************************************
- ;* Copyright (C) 2005-2018 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;* Holger Lubitz <holger@lubitz.org>
- ;* Fiona Glaser <fiona@x264.com>
- ;* Henrik Gramner <henrik@gramner.com>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- ;*
- ;* This program is also available under a commercial proprietary license.
- ;* For more information, contact us at licensing@x264.com.
- ;*****************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA 32
- pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
- pw_m3: times 16 dw -3
- pw_m7: times 16 dw -7
- pb_00s_ff: times 8 db 0
- pb_0s_ff: times 7 db 0
- db 0xff
- shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
- shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- shuf_hu: db 7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0
- shuf_vr: db 2,4,6,8,9,10,11,12,13,14,15,0,1,3,5,7
- pw_reverse: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1
- SECTION .text
- cextern pb_0
- cextern pb_1
- cextern pb_3
- cextern pw_1
- cextern pw_2
- cextern pw_4
- cextern pw_8
- cextern pw_16
- cextern pw_00ff
- cextern pw_pixel_max
- cextern pw_0to15
- %macro STORE8 1
- mova [r0+0*FDEC_STRIDEB], %1
- mova [r0+1*FDEC_STRIDEB], %1
- add r0, 4*FDEC_STRIDEB
- mova [r0-2*FDEC_STRIDEB], %1
- mova [r0-1*FDEC_STRIDEB], %1
- mova [r0+0*FDEC_STRIDEB], %1
- mova [r0+1*FDEC_STRIDEB], %1
- mova [r0+2*FDEC_STRIDEB], %1
- mova [r0+3*FDEC_STRIDEB], %1
- %endmacro
- %macro STORE16 1-4
- %if %0 > 1
- mov r1d, 2*%0
- .loop:
- mova [r0+0*FDEC_STRIDEB+0*mmsize], %1
- mova [r0+0*FDEC_STRIDEB+1*mmsize], %2
- mova [r0+1*FDEC_STRIDEB+0*mmsize], %1
- mova [r0+1*FDEC_STRIDEB+1*mmsize], %2
- %ifidn %0, 4
- mova [r0+0*FDEC_STRIDEB+2*mmsize], %3
- mova [r0+0*FDEC_STRIDEB+3*mmsize], %4
- mova [r0+1*FDEC_STRIDEB+2*mmsize], %3
- mova [r0+1*FDEC_STRIDEB+3*mmsize], %4
- add r0, 2*FDEC_STRIDEB
- %else ; %0 == 2
- add r0, 4*FDEC_STRIDEB
- mova [r0-2*FDEC_STRIDEB+0*mmsize], %1
- mova [r0-2*FDEC_STRIDEB+1*mmsize], %2
- mova [r0-1*FDEC_STRIDEB+0*mmsize], %1
- mova [r0-1*FDEC_STRIDEB+1*mmsize], %2
- %endif
- dec r1d
- jg .loop
- %else ; %0 == 1
- STORE8 %1
- %if HIGH_BIT_DEPTH ; Different code paths to reduce code size
- add r0, 6*FDEC_STRIDEB
- mova [r0-2*FDEC_STRIDEB], %1
- mova [r0-1*FDEC_STRIDEB], %1
- mova [r0+0*FDEC_STRIDEB], %1
- mova [r0+1*FDEC_STRIDEB], %1
- add r0, 4*FDEC_STRIDEB
- mova [r0-2*FDEC_STRIDEB], %1
- mova [r0-1*FDEC_STRIDEB], %1
- mova [r0+0*FDEC_STRIDEB], %1
- mova [r0+1*FDEC_STRIDEB], %1
- %else
- add r0, 8*FDEC_STRIDE
- mova [r0-4*FDEC_STRIDE], %1
- mova [r0-3*FDEC_STRIDE], %1
- mova [r0-2*FDEC_STRIDE], %1
- mova [r0-1*FDEC_STRIDE], %1
- mova [r0+0*FDEC_STRIDE], %1
- mova [r0+1*FDEC_STRIDE], %1
- mova [r0+2*FDEC_STRIDE], %1
- mova [r0+3*FDEC_STRIDE], %1
- %endif ; HIGH_BIT_DEPTH
- %endif
- %endmacro
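- ; STORE8/STORE16 replicate the given register(s) across 8 (resp. 16) rows of
- ; the destination block. A rough C sketch of the semantics (hedged; `pixel`
- ; and FDEC_STRIDE as used in the rest of x264, `row` = the vector's pixels):
- ;     for( int y = 0; y < rows; y++ )
- ;         for( int x = 0; x < width; x++ )
- ;             dst[y*FDEC_STRIDE + x] = row[x];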
- %macro PRED_H_LOAD 2 ; reg, offset
- %if cpuflag(avx2)
- vpbroadcastpix %1, [r0+(%2)*FDEC_STRIDEB-SIZEOF_PIXEL]
- %elif HIGH_BIT_DEPTH
- movd %1, [r0+(%2)*FDEC_STRIDEB-4]
- SPLATW %1, %1, 1
- %else
- SPLATB_LOAD %1, r0+(%2)*FDEC_STRIDE-1, m2
- %endif
- %endmacro
- %macro PRED_H_STORE 3 ; reg, offset, width
- %assign %%w %3*SIZEOF_PIXEL
- %if %%w == 8
- movq [r0+(%2)*FDEC_STRIDEB], %1
- %else
- %assign %%i 0
- %rep %%w/mmsize
- mova [r0+(%2)*FDEC_STRIDEB+%%i], %1
- %assign %%i %%i+mmsize
- %endrep
- %endif
- %endmacro
- %macro PRED_H_4ROWS 2 ; width, inc_ptr
- PRED_H_LOAD m0, 0
- PRED_H_LOAD m1, 1
- PRED_H_STORE m0, 0, %1
- PRED_H_STORE m1, 1, %1
- PRED_H_LOAD m0, 2
- %if %2
- add r0, 4*FDEC_STRIDEB
- %endif
- PRED_H_LOAD m1, 3-4*%2
- PRED_H_STORE m0, 2-4*%2, %1
- PRED_H_STORE m1, 3-4*%2, %1
- %endmacro
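- ; PRED_H_LOAD/PRED_H_STORE/PRED_H_4ROWS together implement horizontal
- ; prediction: every row is filled with its own left neighbour. C sketch
- ; (assumption: mirrors the predict_*_h_c functions in predict.c):
- ;     for( int y = 0; y < height; y++ )
- ;         for( int x = 0; x < width; x++ )
- ;             src[y*FDEC_STRIDE + x] = src[y*FDEC_STRIDE - 1];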
- ; dest, left, right, src, tmp
- ; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
- %macro PRED8x8_LOWPASS 4-5
- %if HIGH_BIT_DEPTH
- paddw %2, %3
- psrlw %2, 1
- pavgw %1, %4, %2
- %else
- mova %5, %2
- pavgb %2, %3
- pxor %3, %5
- pand %3, [pb_1]
- psubusb %2, %3
- pavgb %1, %4, %2
- %endif
- %endmacro
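- ; C model of the filter above, i.e. the H.264 3-tap lowpass:
- ;     dest[n] = ( left[n] + 2*src[n] + right[n] + 2 ) >> 2
- ; The 16-bit path computes pavgw( src, (left+right)>>1 ) directly. The 8-bit
- ; path has no non-saturating word add, so it derives the truncated average by
- ; subtracting pavgb's rounding carry, (left^right)&1, before the final pavgb.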
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_h( pixel *src )
- ;-----------------------------------------------------------------------------
- %if HIGH_BIT_DEPTH
- INIT_XMM avx2
- cglobal predict_4x4_h, 1,1
- PRED_H_4ROWS 4, 0
- RET
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_ddl( pixel *src )
- ;-----------------------------------------------------------------------------
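- ; C model (assumption: matches predict_4x4_ddl_c). With t[0..7] = the top and
- ; top-right neighbours:
- ;     for( int y = 0; y < 4; y++ )
- ;         for( int x = 0; x < 4; x++ )
- ;             src[y*FDEC_STRIDE + x] = x+y == 6
- ;                 ? ( t[6] + 3*t[7] + 2 ) >> 2
- ;                 : ( t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2 ) >> 2;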
- %macro PREDICT_4x4_DDL 0
- cglobal predict_4x4_ddl, 1,1
- movu m1, [r0-FDEC_STRIDEB]
- PSLLPIX m2, m1, 1
- mova m0, m1
- %if HIGH_BIT_DEPTH
- PSRLPIX m1, m1, 1
- pshufhw m1, m1, q2210
- %else
- pxor m1, m2
- PSRLPIX m1, m1, 1
- pxor m1, m0
- %endif
- PRED8x8_LOWPASS m0, m2, m1, m0, m3
- %assign Y 0
- %rep 4
- PSRLPIX m0, m0, 1
- movh [r0+Y*FDEC_STRIDEB], m0
- %assign Y (Y+1)
- %endrep
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_4x4_DDL
- INIT_XMM avx
- PREDICT_4x4_DDL
- INIT_MMX mmx2
- cglobal predict_4x4_ddl, 1,2
- movu m1, [r0-FDEC_STRIDEB+4]
- PRED8x8_LOWPASS m0, m1, [r0-FDEC_STRIDEB+0], [r0-FDEC_STRIDEB+2]
- mova m3, [r0-FDEC_STRIDEB+8]
- mova [r0+0*FDEC_STRIDEB], m0
- pshufw m4, m3, q3321
- PRED8x8_LOWPASS m2, m4, [r0-FDEC_STRIDEB+6], m3
- mova [r0+3*FDEC_STRIDEB], m2
- pshufw m1, m0, q0021
- punpckldq m1, m2
- mova [r0+1*FDEC_STRIDEB], m1
- psllq m0, 16
- PALIGNR m2, m0, 6, m0
- mova [r0+2*FDEC_STRIDEB], m2
- RET
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX mmx2
- PREDICT_4x4_DDL
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_vr( pixel *src )
- ;-----------------------------------------------------------------------------
- %if HIGH_BIT_DEPTH == 0
- INIT_MMX ssse3
- cglobal predict_4x4_vr, 1,1
- movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
- mova m4, m1
- palignr m1, [r0-1*FDEC_STRIDEB-8], 7 ; ......t3t2t1t0lt
- pavgb m4, m1
- palignr m1, [r0+0*FDEC_STRIDEB-8], 7 ; ....t3t2t1t0ltl0
- mova m0, m1
- palignr m1, [r0+1*FDEC_STRIDEB-8], 7 ; ..t3t2t1t0ltl0l1
- mova m2, m1
- palignr m1, [r0+2*FDEC_STRIDEB-8], 7 ; t3t2t1t0ltl0l1l2
- PRED8x8_LOWPASS m2, m0, m1, m2, m3
- pshufw m0, m2, 0
- psrlq m2, 16
- movd [r0+0*FDEC_STRIDEB], m4
- palignr m4, m0, 7
- movd [r0+1*FDEC_STRIDEB], m2
- psllq m0, 8
- movd [r0+2*FDEC_STRIDEB], m4
- palignr m2, m0, 7
- movd [r0+3*FDEC_STRIDEB], m2
- RET
- %endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_ddr( pixel *src )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_4x4 4
- cglobal predict_4x4_ddr, 1,1
- %if HIGH_BIT_DEPTH
- movu m2, [r0-1*FDEC_STRIDEB-8]
- pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
- pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
- pinsrw m2, [r0+2*FDEC_STRIDEB-2], 0
- movhps m3, [r0+3*FDEC_STRIDEB-8]
- %else ; !HIGH_BIT_DEPTH
- movd m0, [r0+2*FDEC_STRIDEB-4]
- movd m1, [r0+0*FDEC_STRIDEB-4]
- punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
- punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
- punpckhwd m0, m1
- movd m2, [r0-1*FDEC_STRIDEB]
- %if cpuflag(ssse3)
- palignr m2, m0, 4
- %else
- psllq m2, 32
- punpckhdq m0, m2
- SWAP 2, 0
- %endif
- movd m3, [r0+3*FDEC_STRIDEB-4]
- psllq m3, 32
- %endif ; !HIGH_BIT_DEPTH
- PSRLPIX m1, m2, 1
- mova m0, m2
- PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
- PRED8x8_LOWPASS m0, m2, m1, m0, m3
- %assign Y 3
- movh [r0+Y*FDEC_STRIDEB], m0
- %rep 3
- %assign Y (Y-1)
- PSRLPIX m0, m0, 1
- movh [r0+Y*FDEC_STRIDEB], m0
- %endrep
- RET
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_vr( pixel *src )
- ;-----------------------------------------------------------------------------
- cglobal predict_4x4_vr, 1,1
- %if HIGH_BIT_DEPTH
- movu m1, [r0-1*FDEC_STRIDEB-8]
- pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
- pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
- pinsrw m1, [r0+2*FDEC_STRIDEB-2], 0
- %else ; !HIGH_BIT_DEPTH
- movd m0, [r0+2*FDEC_STRIDEB-4]
- movd m1, [r0+0*FDEC_STRIDEB-4]
- punpcklbw m0, [r0+1*FDEC_STRIDEB-4]
- punpcklbw m1, [r0-1*FDEC_STRIDEB-4]
- punpckhwd m0, m1
- movd m1, [r0-1*FDEC_STRIDEB]
- %if cpuflag(ssse3)
- palignr m1, m0, 4
- %else
- psllq m1, 32
- punpckhdq m0, m1
- SWAP 1, 0
- %endif
- %endif ; !HIGH_BIT_DEPTH
- PSRLPIX m2, m1, 1
- PSRLPIX m0, m1, 2
- pavg%1 m4, m1, m2
- PSRLPIX m4, m4, 3
- PRED8x8_LOWPASS m2, m0, m1, m2, m3
- PSLLPIX m0, m2, 6
- PSRLPIX m2, m2, 2
- movh [r0+0*FDEC_STRIDEB], m4
- PALIGNR m4, m0, 7*SIZEOF_PIXEL, m3
- movh [r0+1*FDEC_STRIDEB], m2
- PSLLPIX m0, m0, 1
- movh [r0+2*FDEC_STRIDEB], m4
- PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
- movh [r0+3*FDEC_STRIDEB], m2
- RET
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_hd( pixel *src )
- ;-----------------------------------------------------------------------------
- cglobal predict_4x4_hd, 1,1
- %if HIGH_BIT_DEPTH
- movu m1, [r0-1*FDEC_STRIDEB-8]
- PSLLPIX m1, m1, 1
- pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
- pinsrw m1, [r0+1*FDEC_STRIDEB-2], 2
- pinsrw m1, [r0+2*FDEC_STRIDEB-2], 1
- pinsrw m1, [r0+3*FDEC_STRIDEB-2], 0
- %else
- movd m0, [r0-1*FDEC_STRIDEB-4] ; lt ..
- punpckldq m0, [r0-1*FDEC_STRIDEB] ; t3 t2 t1 t0 lt .. .. ..
- PSLLPIX m0, m0, 1 ; t2 t1 t0 lt .. .. .. ..
- movd m1, [r0+3*FDEC_STRIDEB-4] ; l3
- punpcklbw m1, [r0+2*FDEC_STRIDEB-4] ; l2 l3
- movd m2, [r0+1*FDEC_STRIDEB-4] ; l1
- punpcklbw m2, [r0+0*FDEC_STRIDEB-4] ; l0 l1
- punpckh%3 m1, m2 ; l0 l1 l2 l3
- punpckh%4 m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
- %endif
- PSRLPIX m2, m1, 1 ; .. t2 t1 t0 lt l0 l1 l2
- PSRLPIX m0, m1, 2 ; .. .. t2 t1 t0 lt l0 l1
- pavg%1 m5, m1, m2
- PRED8x8_LOWPASS m3, m1, m0, m2, m4
- punpckl%2 m5, m3
- PSRLPIX m3, m3, 4
- PALIGNR m3, m5, 6*SIZEOF_PIXEL, m4
- %assign Y 3
- movh [r0+Y*FDEC_STRIDEB], m5
- %rep 2
- %assign Y (Y-1)
- PSRLPIX m5, m5, 2
- movh [r0+Y*FDEC_STRIDEB], m5
- %endrep
- movh [r0+0*FDEC_STRIDEB], m3
- RET
- %endmacro ; PREDICT_4x4
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_ddr( pixel *src )
- ;-----------------------------------------------------------------------------
- %if HIGH_BIT_DEPTH
- INIT_MMX mmx2
- cglobal predict_4x4_ddr, 1,1
- mova m0, [r0+1*FDEC_STRIDEB-8]
- punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
- mova m3, [r0+3*FDEC_STRIDEB-8]
- punpckhwd m3, [r0+2*FDEC_STRIDEB-8]
- punpckhdq m3, m0
- pshufw m0, m3, q3321
- pinsrw m0, [r0-1*FDEC_STRIDEB-2], 3
- pshufw m1, m0, q3321
- PRED8x8_LOWPASS m0, m1, m3, m0
- movq [r0+3*FDEC_STRIDEB], m0
- movq m2, [r0-1*FDEC_STRIDEB-0]
- pshufw m4, m2, q2100
- pinsrw m4, [r0-1*FDEC_STRIDEB-2], 0
- movq m1, m4
- PALIGNR m4, m3, 6, m3
- PRED8x8_LOWPASS m1, m4, m2, m1
- movq [r0+0*FDEC_STRIDEB], m1
- pshufw m2, m0, q3321
- punpckldq m2, m1
- psllq m0, 16
- PALIGNR m1, m0, 6, m0
- movq [r0+1*FDEC_STRIDEB], m1
- movq [r0+2*FDEC_STRIDEB], m2
- movd [r0+3*FDEC_STRIDEB+4], m1
- RET
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_hd( pixel *src )
- ;-----------------------------------------------------------------------------
- cglobal predict_4x4_hd, 1,1
- mova m0, [r0+1*FDEC_STRIDEB-8]
- punpckhwd m0, [r0+0*FDEC_STRIDEB-8]
- mova m1, [r0+3*FDEC_STRIDEB-8]
- punpckhwd m1, [r0+2*FDEC_STRIDEB-8]
- punpckhdq m1, m0
- mova m0, m1
- movu m3, [r0-1*FDEC_STRIDEB-2]
- pshufw m4, m1, q0032
- mova m7, m3
- punpckldq m4, m3
- PALIGNR m3, m1, 2, m2
- PRED8x8_LOWPASS m2, m4, m1, m3
- pavgw m0, m3
- punpcklwd m5, m0, m2
- punpckhwd m4, m0, m2
- mova [r0+3*FDEC_STRIDEB], m5
- mova [r0+1*FDEC_STRIDEB], m4
- psrlq m5, 32
- punpckldq m5, m4
- mova [r0+2*FDEC_STRIDEB], m5
- pshufw m4, m7, q2100
- mova m6, [r0-1*FDEC_STRIDEB+0]
- pinsrw m4, [r0+0*FDEC_STRIDEB-2], 0
- PRED8x8_LOWPASS m3, m4, m6, m7
- PALIGNR m3, m0, 6, m0
- mova [r0+0*FDEC_STRIDEB], m3
- RET
- INIT_XMM sse2
- PREDICT_4x4 w, wd, dq, qdq
- INIT_XMM ssse3
- PREDICT_4x4 w, wd, dq, qdq
- INIT_XMM avx
- PREDICT_4x4 w, wd, dq, qdq
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX mmx2
- PREDICT_4x4 b, bw, wd, dq
- INIT_MMX ssse3
- %define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3
- PREDICT_4x4 b, bw, wd, dq
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_hu( pixel *src )
- ;-----------------------------------------------------------------------------
- %if HIGH_BIT_DEPTH
- INIT_MMX
- cglobal predict_4x4_hu_mmx2, 1,1
- movq m0, [r0+0*FDEC_STRIDEB-8]
- punpckhwd m0, [r0+1*FDEC_STRIDEB-8]
- movq m1, [r0+2*FDEC_STRIDEB-8]
- punpckhwd m1, [r0+3*FDEC_STRIDEB-8]
- punpckhdq m0, m1
- pshufw m1, m1, q3333
- movq [r0+3*FDEC_STRIDEB], m1
- pshufw m3, m0, q3321
- pshufw m4, m0, q3332
- pavgw m2, m0, m3
- PRED8x8_LOWPASS m3, m0, m4, m3
- punpcklwd m4, m2, m3
- mova [r0+0*FDEC_STRIDEB], m4
- psrlq m2, 16
- psrlq m3, 16
- punpcklwd m2, m3
- mova [r0+1*FDEC_STRIDEB], m2
- punpckhdq m2, m1
- mova [r0+2*FDEC_STRIDEB], m2
- RET
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX
- cglobal predict_4x4_hu_mmx2, 1,1
- movd m1, [r0+0*FDEC_STRIDEB-4]
- punpcklbw m1, [r0+1*FDEC_STRIDEB-4]
- movd m0, [r0+2*FDEC_STRIDEB-4]
- punpcklbw m0, [r0+3*FDEC_STRIDEB-4]
- punpckhwd m1, m0
- movq m0, m1
- punpckhbw m1, m1
- pshufw m1, m1, q3333
- punpckhdq m0, m1
- movq m2, m0
- movq m3, m0
- movq m5, m0
- psrlq m3, 8
- psrlq m2, 16
- pavgb m5, m3
- PRED8x8_LOWPASS m3, m0, m2, m3, m4
- movd [r0+3*FDEC_STRIDEB], m1
- punpcklbw m5, m3
- movd [r0+0*FDEC_STRIDEB], m5
- psrlq m5, 16
- movd [r0+1*FDEC_STRIDEB], m5
- psrlq m5, 16
- movd [r0+2*FDEC_STRIDEB], m5
- RET
- %endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_vl( pixel *src )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_4x4_V1 1
- cglobal predict_4x4_vl, 1,1
- movu m1, [r0-FDEC_STRIDEB]
- PSRLPIX m3, m1, 1
- PSRLPIX m2, m1, 2
- pavg%1 m4, m3, m1
- PRED8x8_LOWPASS m0, m1, m2, m3, m5
- movh [r0+0*FDEC_STRIDEB], m4
- movh [r0+1*FDEC_STRIDEB], m0
- PSRLPIX m4, m4, 1
- PSRLPIX m0, m0, 1
- movh [r0+2*FDEC_STRIDEB], m4
- movh [r0+3*FDEC_STRIDEB], m0
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_4x4_V1 w
- INIT_XMM avx
- PREDICT_4x4_V1 w
- INIT_MMX mmx2
- cglobal predict_4x4_vl, 1,4
- mova m1, [r0-FDEC_STRIDEB+0]
- mova m2, [r0-FDEC_STRIDEB+8]
- mova m0, m2
- PALIGNR m2, m1, 4, m4
- PALIGNR m0, m1, 2, m4
- mova m3, m0
- pavgw m3, m1
- mova [r0+0*FDEC_STRIDEB], m3
- psrlq m3, 16
- mova [r0+2*FDEC_STRIDEB], m3
- PRED8x8_LOWPASS m0, m1, m2, m0
- mova [r0+1*FDEC_STRIDEB], m0
- psrlq m0, 16
- mova [r0+3*FDEC_STRIDEB], m0
- movzx r1d, word [r0-FDEC_STRIDEB+ 8]
- movzx r2d, word [r0-FDEC_STRIDEB+10]
- movzx r3d, word [r0-FDEC_STRIDEB+12]
- lea r1d, [r1+r2+1]
- add r3d, r2d
- lea r3d, [r3+r1+1]
- shr r1d, 1
- shr r3d, 2
- mov [r0+2*FDEC_STRIDEB+6], r1w
- mov [r0+3*FDEC_STRIDEB+6], r3w
- RET
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX mmx2
- PREDICT_4x4_V1 b
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_4x4_dc( pixel *src )
- ;-----------------------------------------------------------------------------
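- ; C model (assumption: matches predict_4x4_dc_c): the rounded average of the
- ; 4 top and 4 left neighbours, broadcast over the block:
- ;     int dc = 4;
- ;     for( int i = 0; i < 4; i++ )
- ;         dc += src[i - FDEC_STRIDE] + src[i*FDEC_STRIDE - 1];
- ;     dc >>= 3;
- ;     for( int y = 0; y < 4; y++ )
- ;         for( int x = 0; x < 4; x++ )
- ;             src[y*FDEC_STRIDE + x] = dc;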
- INIT_MMX mmx2
- %if HIGH_BIT_DEPTH
- cglobal predict_4x4_dc, 1,1
- mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- paddw m2, [r0+3*FDEC_STRIDEB-4*SIZEOF_PIXEL]
- psrlq m2, 48
- mova m0, [r0-FDEC_STRIDEB]
- HADDW m0, m1
- paddw m0, [pw_4]
- paddw m0, m2
- psrlw m0, 3
- SPLATW m0, m0
- mova [r0+0*FDEC_STRIDEB], m0
- mova [r0+1*FDEC_STRIDEB], m0
- mova [r0+2*FDEC_STRIDEB], m0
- mova [r0+3*FDEC_STRIDEB], m0
- RET
- %else ; !HIGH_BIT_DEPTH
- cglobal predict_4x4_dc, 1,4
- pxor mm7, mm7
- movd mm0, [r0-FDEC_STRIDEB]
- psadbw mm0, mm7
- movd r3d, mm0
- movzx r1d, byte [r0-1]
- %assign Y 1
- %rep 3
- movzx r2d, byte [r0+FDEC_STRIDEB*Y-1]
- add r1d, r2d
- %assign Y Y+1
- %endrep
- lea r1d, [r1+r3+4]
- shr r1d, 3
- imul r1d, 0x01010101
- mov [r0+FDEC_STRIDEB*0], r1d
- mov [r0+FDEC_STRIDEB*1], r1d
- mov [r0+FDEC_STRIDEB*2], r1d
- mov [r0+FDEC_STRIDEB*3], r1d
- RET
- %endif ; HIGH_BIT_DEPTH
- %macro PREDICT_FILTER 4
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
- ;-----------------------------------------------------------------------------
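- ; edge[] layout as inferred from the stores below (hedged, not normative):
- ;   edge[6..7]   = l7 (edge[6] is a duplicate),  edge[8..14] = l6..l0
- ;   edge[15]     = top-left,  edge[16..23] = t0..t7
- ;   edge[24..31] = t8..t15 (top-right),  edge[32] = last top-right pixel
- ; Each group is lowpass-filtered as requested by i_neighbor/i_filters.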
- cglobal predict_8x8_filter, 4,6,6
- add r0, 0x58*SIZEOF_PIXEL
- %define src r0-0x58*SIZEOF_PIXEL
- %if ARCH_X86_64 == 0
- mov r4, r1
- %define t1 r4
- %define t4 r1
- %else
- %define t1 r1
- %define t4 r4
- %endif
- test r3b, 1
- je .check_top
- mov t4d, r2d
- and t4d, 8
- neg t4
- mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
- mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%2%3 m1, m0
- mova m2, [src+4*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%1%2 m2, [src+3*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- mova m3, [src+6*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%1%2 m3, [src+5*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- punpckh%2%3 m3, m2
- punpckh%3%4 m3, m1
- mova m0, [src+7*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- mova m1, [src-1*FDEC_STRIDEB]
- PALIGNR m4, m3, m0, 7*SIZEOF_PIXEL, m0
- PALIGNR m1, m1, m3, 1*SIZEOF_PIXEL, m2
- PRED8x8_LOWPASS m3, m1, m4, m3, m5
- mova [t1+8*SIZEOF_PIXEL], m3
- movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
- movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
- lea t4d, [t4*3+2]
- add t4d, r5d
- shr t4d, 2
- mov [t1+7*SIZEOF_PIXEL], t4%1
- mov [t1+6*SIZEOF_PIXEL], t4%1
- test r3b, 2
- je .done
- .check_top:
- %if SIZEOF_PIXEL==1 && cpuflag(ssse3)
- INIT_XMM cpuname
- movu m3, [src-1*FDEC_STRIDEB]
- movhps m0, [src-1*FDEC_STRIDEB-8]
- test r2b, 8
- je .fix_lt_2
- .do_top:
- and r2d, 4
- %ifdef PIC
- lea r3, [shuf_fixtr]
- pshufb m3, [r3+r2*4]
- %else
- pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
- %endif
- psrldq m1, m3, 15
- PALIGNR m2, m3, m0, 15, m0
- PALIGNR m1, m3, 1, m5
- PRED8x8_LOWPASS m0, m2, m1, m3, m5
- mova [t1+16*SIZEOF_PIXEL], m0
- psrldq m0, 15
- movd [t1+32*SIZEOF_PIXEL], m0
- .done:
- REP_RET
- .fix_lt_2:
- pslldq m0, m3, 15
- jmp .do_top
- %else
- mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
- mova m3, [src-1*FDEC_STRIDEB]
- mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
- test r2b, 8
- je .fix_lt_2
- test r2b, 4
- je .fix_tr_1
- .do_top:
- PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
- PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
- PRED8x8_LOWPASS m4, m2, m0, m3, m5
- mova [t1+16*SIZEOF_PIXEL], m4
- test r3b, 4
- je .done
- PSRLPIX m5, m1, 7
- PALIGNR m2, m1, m3, 7*SIZEOF_PIXEL, m3
- PALIGNR m5, m1, 1*SIZEOF_PIXEL, m4
- PRED8x8_LOWPASS m0, m2, m5, m1, m4
- mova [t1+24*SIZEOF_PIXEL], m0
- PSRLPIX m0, m0, 7
- movd [t1+32*SIZEOF_PIXEL], m0
- .done:
- REP_RET
- .fix_lt_2:
- PSLLPIX m0, m3, 7
- test r2b, 4
- jne .do_top
- .fix_tr_1:
- punpckh%1%2 m1, m3, m3
- pshuf%2 m1, m1, q3333
- jmp .do_top
- %endif
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_FILTER w, d, q, dq
- INIT_XMM ssse3
- PREDICT_FILTER w, d, q, dq
- INIT_XMM avx
- PREDICT_FILTER w, d, q, dq
- %else
- INIT_MMX mmx2
- PREDICT_FILTER b, w, d, q
- INIT_MMX ssse3
- PREDICT_FILTER b, w, d, q
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_v( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_V 0
- cglobal predict_8x8_v, 2,2
- mova m0, [r1+16*SIZEOF_PIXEL]
- STORE8 m0
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse
- PREDICT_8x8_V
- %else
- INIT_MMX mmx2
- PREDICT_8x8_V
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_h( pixel *src, pixel edge[36] )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_H 2
- cglobal predict_8x8_h, 2,2
- movu m1, [r1+7*SIZEOF_PIXEL]
- add r0, 4*FDEC_STRIDEB
- punpckl%1 m2, m1, m1
- punpckh%1 m1, m1
- %assign Y 0
- %rep 8
- %assign i 1+Y/4
- SPLAT%2 m0, m %+ i, (3-Y)&3
- mova [r0+(Y-4)*FDEC_STRIDEB], m0
- %assign Y Y+1
- %endrep
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_8x8_H wd, D
- %else
- INIT_MMX mmx2
- PREDICT_8x8_H bw, W
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_dc( pixel *src, pixel *edge );
- ;-----------------------------------------------------------------------------
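- ; C model (assumption: matches predict_8x8_dc_c, reading the filtered edge):
- ;     int dc = 8;
- ;     for( int i = 0; i < 8; i++ )
- ;         dc += edge[7+i] + edge[16+i];   /* 8 left + 8 top neighbours */
- ;     dc >>= 4;                           /* then broadcast over the 8x8 block */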
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- cglobal predict_8x8_dc, 2,2
- movu m0, [r1+14]
- paddw m0, [r1+32]
- HADDW m0, m1
- paddw m0, [pw_8]
- psrlw m0, 4
- SPLATW m0, m0
- STORE8 m0
- RET
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX mmx2
- cglobal predict_8x8_dc, 2,2
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [r1+7]
- psadbw mm1, [r1+16]
- paddw mm0, [pw_8]
- paddw mm0, mm1
- psrlw mm0, 4
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- STORE8 mm0
- RET
- %endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_dc_top ( pixel *src, pixel *edge );
- ; void predict_8x8_dc_left( pixel *src, pixel *edge );
- ;-----------------------------------------------------------------------------
- %if HIGH_BIT_DEPTH
- %macro PREDICT_8x8_DC 3
- cglobal %1, 2,2
- %3 m0, [r1+%2]
- HADDW m0, m1
- paddw m0, [pw_4]
- psrlw m0, 3
- SPLATW m0, m0
- STORE8 m0
- RET
- %endmacro
- INIT_XMM sse2
- PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
- PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
- %else ; !HIGH_BIT_DEPTH
- %macro PREDICT_8x8_DC 2
- cglobal %1, 2,2
- pxor mm0, mm0
- psadbw mm0, [r1+%2]
- paddw mm0, [pw_4]
- psrlw mm0, 3
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
- STORE8 mm0
- RET
- %endmacro
- INIT_MMX
- PREDICT_8x8_DC predict_8x8_dc_top_mmx2, 16
- PREDICT_8x8_DC predict_8x8_dc_left_mmx2, 7
- %endif ; HIGH_BIT_DEPTH
- ; SSE2 is faster even on AMD for 8-bit, so there's no sense in spending exe
- ; size on the 8-bit MMX functions below if we know SSE2 is available.
- %macro PREDICT_8x8_DDLR 0
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddl( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_ddl, 2,2,7
- mova m0, [r1+16*SIZEOF_PIXEL]
- mova m1, [r1+24*SIZEOF_PIXEL]
- %if cpuflag(cache64)
- movd m5, [r1+32*SIZEOF_PIXEL]
- palignr m3, m1, m0, 1*SIZEOF_PIXEL
- palignr m5, m5, m1, 1*SIZEOF_PIXEL
- palignr m4, m1, m0, 7*SIZEOF_PIXEL
- %else
- movu m3, [r1+17*SIZEOF_PIXEL]
- movu m4, [r1+23*SIZEOF_PIXEL]
- movu m5, [r1+25*SIZEOF_PIXEL]
- %endif
- PSLLPIX m2, m0, 1
- add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS m0, m2, m3, m0, m6
- PRED8x8_LOWPASS m1, m4, m5, m1, m6
- mova [r0+3*FDEC_STRIDEB], m1
- %assign Y 2
- %rep 6
- PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
- PSLLPIX m0, m0, 1
- mova [r0+Y*FDEC_STRIDEB], m1
- %assign Y (Y-1)
- %endrep
- PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
- mova [r0+Y*FDEC_STRIDEB], m1
- RET
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddr( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_ddr, 2,2,7
- add r0, FDEC_STRIDEB*4
- mova m0, [r1+ 8*SIZEOF_PIXEL]
- mova m1, [r1+16*SIZEOF_PIXEL]
- ; edge[] is 32-byte aligned, so some of the unaligned loads are known not to be cache-split
- movu m2, [r1+ 7*SIZEOF_PIXEL]
- movu m5, [r1+17*SIZEOF_PIXEL]
- %if cpuflag(cache64)
- palignr m3, m1, m0, 1*SIZEOF_PIXEL
- palignr m4, m1, m0, 7*SIZEOF_PIXEL
- %else
- movu m3, [r1+ 9*SIZEOF_PIXEL]
- movu m4, [r1+15*SIZEOF_PIXEL]
- %endif
- PRED8x8_LOWPASS m0, m2, m3, m0, m6
- PRED8x8_LOWPASS m1, m4, m5, m1, m6
- mova [r0+3*FDEC_STRIDEB], m0
- %assign Y -4
- %rep 6
- PALIGNR m1, m0, 7*SIZEOF_PIXEL, m2
- PSLLPIX m0, m0, 1
- mova [r0+Y*FDEC_STRIDEB], m1
- %assign Y (Y+1)
- %endrep
- PALIGNR m1, m0, 7*SIZEOF_PIXEL, m0
- mova [r0+Y*FDEC_STRIDEB], m1
- RET
- %endmacro ; PREDICT_8x8_DDLR
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_8x8_DDLR
- INIT_XMM ssse3
- PREDICT_8x8_DDLR
- INIT_XMM cache64, ssse3
- PREDICT_8x8_DDLR
- %elif ARCH_X86_64 == 0
- INIT_MMX mmx2
- PREDICT_8x8_DDLR
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_hu( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_HU 2
- cglobal predict_8x8_hu, 2,2,8
- add r0, 4*FDEC_STRIDEB
- %if HIGH_BIT_DEPTH
- %if cpuflag(ssse3)
- movu m5, [r1+7*SIZEOF_PIXEL]
- pshufb m5, [pw_reverse]
- %else
- movq m6, [r1+7*SIZEOF_PIXEL]
- movq m5, [r1+11*SIZEOF_PIXEL]
- pshuflw m6, m6, q0123
- pshuflw m5, m5, q0123
- movlhps m5, m6
- %endif ; cpuflag
- psrldq m2, m5, 2
- pshufd m3, m5, q0321
- pshufhw m2, m2, q2210
- pshufhw m3, m3, q1110
- pavgw m4, m5, m2
- %else ; !HIGH_BIT_DEPTH
- movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
- pshufw m0, m1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
- psllq m1, 56 ; l7 .. .. .. .. .. .. ..
- mova m2, m0
- psllw m0, 8
- psrlw m2, 8
- por m2, m0
- mova m3, m2
- mova m4, m2
- mova m5, m2 ; l7 l6 l5 l4 l3 l2 l1 l0
- psrlq m3, 16
- psrlq m2, 8
- por m2, m1 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckhbw m1, m1
- por m3, m1 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavgb m4, m2
- %endif ; !HIGH_BIT_DEPTH
- PRED8x8_LOWPASS m2, m3, m5, m2, m6
- punpckh%2 m0, m4, m2 ; p8 p7 p6 p5
- punpckl%2 m4, m2 ; p4 p3 p2 p1
- PALIGNR m5, m0, m4, 2*SIZEOF_PIXEL, m3
- pshuf%1 m1, m0, q3321
- PALIGNR m6, m0, m4, 4*SIZEOF_PIXEL, m3
- pshuf%1 m2, m0, q3332
- PALIGNR m7, m0, m4, 6*SIZEOF_PIXEL, m3
- pshuf%1 m3, m0, q3333
- mova [r0-4*FDEC_STRIDEB], m4
- mova [r0-3*FDEC_STRIDEB], m5
- mova [r0-2*FDEC_STRIDEB], m6
- mova [r0-1*FDEC_STRIDEB], m7
- mova [r0+0*FDEC_STRIDEB], m0
- mova [r0+1*FDEC_STRIDEB], m1
- mova [r0+2*FDEC_STRIDEB], m2
- mova [r0+3*FDEC_STRIDEB], m3
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_8x8_HU d, wd
- INIT_XMM ssse3
- PREDICT_8x8_HU d, wd
- INIT_XMM avx
- PREDICT_8x8_HU d, wd
- %elif ARCH_X86_64 == 0
- INIT_MMX mmx2
- PREDICT_8x8_HU w, bw
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_vr( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_VR 1
- cglobal predict_8x8_vr, 2,3
- mova m2, [r1+16*SIZEOF_PIXEL]
- %ifidn cpuname, ssse3
- mova m0, [r1+8*SIZEOF_PIXEL]
- palignr m3, m2, m0, 7*SIZEOF_PIXEL
- palignr m1, m2, m0, 6*SIZEOF_PIXEL
- %else
- movu m3, [r1+15*SIZEOF_PIXEL]
- movu m1, [r1+14*SIZEOF_PIXEL]
- %endif
- pavg%1 m4, m3, m2
- add r0, FDEC_STRIDEB*4
- PRED8x8_LOWPASS m3, m1, m2, m3, m5
- mova [r0-4*FDEC_STRIDEB], m4
- mova [r0-3*FDEC_STRIDEB], m3
- mova m1, [r1+8*SIZEOF_PIXEL]
- PSLLPIX m0, m1, 1
- PSLLPIX m2, m1, 2
- PRED8x8_LOWPASS m0, m1, m2, m0, m6
- %assign Y -2
- %rep 5
- PALIGNR m4, m0, 7*SIZEOF_PIXEL, m5
- mova [r0+Y*FDEC_STRIDEB], m4
- PSLLPIX m0, m0, 1
- SWAP 3, 4
- %assign Y (Y+1)
- %endrep
- PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
- mova [r0+Y*FDEC_STRIDEB], m4
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_8x8_VR w
- INIT_XMM ssse3
- PREDICT_8x8_VR w
- INIT_XMM avx
- PREDICT_8x8_VR w
- %elif ARCH_X86_64 == 0
- INIT_MMX mmx2
- PREDICT_8x8_VR b
- %endif
- %macro LOAD_PLANE_ARGS 0
- %if cpuflag(avx2) && ARCH_X86_64 == 0
- vpbroadcastw m0, r1m
- vpbroadcastw m2, r2m
- vpbroadcastw m4, r3m
- %elif mmsize == 8 ; MMX is only used on x86_32
- SPLATW m0, r1m
- SPLATW m2, r2m
- SPLATW m4, r3m
- %else
- movd xm0, r1m
- movd xm2, r2m
- movd xm4, r3m
- SPLATW m0, xm0
- SPLATW m2, xm2
- SPLATW m4, xm4
- %endif
- %endmacro
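- ; After LOAD_PLANE_ARGS: m0 = i00, m2 = b, m4 = c (the three scalar stack
- ; arguments of the *_p_core functions), each broadcast to every lane.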
- ;-----------------------------------------------------------------------------
- ; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
- ;-----------------------------------------------------------------------------
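- ; C model (assumption: matches the core loop of predict_8x8c_p_c, with
- ; clip_pixel() standing in for the clamp to the valid pixel range):
- ;     for( int y = 0; y < height; y++ )   /* 8 or 16 rows */
- ;         for( int x = 0; x < 8; x++ )
- ;             src[y*FDEC_STRIDE + x] = clip_pixel( (i00 + b*x + c*y) >> 5 );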
- %if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
- %macro PREDICT_CHROMA_P_MMX 1
- cglobal predict_8x%1c_p_core, 1,2
- LOAD_PLANE_ARGS
- movq m1, m2
- pmullw m2, [pw_0to15]
- psllw m1, 2
- paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b}
- paddsw m1, m0 ; m1 = {i+4*b, i+5*b, i+6*b, i+7*b}
- mov r1d, %1
- ALIGN 4
- .loop:
- movq m5, m0
- movq m6, m1
- psraw m5, 5
- psraw m6, 5
- packuswb m5, m6
- movq [r0], m5
- paddsw m0, m4
- paddsw m1, m4
- add r0, FDEC_STRIDE
- dec r1d
- jg .loop
- RET
- %endmacro ; PREDICT_CHROMA_P_MMX
- INIT_MMX mmx2
- PREDICT_CHROMA_P_MMX 8
- PREDICT_CHROMA_P_MMX 16
- %endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
- %macro PREDICT_CHROMA_P 1
- %if HIGH_BIT_DEPTH
- cglobal predict_8x%1c_p_core, 1,2,7
- LOAD_PLANE_ARGS
- mova m3, [pw_pixel_max]
- pxor m1, m1
- pmullw m2, [pw_43210123] ; b
- %if %1 == 16
- pmullw m5, m4, [pw_m7] ; c
- %else
- pmullw m5, m4, [pw_m3]
- %endif
- paddw m5, [pw_16]
- %if mmsize == 32
- mova xm6, xm4
- paddw m4, m4
- paddw m5, m6
- %endif
- mov r1d, %1/(mmsize/16)
- .loop:
- paddsw m6, m2, m5
- paddsw m6, m0
- psraw m6, 5
- CLIPW m6, m1, m3
- paddw m5, m4
- %if mmsize == 32
- vextracti128 [r0], m6, 1
- mova [r0+FDEC_STRIDEB], xm6
- add r0, 2*FDEC_STRIDEB
- %else
- mova [r0], m6
- add r0, FDEC_STRIDEB
- %endif
- dec r1d
- jg .loop
- RET
- %else ; !HIGH_BIT_DEPTH
- cglobal predict_8x%1c_p_core, 1,2
- LOAD_PLANE_ARGS
- %if mmsize == 32
- vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
- pmullw m2, m1
- mova xm1, xm4 ; zero upper half
- paddsw m4, m4
- paddsw m0, m1
- %else
- pmullw m2, [pw_0to15]
- %endif
- paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
- paddsw m1, m0, m4
- paddsw m4, m4
- mov r1d, %1/(mmsize/8)
- .loop:
- psraw m2, m0, 5
- psraw m3, m1, 5
- paddsw m0, m4
- paddsw m1, m4
- packuswb m2, m3
- %if mmsize == 32
- movq [r0+FDEC_STRIDE*1], xm2
- movhps [r0+FDEC_STRIDE*3], xm2
- vextracti128 xm2, m2, 1
- movq [r0+FDEC_STRIDE*0], xm2
- movhps [r0+FDEC_STRIDE*2], xm2
- %else
- movq [r0+FDEC_STRIDE*0], xm2
- movhps [r0+FDEC_STRIDE*1], xm2
- %endif
- add r0, FDEC_STRIDE*mmsize/8
- dec r1d
- jg .loop
- RET
- %endif ; HIGH_BIT_DEPTH
- %endmacro ; PREDICT_CHROMA_P
- INIT_XMM sse2
- PREDICT_CHROMA_P 8
- PREDICT_CHROMA_P 16
- INIT_XMM avx
- PREDICT_CHROMA_P 8
- PREDICT_CHROMA_P 16
- INIT_YMM avx2
- PREDICT_CHROMA_P 8
- PREDICT_CHROMA_P 16
- ;-----------------------------------------------------------------------------
- ; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
- ;-----------------------------------------------------------------------------
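- ; Same plane model as the chroma core above, over a 16x16 block (assumption:
- ; matches predict_16x16_p_c):
- ;     src[y*FDEC_STRIDE + x] = clip_pixel( (i00 + b*x + c*y) >> 5 ),  0 <= x,y < 16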
- %if HIGH_BIT_DEPTH == 0 && ARCH_X86_64 == 0
- INIT_MMX mmx2
- cglobal predict_16x16_p_core, 1,2
- LOAD_PLANE_ARGS
- movq mm5, mm2
- movq mm1, mm2
- pmullw mm5, [pw_0to15]
- psllw mm2, 3
- psllw mm1, 2
- movq mm3, mm2
- paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
- paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
- paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
- mov r1d, 16
- ALIGN 4
- .loop:
- movq mm5, mm0
- movq mm6, mm1
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0], mm5
- movq mm5, mm2
- movq mm6, mm3
- psraw mm5, 5
- psraw mm6, 5
- packuswb mm5, mm6
- movq [r0+8], mm5
- paddsw mm0, mm4
- paddsw mm1, mm4
- paddsw mm2, mm4
- paddsw mm3, mm4
- add r0, FDEC_STRIDE
- dec r1d
- jg .loop
- RET
- %endif ; !HIGH_BIT_DEPTH && !ARCH_X86_64
- %macro PREDICT_16x16_P 0
- cglobal predict_16x16_p_core, 1,2,8
- movd m0, r1m
- movd m1, r2m
- movd m2, r3m
- SPLATW m0, m0, 0
- SPLATW m1, m1, 0
- SPLATW m2, m2, 0
- pmullw m3, m1, [pw_0to15]
- psllw m1, 3
- %if HIGH_BIT_DEPTH
- pxor m6, m6
- mov r1d, 16
- .loop:
- mova m4, m0
- mova m5, m0
- mova m7, m3
- paddsw m7, m6
- paddsw m4, m7
- paddsw m7, m1
- paddsw m5, m7
- psraw m4, 5
- psraw m5, 5
- CLIPW m4, [pb_0], [pw_pixel_max]
- CLIPW m5, [pb_0], [pw_pixel_max]
- mova [r0], m4
- mova [r0+16], m5
- add r0, FDEC_STRIDEB
- paddw m6, m2
- %else ; !HIGH_BIT_DEPTH
- paddsw m0, m3 ; m0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
- paddsw m1, m0 ; m1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
- paddsw m7, m2, m2
- mov r1d, 8
- ALIGN 4
- .loop:
- psraw m3, m0, 5
- psraw m4, m1, 5
- paddsw m5, m0, m2
- paddsw m6, m1, m2
- psraw m5, 5
- psraw m6, 5
- packuswb m3, m4
- packuswb m5, m6
- mova [r0+FDEC_STRIDE*0], m3
- mova [r0+FDEC_STRIDE*1], m5
- paddsw m0, m7
- paddsw m1, m7
- add r0, FDEC_STRIDE*2
- %endif ; !HIGH_BIT_DEPTH
- dec r1d
- jg .loop
- RET
- %endmacro ; PREDICT_16x16_P
- INIT_XMM sse2
- PREDICT_16x16_P
- %if HIGH_BIT_DEPTH == 0
- INIT_XMM avx
- PREDICT_16x16_P
- %endif
- INIT_YMM avx2
- cglobal predict_16x16_p_core, 1,2,8*HIGH_BIT_DEPTH
- LOAD_PLANE_ARGS
- %if HIGH_BIT_DEPTH
- pmullw m2, [pw_0to15]
- pxor m5, m5
- pxor m6, m6
- mova m7, [pw_pixel_max]
- mov r1d, 8
- .loop:
- paddsw m1, m2, m5
- paddw m5, m4
- paddsw m1, m0
- paddsw m3, m2, m5
- psraw m1, 5
- paddsw m3, m0
- psraw m3, 5
- CLIPW m1, m6, m7
- mova [r0+0*FDEC_STRIDEB], m1
- CLIPW m3, m6, m7
- mova [r0+1*FDEC_STRIDEB], m3
- paddw m5, m4
- add r0, 2*FDEC_STRIDEB
- %else ; !HIGH_BIT_DEPTH
- vbroadcasti128 m1, [pw_0to15]
- mova xm3, xm4 ; zero high bits
- pmullw m1, m2
- psllw m2, 3
- paddsw m0, m3
- paddsw m0, m1 ; X+1*C X+0*C
- paddsw m1, m0, m2 ; Y+1*C Y+0*C
- paddsw m4, m4
- mov r1d, 4
- .loop:
- psraw m2, m0, 5
- psraw m3, m1, 5
- paddsw m0, m4
- paddsw m1, m4
- packuswb m2, m3 ; X+1*C Y+1*C X+0*C Y+0*C
- vextracti128 [r0+0*FDEC_STRIDE], m2, 1
- mova [r0+1*FDEC_STRIDE], xm2
- psraw m2, m0, 5
- psraw m3, m1, 5
- paddsw m0, m4
- paddsw m1, m4
- packuswb m2, m3 ; X+3*C Y+3*C X+2*C Y+2*C
- vextracti128 [r0+2*FDEC_STRIDE], m2, 1
- mova [r0+3*FDEC_STRIDE], xm2
- add r0, FDEC_STRIDE*4
- %endif ; !HIGH_BIT_DEPTH
- dec r1d
- jg .loop
- RET
- %if HIGH_BIT_DEPTH == 0
- %macro PREDICT_8x8 0
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_ddl, 2,2
- mova m0, [r1+16]
- %ifidn cpuname, ssse3
- movd m2, [r1+32]
- palignr m2, m0, 1
- %else
- movu m2, [r1+17]
- %endif
- pslldq m1, m0, 1
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS m0, m1, m2, m0, m3
- %assign Y -4
- %rep 8
- psrldq m0, 1
- movq [r0+Y*FDEC_STRIDE], m0
- %assign Y (Y+1)
- %endrep
- RET
- %ifnidn cpuname, ssse3
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_ddr( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_ddr, 2,2
- movu m0, [r1+8]
- movu m1, [r1+7]
- psrldq m2, m0, 1
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS m0, m1, m2, m0, m3
- psrldq m1, m0, 1
- %assign Y 3
- %rep 3
- movq [r0+Y*FDEC_STRIDE], m0
- movq [r0+(Y-1)*FDEC_STRIDE], m1
- psrldq m0, 2
- psrldq m1, 2
- %assign Y (Y-2)
- %endrep
- movq [r0-3*FDEC_STRIDE], m0
- movq [r0-4*FDEC_STRIDE], m1
- RET
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_vl( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_vl, 2,2
- mova m0, [r1+16]
- pslldq m1, m0, 1
- psrldq m2, m0, 1
- pavgb m3, m0, m2
- add r0, FDEC_STRIDE*4
- PRED8x8_LOWPASS m0, m1, m2, m0, m5
- ; m0: (t0 + 2*t1 + t2 + 2) >> 2
- ; m3: (t0 + t1 + 1) >> 1
- %assign Y -4
- %rep 3
- psrldq m0, 1
- movq [r0+ Y *FDEC_STRIDE], m3
- movq [r0+(Y+1)*FDEC_STRIDE], m0
- psrldq m3, 1
- %assign Y (Y+2)
- %endrep
- psrldq m0, 1
- movq [r0+ Y *FDEC_STRIDE], m3
- movq [r0+(Y+1)*FDEC_STRIDE], m0
- RET
- %endif ; !ssse3
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_vr( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- cglobal predict_8x8_vr, 2,2
- movu m2, [r1+8]
- add r0, 4*FDEC_STRIDE
- pslldq m1, m2, 2
- pslldq m0, m2, 1
- pavgb m3, m2, m0
- PRED8x8_LOWPASS m0, m2, m1, m0, m4
- movhps [r0-4*FDEC_STRIDE], m3
- movhps [r0-3*FDEC_STRIDE], m0
- %if cpuflag(ssse3)
- punpckhqdq m3, m3
- pshufb m0, [shuf_vr]
- palignr m3, m0, 13
- %else
- mova m2, m0
- mova m1, [pw_00ff]
- pand m1, m0
- psrlw m0, 8
- packuswb m1, m0
- pslldq m1, 4
- movhlps m3, m1
- shufps m1, m2, q3210
- psrldq m3, 5
- psrldq m1, 5
- SWAP 0, 1
- %endif
- movq [r0+3*FDEC_STRIDE], m0
- movq [r0+2*FDEC_STRIDE], m3
- psrldq m0, 1
- psrldq m3, 1
- movq [r0+1*FDEC_STRIDE], m0
- movq [r0+0*FDEC_STRIDE], m3
- psrldq m0, 1
- psrldq m3, 1
- movq [r0-1*FDEC_STRIDE], m0
- movq [r0-2*FDEC_STRIDE], m3
- RET
- %endmacro ; PREDICT_8x8
- INIT_XMM sse2
- PREDICT_8x8
- INIT_XMM ssse3
- PREDICT_8x8
- INIT_XMM avx
- PREDICT_8x8
- %endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_vl( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_VL_10 1
- cglobal predict_8x8_vl, 2,2,8
- mova m0, [r1+16*SIZEOF_PIXEL]
- mova m1, [r1+24*SIZEOF_PIXEL]
- PALIGNR m2, m1, m0, SIZEOF_PIXEL*1, m4
- PSRLPIX m4, m1, 1
- pavg%1 m6, m0, m2
- pavg%1 m7, m1, m4
- add r0, FDEC_STRIDEB*4
- mova [r0-4*FDEC_STRIDEB], m6
- PALIGNR m3, m7, m6, SIZEOF_PIXEL*1, m5
- mova [r0-2*FDEC_STRIDEB], m3
- PALIGNR m3, m7, m6, SIZEOF_PIXEL*2, m5
- mova [r0+0*FDEC_STRIDEB], m3
- PALIGNR m7, m7, m6, SIZEOF_PIXEL*3, m5
- mova [r0+2*FDEC_STRIDEB], m7
- PALIGNR m3, m1, m0, SIZEOF_PIXEL*7, m6
- PSLLPIX m5, m0, 1
- PRED8x8_LOWPASS m0, m5, m2, m0, m7
- PRED8x8_LOWPASS m1, m3, m4, m1, m7
- PALIGNR m4, m1, m0, SIZEOF_PIXEL*1, m2
- mova [r0-3*FDEC_STRIDEB], m4
- PALIGNR m4, m1, m0, SIZEOF_PIXEL*2, m2
- mova [r0-1*FDEC_STRIDEB], m4
- PALIGNR m4, m1, m0, SIZEOF_PIXEL*3, m2
- mova [r0+1*FDEC_STRIDEB], m4
- PALIGNR m1, m1, m0, SIZEOF_PIXEL*4, m2
- mova [r0+3*FDEC_STRIDEB], m1
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_8x8_VL_10 w
- INIT_XMM ssse3
- PREDICT_8x8_VL_10 w
- INIT_XMM avx
- PREDICT_8x8_VL_10 w
- %else
- INIT_MMX mmx2
- PREDICT_8x8_VL_10 b
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_hd( pixel *src, pixel *edge )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_HD 2
- cglobal predict_8x8_hd, 2,2
- add r0, 4*FDEC_STRIDEB
- mova m0, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
- movu m1, [r1+ 7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
- %ifidn cpuname, ssse3
- mova m2, [r1+16*SIZEOF_PIXEL] ; t7 t6 t5 t4 t3 t2 t1 t0
- mova m4, m2 ; t7 t6 t5 t4 t3 t2 t1 t0
- palignr m2, m0, 7*SIZEOF_PIXEL ; t6 t5 t4 t3 t2 t1 t0 lt
- palignr m4, m0, 1*SIZEOF_PIXEL ; t0 lt l0 l1 l2 l3 l4 l5
- %else
- movu m2, [r1+15*SIZEOF_PIXEL]
- movu m4, [r1+ 9*SIZEOF_PIXEL]
- %endif ; cpuflag
- pavg%1 m3, m0, m1
- PRED8x8_LOWPASS m0, m4, m1, m0, m5
- PSRLPIX m4, m2, 2 ; .. .. t6 t5 t4 t3 t2 t1
- PSRLPIX m1, m2, 1 ; .. t6 t5 t4 t3 t2 t1 t0
- PRED8x8_LOWPASS m1, m4, m2, m1, m5
- ; .. p11 p10 p9
- punpckh%2 m2, m3, m0 ; p8 p7 p6 p5
- punpckl%2 m3, m0 ; p4 p3 p2 p1
- mova [r0+3*FDEC_STRIDEB], m3
- PALIGNR m0, m2, m3, 2*SIZEOF_PIXEL, m5
- mova [r0+2*FDEC_STRIDEB], m0
- PALIGNR m0, m2, m3, 4*SIZEOF_PIXEL, m5
- mova [r0+1*FDEC_STRIDEB], m0
- PALIGNR m0, m2, m3, 6*SIZEOF_PIXEL, m3
- mova [r0+0*FDEC_STRIDEB], m0
- mova [r0-1*FDEC_STRIDEB], m2
- PALIGNR m0, m1, m2, 2*SIZEOF_PIXEL, m5
- mova [r0-2*FDEC_STRIDEB], m0
- PALIGNR m0, m1, m2, 4*SIZEOF_PIXEL, m5
- mova [r0-3*FDEC_STRIDEB], m0
- PALIGNR m1, m1, m2, 6*SIZEOF_PIXEL, m2
- mova [r0-4*FDEC_STRIDEB], m1
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_8x8_HD w, wd
- INIT_XMM ssse3
- PREDICT_8x8_HD w, wd
- INIT_XMM avx
- PREDICT_8x8_HD w, wd
- %else
- INIT_MMX mmx2
- PREDICT_8x8_HD b, bw
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_hd( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8_HD 0
- cglobal predict_8x8_hd, 2,2
- add r0, 4*FDEC_STRIDE
- movu m1, [r1+7]
- movu m3, [r1+8]
- movu m2, [r1+9]
- pavgb m4, m1, m3
- PRED8x8_LOWPASS m0, m1, m2, m3, m5
- punpcklbw m4, m0
- movhlps m0, m4
- %assign Y 3
- %rep 3
- movq [r0+(Y)*FDEC_STRIDE], m4
- movq [r0+(Y-4)*FDEC_STRIDE], m0
- psrldq m4, 2
- psrldq m0, 2
- %assign Y (Y-1)
- %endrep
- movq [r0+(Y)*FDEC_STRIDE], m4
- movq [r0+(Y-4)*FDEC_STRIDE], m0
- RET
- %endmacro
- INIT_XMM sse2
- PREDICT_8x8_HD
- INIT_XMM avx
- PREDICT_8x8_HD
- %endif ; HIGH_BIT_DEPTH
- %if HIGH_BIT_DEPTH == 0
- ;-----------------------------------------------------------------------------
- ; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
- ;-----------------------------------------------------------------------------
- INIT_MMX
- cglobal predict_8x8_hu_sse2, 2,2
- add r0, 4*FDEC_STRIDE
- movq mm1, [r1+7] ; l0 l1 l2 l3 l4 l5 l6 l7
- pshufw mm0, mm1, q0123 ; l6 l7 l4 l5 l2 l3 l0 l1
- movq mm2, mm0
- psllw mm0, 8
- psrlw mm2, 8
- por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
- psllq mm1, 56 ; l7 .. .. .. .. .. .. ..
- movq mm3, mm2
- movq mm4, mm2
- movq mm5, mm2
- psrlq mm2, 8
- psrlq mm3, 16
- por mm2, mm1 ; l7 l7 l6 l5 l4 l3 l2 l1
- punpckhbw mm1, mm1
- por mm3, mm1 ; l7 l7 l7 l6 l5 l4 l3 l2
- pavgb mm4, mm2
- PRED8x8_LOWPASS mm1, mm3, mm5, mm2, mm6
- movq2dq xmm0, mm4
- movq2dq xmm1, mm1
- punpcklbw xmm0, xmm1
- punpckhbw mm4, mm1
- %assign Y -4
- %rep 3
- movq [r0+Y*FDEC_STRIDE], xmm0
- psrldq xmm0, 2
- %assign Y (Y+1)
- %endrep
- pshufw mm5, mm4, q3321
- pshufw mm6, mm4, q3332
- pshufw mm7, mm4, q3333
- movq [r0+Y*FDEC_STRIDE], xmm0
- movq [r0+0*FDEC_STRIDE], mm4
- movq [r0+1*FDEC_STRIDE], mm5
- movq [r0+2*FDEC_STRIDE], mm6
- movq [r0+3*FDEC_STRIDE], mm7
- RET
- INIT_XMM
- cglobal predict_8x8_hu_ssse3, 2,2
- add r0, 4*FDEC_STRIDE
- movq m3, [r1+7]
- pshufb m3, [shuf_hu]
- psrldq m1, m3, 1
- psrldq m2, m3, 2
- pavgb m0, m1, m3
- PRED8x8_LOWPASS m1, m3, m2, m1, m4
- punpcklbw m0, m1
- %assign Y -4
- %rep 3
- movq [r0+ Y *FDEC_STRIDE], m0
- movhps [r0+(Y+4)*FDEC_STRIDE], m0
- psrldq m0, 2
- pshufhw m0, m0, q2210
- %assign Y (Y+1)
- %endrep
- movq [r0+ Y *FDEC_STRIDE], m0
- movhps [r0+(Y+4)*FDEC_STRIDE], m0
- RET
- %endif ; !HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void predict_8x8c_v( uint8_t *src )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_8x8C_V 0
- cglobal predict_8x8c_v, 1,1
- mova m0, [r0 - FDEC_STRIDEB]
- STORE8 m0
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse
- PREDICT_8x8C_V
- %else
- INIT_MMX mmx
- PREDICT_8x8C_V
- %endif
- %if HIGH_BIT_DEPTH
- INIT_MMX
- cglobal predict_8x8c_v_mmx, 1,1
- mova m0, [r0 - FDEC_STRIDEB]
- mova m1, [r0 - FDEC_STRIDEB + 8]
- %assign Y 0
- %rep 8
- mova [r0 + (Y&1)*FDEC_STRIDEB], m0
- mova [r0 + (Y&1)*FDEC_STRIDEB + 8], m1
- %if (Y&1) && (Y!=7)
- add r0, FDEC_STRIDEB*2
- %endif
- %assign Y Y+1
- %endrep
- RET
- %endif
- %macro PREDICT_8x16C_V 0
- cglobal predict_8x16c_v, 1,1
- mova m0, [r0 - FDEC_STRIDEB]
- STORE16 m0
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse
- PREDICT_8x16C_V
- %else
- INIT_MMX mmx
- PREDICT_8x16C_V
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8c_h( uint8_t *src )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_C_H 0
- cglobal predict_8x8c_h, 1,1
- %if cpuflag(ssse3) && notcpuflag(avx2)
- mova m2, [pb_3]
- %endif
- PRED_H_4ROWS 8, 1
- PRED_H_4ROWS 8, 0
- RET
- cglobal predict_8x16c_h, 1,2
- %if cpuflag(ssse3) && notcpuflag(avx2)
- mova m2, [pb_3]
- %endif
- mov r1d, 4
- .loop:
- PRED_H_4ROWS 8, 1
- dec r1d
- jg .loop
- RET
- %endmacro
- INIT_MMX mmx2
- PREDICT_C_H
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_C_H
- INIT_XMM avx2
- PREDICT_C_H
- %else
- INIT_MMX ssse3
- PREDICT_C_H
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_8x8c_dc( pixel *src )
- ;-----------------------------------------------------------------------------
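- ; C model (assumption: matches predict_8x8c_dc_c). With s0/s1 = sums of the
- ; left/right halves of the top neighbours and s2/s3 = sums of the upper/lower
- ; halves of the left neighbours, each 4x4 quadrant gets its own DC:
- ;     dc0 = ( s0 + s2 + 4 ) >> 3;    dc1 = ( s1 + 2 ) >> 2;
- ;     dc2 = ( s3 + 2 ) >> 2;         dc3 = ( s1 + s3 + 4 ) >> 3;
- ; The LOAD_LEFT helper below sums 4 consecutive left-neighbour pixels into r1d.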
- %macro LOAD_LEFT 1
- movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
- movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
- add r1d, r2d
- movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
- add r1d, r2d
- %endmacro
- %macro PREDICT_8x8C_DC 0
- cglobal predict_8x8c_dc, 1,3
- pxor m7, m7
- %if HIGH_BIT_DEPTH
- movq m0, [r0-FDEC_STRIDEB+0]
- movq m1, [r0-FDEC_STRIDEB+8]
- HADDW m0, m2
- HADDW m1, m2
- %else ; !HIGH_BIT_DEPTH
- movd m0, [r0-FDEC_STRIDEB+0]
- movd m1, [r0-FDEC_STRIDEB+4]
- psadbw m0, m7 ; s0
- psadbw m1, m7 ; s1
- %endif
- add r0, FDEC_STRIDEB*4
- LOAD_LEFT 0 ; s2
- movd m2, r1d
- LOAD_LEFT 4 ; s3
- movd m3, r1d
- punpcklwd m0, m1
- punpcklwd m2, m3
- punpckldq m0, m2 ; s0, s1, s2, s3
- pshufw m3, m0, q3312 ; s2, s1, s3, s3
- pshufw m0, m0, q1310 ; s0, s1, s3, s1
- paddw m0, m3
- psrlw m0, 2
- pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
- %if HIGH_BIT_DEPTH
- %if cpuflag(sse2)
- movq2dq xmm0, m0
- punpcklwd xmm0, xmm0
- pshufd xmm1, xmm0, q3322
- punpckldq xmm0, xmm0
- %assign Y 0
- %rep 8
- %assign i (0 + (Y/4))
- movdqa [r0+FDEC_STRIDEB*(Y-4)+0], xmm %+ i
- %assign Y Y+1
- %endrep
- %else ; !sse2
- pshufw m1, m0, q0000
- pshufw m2, m0, q1111
- pshufw m3, m0, q2222
- pshufw m4, m0, q3333
- %assign Y 0
- %rep 8
- %assign i (1 + (Y/4)*2)
- %assign j (2 + (Y/4)*2)
- movq [r0+FDEC_STRIDEB*(Y-4)+0], m %+ i
- movq [r0+FDEC_STRIDEB*(Y-4)+8], m %+ j
- %assign Y Y+1
- %endrep
- %endif
- %else ; !HIGH_BIT_DEPTH
- packuswb m0, m0
- punpcklbw m0, m0
- movq m1, m0
- punpcklbw m0, m0
- punpckhbw m1, m1
- %assign Y 0
- %rep 8
- %assign i (0 + (Y/4))
- movq [r0+FDEC_STRIDEB*(Y-4)], m %+ i
- %assign Y Y+1
- %endrep
- %endif
- RET
- %endmacro
- INIT_MMX mmx2
- PREDICT_8x8C_DC
- %if HIGH_BIT_DEPTH
- INIT_MMX sse2
- PREDICT_8x8C_DC
- %endif
- %if HIGH_BIT_DEPTH
- %macro STORE_4LINES 3
- %if cpuflag(sse2)
- movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
- movdqa [r0+FDEC_STRIDEB*(%3-3)], %1
- movdqa [r0+FDEC_STRIDEB*(%3-2)], %1
- movdqa [r0+FDEC_STRIDEB*(%3-1)], %1
- %else
- movq [r0+FDEC_STRIDEB*(%3-4)+0], %1
- movq [r0+FDEC_STRIDEB*(%3-4)+8], %2
- movq [r0+FDEC_STRIDEB*(%3-3)+0], %1
- movq [r0+FDEC_STRIDEB*(%3-3)+8], %2
- movq [r0+FDEC_STRIDEB*(%3-2)+0], %1
- movq [r0+FDEC_STRIDEB*(%3-2)+8], %2
- movq [r0+FDEC_STRIDEB*(%3-1)+0], %1
- movq [r0+FDEC_STRIDEB*(%3-1)+8], %2
- %endif
- %endmacro
- %else
- %macro STORE_4LINES 2
- movq [r0+FDEC_STRIDEB*(%2-4)], %1
- movq [r0+FDEC_STRIDEB*(%2-3)], %1
- movq [r0+FDEC_STRIDEB*(%2-2)], %1
- movq [r0+FDEC_STRIDEB*(%2-1)], %1
- %endmacro
- %endif
- %macro PREDICT_8x16C_DC 0
- cglobal predict_8x16c_dc, 1,3
- pxor m7, m7
- %if HIGH_BIT_DEPTH
- movq m0, [r0-FDEC_STRIDEB+0]
- movq m1, [r0-FDEC_STRIDEB+8]
- HADDW m0, m2
- HADDW m1, m2
- %else
- movd m0, [r0-FDEC_STRIDEB+0]
- movd m1, [r0-FDEC_STRIDEB+4]
- psadbw m0, m7 ; s0
- psadbw m1, m7 ; s1
- %endif
- punpcklwd m0, m1 ; s0, s1
- add r0, FDEC_STRIDEB*4
- LOAD_LEFT 0 ; s2
- pinsrw m0, r1d, 2
- LOAD_LEFT 4 ; s3
- pinsrw m0, r1d, 3 ; s0, s1, s2, s3
- add r0, FDEC_STRIDEB*8
- LOAD_LEFT 0 ; s4
- pinsrw m1, r1d, 2
- LOAD_LEFT 4 ; s5
- pinsrw m1, r1d, 3 ; s1, __, s4, s5
- sub r0, FDEC_STRIDEB*8
- pshufw m2, m0, q1310 ; s0, s1, s3, s1
- pshufw m0, m0, q3312 ; s2, s1, s3, s3
- pshufw m3, m1, q0302 ; s4, s1, s5, s1
- pshufw m1, m1, q3322 ; s4, s4, s5, s5
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 2
- psrlw m1, 2
- pavgw m0, m7
- pavgw m1, m7
- %if HIGH_BIT_DEPTH
- %if cpuflag(sse2)
- movq2dq xmm0, m0
- movq2dq xmm1, m1
- punpcklwd xmm0, xmm0
- punpcklwd xmm1, xmm1
- pshufd xmm2, xmm0, q3322
- pshufd xmm3, xmm1, q3322
- punpckldq xmm0, xmm0
- punpckldq xmm1, xmm1
- STORE_4LINES xmm0, xmm0, 0
- STORE_4LINES xmm2, xmm2, 4
- STORE_4LINES xmm1, xmm1, 8
- STORE_4LINES xmm3, xmm3, 12
- %else
- pshufw m2, m0, q0000
- pshufw m3, m0, q1111
- pshufw m4, m0, q2222
- pshufw m5, m0, q3333
- STORE_4LINES m2, m3, 0
- STORE_4LINES m4, m5, 4
- pshufw m2, m1, q0000
- pshufw m3, m1, q1111
- pshufw m4, m1, q2222
- pshufw m5, m1, q3333
- STORE_4LINES m2, m3, 8
- STORE_4LINES m4, m5, 12
- %endif
- %else
- packuswb m0, m0 ; dc0, dc1, dc2, dc3
- packuswb m1, m1 ; dc4, dc5, dc6, dc7
- punpcklbw m0, m0
- punpcklbw m1, m1
- pshufw m2, m0, q1100
- pshufw m3, m0, q3322
- pshufw m4, m1, q1100
- pshufw m5, m1, q3322
- STORE_4LINES m2, 0
- STORE_4LINES m3, 4
- add r0, FDEC_STRIDEB*8
- STORE_4LINES m4, 0
- STORE_4LINES m5, 4
- %endif
- RET
- %endmacro
- INIT_MMX mmx2
- PREDICT_8x16C_DC
- %if HIGH_BIT_DEPTH
- INIT_MMX sse2
- PREDICT_8x16C_DC
- %endif
- %macro PREDICT_C_DC_TOP 1
- %if HIGH_BIT_DEPTH
- INIT_XMM
- cglobal predict_8x%1c_dc_top_sse2, 1,1
- pxor m2, m2
- mova m0, [r0 - FDEC_STRIDEB]
- pshufd m1, m0, q2301
- paddw m0, m1
- pshuflw m1, m0, q2301
- pshufhw m1, m1, q2301
- paddw m0, m1
- psrlw m0, 1
- pavgw m0, m2
- STORE%1 m0
- RET
- %else ; !HIGH_BIT_DEPTH
- INIT_MMX
- cglobal predict_8x%1c_dc_top_mmx2, 1,1
- movq mm0, [r0 - FDEC_STRIDE]
- pxor mm1, mm1
- pxor mm2, mm2
- punpckhbw mm1, mm0
- punpcklbw mm0, mm2
- psadbw mm1, mm2 ; s1
- psadbw mm0, mm2 ; s0
- psrlw mm1, 1
- psrlw mm0, 1
- pavgw mm1, mm2
- pavgw mm0, mm2
- pshufw mm1, mm1, 0
- pshufw mm0, mm0, 0 ; dc0 (w)
- packuswb mm0, mm1 ; dc0,dc1 (b)
- STORE%1 mm0
- RET
- %endif
- %endmacro
- PREDICT_C_DC_TOP 8
- PREDICT_C_DC_TOP 16
- ;-----------------------------------------------------------------------------
- ; void predict_16x16_v( pixel *src )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_16x16_V 0
- cglobal predict_16x16_v, 1,2
- %assign %%i 0
- %rep 16*SIZEOF_PIXEL/mmsize
- mova m %+ %%i, [r0-FDEC_STRIDEB+%%i*mmsize]
- %assign %%i %%i+1
- %endrep
- %if 16*SIZEOF_PIXEL/mmsize == 4
- STORE16 m0, m1, m2, m3
- %elif 16*SIZEOF_PIXEL/mmsize == 2
- STORE16 m0, m1
- %else
- STORE16 m0
- %endif
- RET
- %endmacro
- INIT_MMX mmx2
- PREDICT_16x16_V
- INIT_XMM sse
- PREDICT_16x16_V
- %if HIGH_BIT_DEPTH
- INIT_YMM avx
- PREDICT_16x16_V
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_16x16_h( pixel *src )
- ;-----------------------------------------------------------------------------
- %macro PREDICT_16x16_H 0
- cglobal predict_16x16_h, 1,2
- %if cpuflag(ssse3) && notcpuflag(avx2)
- mova m2, [pb_3]
- %endif
- mov r1d, 4
- .loop:
- PRED_H_4ROWS 16, 1
- dec r1d
- jg .loop
- RET
- %endmacro
- INIT_MMX mmx2
- PREDICT_16x16_H
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- PREDICT_16x16_H
- INIT_YMM avx2
- PREDICT_16x16_H
- %else
- ; No SSE2 version for 8-bit: it's slower than MMX on all systems that don't support SSSE3
- INIT_XMM ssse3
- PREDICT_16x16_H
- %endif
- ;-----------------------------------------------------------------------------
- ; void predict_16x16_dc( pixel *src )
- ;-----------------------------------------------------------------------------
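- ; C model (assumption: matches predict_16x16_dc_c and its variants):
- ;     dc      = ( sum(16 top) + sum(16 left) + 16 ) >> 5
- ;     dc_top  = ( sum(16 top)  + 8 ) >> 4
- ;     dc_left = ( sum(16 left) + 8 ) >> 4
- ; broadcast over the whole 16x16 block.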
- %if WIN64
- DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
- %else
- DECLARE_REG_TMP 3
- %endif
- INIT_XMM
- ; Returns the sum of the left pixels in r1d+r2d
- cglobal predict_16x16_dc_left_internal, 0,4
- movzx r1d, pixel [r0-SIZEOF_PIXEL]
- movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL]
- %assign i 2*FDEC_STRIDEB
- %rep 7
- movzx t0d, pixel [r0+i-SIZEOF_PIXEL]
- add r1d, t0d
- movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL]
- add r2d, t0d
- %assign i i+2*FDEC_STRIDEB
- %endrep
- RET
- %macro PRED16x16_DC 2
- %if HIGH_BIT_DEPTH
- mova xm0, [r0 - FDEC_STRIDEB+ 0]
- paddw xm0, [r0 - FDEC_STRIDEB+16]
- HADDW xm0, xm2
- paddw xm0, %1
- psrlw xm0, %2
- SPLATW m0, xm0
- %if mmsize == 32
- STORE16 m0
- %else
- STORE16 m0, m0
- %endif
- %else ; !HIGH_BIT_DEPTH
- pxor m0, m0
- psadbw m0, [r0 - FDEC_STRIDE]
- MOVHL m1, m0
- paddw m0, m1
- paddusw m0, %1
- psrlw m0, %2 ; dc
- SPLATW m0, m0
- packuswb m0, m0 ; dc in bytes
- STORE16 m0
- %endif
- %endmacro
- %macro PREDICT_16x16_DC 0
- cglobal predict_16x16_dc, 1,3
- call predict_16x16_dc_left_internal
- lea r1d, [r1+r2+16]
- movd xm3, r1d
- PRED16x16_DC xm3, 5
- RET
- cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC [pw_8], 4
- RET
- cglobal predict_16x16_dc_left, 1,3
- call predict_16x16_dc_left_internal
- lea r1d, [r1+r2+8]
- shr r1d, 4
- movd xm0, r1d
- SPLATW m0, xm0
- %if HIGH_BIT_DEPTH && mmsize == 16
- STORE16 m0, m0
- %else
- %if HIGH_BIT_DEPTH == 0
- packuswb m0, m0
- %endif
- STORE16 m0
- %endif
- RET
- %endmacro
- INIT_XMM sse2
- PREDICT_16x16_DC
- %if HIGH_BIT_DEPTH
- INIT_YMM avx2
- PREDICT_16x16_DC
- %else
- INIT_XMM avx2
- PREDICT_16x16_DC
- %endif
|