- ;*****************************************************************************
- ;* sad-a.asm: x86 sad functions
- ;*****************************************************************************
- ;* Copyright (C) 2003-2018 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;* Fiona Glaser <fiona@x264.com>
- ;* Laurent Aimar <fenrir@via.ecp.fr>
- ;* Alex Izvorski <aizvorski@gmail.com>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- ;*
- ;* This program is also available under a commercial proprietary license.
- ;* For more information, contact us at licensing@x264.com.
- ;*****************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA 32
- pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
- hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
- SECTION .text
- cextern pb_3
- cextern pb_shuf8x8c
- cextern pw_8
- cextern sw_64
- ;=============================================================================
- ; SAD MMX
- ;=============================================================================
- %macro SAD_INC_2x16P 0
- movq mm1, [r0]
- movq mm2, [r0+8]
- movq mm3, [r0+r1]
- movq mm4, [r0+r1+8]
- psadbw mm1, [r2]
- psadbw mm2, [r2+8]
- psadbw mm3, [r2+r3]
- psadbw mm4, [r2+r3+8]
- lea r0, [r0+2*r1]
- paddw mm1, mm2
- paddw mm3, mm4
- lea r2, [r2+2*r3]
- paddw mm0, mm1
- paddw mm0, mm3
- %endmacro
- %macro SAD_INC_2x8P 0
- movq mm1, [r0]
- movq mm2, [r0+r1]
- psadbw mm1, [r2]
- psadbw mm2, [r2+r3]
- lea r0, [r0+2*r1]
- paddw mm0, mm1
- paddw mm0, mm2
- lea r2, [r2+2*r3]
- %endmacro
- %macro SAD_INC_2x4P 0
- movd mm1, [r0]
- movd mm2, [r2]
- punpckldq mm1, [r0+r1]
- punpckldq mm2, [r2+r3]
- psadbw mm1, mm2
- paddw mm0, mm1
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
- ;-----------------------------------------------------------------------------
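- ; Reference semantics (illustrative C-style sketch, not part of the build; names are placeholders):
- ;   int sad = 0;
- ;   for( int y = 0; y < height; y++ )
- ;       for( int x = 0; x < width; x++ )
- ;           sad += abs( pix1[y*stride1 + x] - pix2[y*stride2 + x] );
- ;   return sad;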
- %macro SAD 2
- cglobal pixel_sad_%1x%2_mmx2, 4,4
- pxor mm0, mm0
- %rep %2/2
- SAD_INC_2x%1P
- %endrep
- movd eax, mm0
- RET
- %endmacro
- SAD 16, 16
- SAD 16, 8
- SAD 8, 16
- SAD 8, 8
- SAD 8, 4
- SAD 4, 16
- SAD 4, 8
- SAD 4, 4
- ;=============================================================================
- ; SAD XMM
- ;=============================================================================
- %macro SAD_END_SSE2 0
- MOVHL m1, m0
- paddw m0, m1
- movd eax, m0
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
- ;-----------------------------------------------------------------------------
- %macro SAD_W16 1 ; h
- cglobal pixel_sad_16x%1, 4,4
- %ifidn cpuname, sse2
- .skip_prologue:
- %endif
- %assign %%i 0
- %if ARCH_X86_64
- lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
- lea r5, [3*r3]
- %rep %1/4
- movu m1, [r2]
- psadbw m1, [r0]
- movu m3, [r2+r3]
- psadbw m3, [r0+r1]
- movu m2, [r2+2*r3]
- psadbw m2, [r0+2*r1]
- movu m4, [r2+r5]
- psadbw m4, [r0+r6]
- %if %%i != %1/4-1
- lea r2, [r2+4*r3]
- lea r0, [r0+4*r1]
- %endif
- paddw m1, m3
- paddw m2, m4
- ACCUM paddw, 0, 1, %%i
- paddw m0, m2
- %assign %%i %%i+1
- %endrep
- %else ; The cost of having to save and restore registers on x86-32
- %rep %1/2 ; nullifies the benefit of having 3*stride in registers.
- movu m1, [r2]
- psadbw m1, [r0]
- movu m2, [r2+r3]
- psadbw m2, [r0+r1]
- %if %%i != %1/2-1
- lea r2, [r2+2*r3]
- lea r0, [r0+2*r1]
- %endif
- ACCUM paddw, 0, 1, %%i
- paddw m0, m2
- %assign %%i %%i+1
- %endrep
- %endif
- SAD_END_SSE2
- %endmacro
- INIT_XMM sse2
- SAD_W16 16
- SAD_W16 8
- INIT_XMM sse3
- SAD_W16 16
- SAD_W16 8
- INIT_XMM sse2, aligned
- SAD_W16 16
- SAD_W16 8
- %macro SAD_INC_4x8P_SSE 1
- movq m1, [r0]
- movq m2, [r0+r1]
- lea r0, [r0+2*r1]
- movq m3, [r2]
- movq m4, [r2+r3]
- lea r2, [r2+2*r3]
- movhps m1, [r0]
- movhps m2, [r0+r1]
- movhps m3, [r2]
- movhps m4, [r2+r3]
- lea r0, [r0+2*r1]
- psadbw m1, m3
- psadbw m2, m4
- lea r2, [r2+2*r3]
- ACCUM paddw, 0, 1, %1
- paddw m0, m2
- %endmacro
- INIT_XMM
- ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
- cglobal pixel_sad_8x16_sse2, 4,4
- SAD_INC_4x8P_SSE 0
- SAD_INC_4x8P_SSE 1
- SAD_INC_4x8P_SSE 1
- SAD_INC_4x8P_SSE 1
- SAD_END_SSE2
- %macro SAD_W48_AVX512 3 ; w, h, d/q
- cglobal pixel_sad_%1x%2, 4,4
- kxnorb k1, k1, k1
- kaddb k1, k1, k1
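- ; k1 = 0xfe: the masked broadcasts below fill every element except element 0, which keeps the
- ; plain load of the first row.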
- %assign %%i 0
- %if ARCH_X86_64 && %2 != 4
- lea r6, [3*r1]
- lea r5, [3*r3]
- %rep %2/4
- mov%3 m1, [r0]
- vpbroadcast%3 m1 {k1}, [r0+r1]
- mov%3 m3, [r2]
- vpbroadcast%3 m3 {k1}, [r2+r3]
- mov%3 m2, [r0+2*r1]
- vpbroadcast%3 m2 {k1}, [r0+r6]
- mov%3 m4, [r2+2*r3]
- vpbroadcast%3 m4 {k1}, [r2+r5]
- %if %%i != %2/4-1
- lea r0, [r0+4*r1]
- lea r2, [r2+4*r3]
- %endif
- psadbw m1, m3
- psadbw m2, m4
- ACCUM paddd, 0, 1, %%i
- paddd m0, m2
- %assign %%i %%i+1
- %endrep
- %else
- %rep %2/2
- mov%3 m1, [r0]
- vpbroadcast%3 m1 {k1}, [r0+r1]
- mov%3 m2, [r2]
- vpbroadcast%3 m2 {k1}, [r2+r3]
- %if %%i != %2/2-1
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- %endif
- psadbw m1, m2
- ACCUM paddd, 0, 1, %%i
- %assign %%i %%i+1
- %endrep
- %endif
- %if %1 == 8
- punpckhqdq m1, m0, m0
- paddd m0, m1
- %endif
- movd eax, m0
- RET
- %endmacro
- INIT_XMM avx512
- SAD_W48_AVX512 4, 4, d
- SAD_W48_AVX512 4, 8, d
- SAD_W48_AVX512 4, 16, d
- SAD_W48_AVX512 8, 4, q
- SAD_W48_AVX512 8, 8, q
- SAD_W48_AVX512 8, 16, q
- %macro SAD_W16_AVX512_START 1 ; h
- cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which
- jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
- lea r1, [3*r3]
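- ; r1 (the fenc stride) is free once the fenc rows are known to be contiguous (FENC_STRIDE == 16),
- ; so it is reused as 3*stride for the reference; the fenc side is then addressed with plain
- ; 32/64-byte offsets in the bodies below.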
- %endmacro
- %macro SAD_W16_AVX512_END 0
- paddd m0, m1
- paddd m0, m2
- paddd m0, m3
- %if mmsize == 64
- vextracti32x8 ym1, m0, 1
- paddd ym0, ym1
- %endif
- vextracti128 xm1, ym0, 1
- paddd xmm0, xm0, xm1
- punpckhqdq xmm1, xmm0, xmm0
- paddd xmm0, xmm1
- movd eax, xmm0
- RET
- %endmacro
- INIT_YMM avx512
- cglobal pixel_sad_16x8, 4,4
- SAD_W16_AVX512_START 8
- movu xm0, [r2]
- vinserti128 m0, [r2+r3], 1
- psadbw m0, [r0+0*32]
- movu xm1, [r2+2*r3]
- vinserti128 m1, [r2+r1], 1
- lea r2, [r2+4*r3]
- psadbw m1, [r0+1*32]
- movu xm2, [r2]
- vinserti128 m2, [r2+r3], 1
- psadbw m2, [r0+2*32]
- movu xm3, [r2+2*r3]
- vinserti128 m3, [r2+r1], 1
- psadbw m3, [r0+3*32]
- SAD_W16_AVX512_END
- INIT_ZMM avx512
- cglobal pixel_sad_16x16, 4,4
- SAD_W16_AVX512_START 16
- movu xm0, [r2]
- vinserti128 ym0, [r2+r3], 1
- movu xm1, [r2+4*r3]
- vinserti32x4 m0, [r2+2*r3], 2
- vinserti32x4 m1, [r2+2*r1], 2
- vinserti32x4 m0, [r2+r1], 3
- lea r2, [r2+4*r3]
- vinserti32x4 m1, [r2+r3], 1
- psadbw m0, [r0+0*64]
- vinserti32x4 m1, [r2+r1], 3
- lea r2, [r2+4*r3]
- psadbw m1, [r0+1*64]
- movu xm2, [r2]
- vinserti128 ym2, [r2+r3], 1
- movu xm3, [r2+4*r3]
- vinserti32x4 m2, [r2+2*r3], 2
- vinserti32x4 m3, [r2+2*r1], 2
- vinserti32x4 m2, [r2+r1], 3
- lea r2, [r2+4*r3]
- vinserti32x4 m3, [r2+r3], 1
- psadbw m2, [r0+2*64]
- vinserti32x4 m3, [r2+r1], 3
- psadbw m3, [r0+3*64]
- SAD_W16_AVX512_END
- ;-----------------------------------------------------------------------------
- ; int pixel_vsad( pixel *src, intptr_t stride, int height );
- ;-----------------------------------------------------------------------------
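- ; Reference semantics (illustrative sketch, names are placeholders), for a 16-pixel-wide block:
- ;   int vsad = 0;
- ;   for( int y = 0; y < height-1; y++ )
- ;       for( int x = 0; x < 16; x++ )
- ;           vsad += abs( src[y*stride + x] - src[(y+1)*stride + x] );
- ;   return vsad;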
- %if ARCH_X86_64 == 0
- INIT_MMX
- cglobal pixel_vsad_mmx2, 3,3
- mova m0, [r0]
- mova m1, [r0+8]
- mova m2, [r0+r1]
- mova m3, [r0+r1+8]
- lea r0, [r0+r1*2]
- psadbw m0, m2
- psadbw m1, m3
- paddw m0, m1
- sub r2d, 2
- je .end
- .loop:
- mova m4, [r0]
- mova m5, [r0+8]
- mova m6, [r0+r1]
- mova m7, [r0+r1+8]
- lea r0, [r0+r1*2]
- psadbw m2, m4
- psadbw m3, m5
- psadbw m4, m6
- psadbw m5, m7
- ;max sum: 31*16*255(pixel_max)=126480
- paddd m0, m2
- paddd m0, m3
- paddd m0, m4
- paddd m0, m5
- mova m2, m6
- mova m3, m7
- sub r2d, 2
- jg .loop
- .end:
- movd eax, m0
- RET
- %endif
- INIT_XMM
- cglobal pixel_vsad_sse2, 3,3
- mova m0, [r0]
- mova m1, [r0+r1]
- lea r0, [r0+r1*2]
- psadbw m0, m1
- sub r2d, 2
- je .end
- .loop:
- mova m2, [r0]
- mova m3, [r0+r1]
- lea r0, [r0+r1*2]
- psadbw m1, m2
- psadbw m2, m3
- paddw m0, m1
- paddw m0, m2
- mova m1, m3
- sub r2d, 2
- jg .loop
- .end:
- MOVHL m1, m0
- ;max sum: 31*16*255(pixel_max)=126480
- paddd m0, m1
- movd eax, m0
- RET
- ;-----------------------------------------------------------------------------
- ; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
- ;-----------------------------------------------------------------------------
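- ; Results are stored as res[0] = V, res[1] = H, res[2] = DC cost, matching the 0/1/2 order of the
- ; H.264 4x4 intra prediction modes.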
- cglobal intra_sad_x3_4x4_mmx2, 3,3
- pxor mm7, mm7
- movd mm0, [r1-FDEC_STRIDE]
- movd mm1, [r0+FENC_STRIDE*0]
- movd mm2, [r0+FENC_STRIDE*2]
- punpckldq mm0, mm0
- punpckldq mm1, [r0+FENC_STRIDE*1]
- punpckldq mm2, [r0+FENC_STRIDE*3]
- movq mm6, mm0
- movq mm3, mm1
- psadbw mm3, mm0
- psadbw mm0, mm2
- paddw mm0, mm3
- movd [r2], mm0 ;V prediction cost
- movd mm3, [r1+FDEC_STRIDE*0-4]
- movd mm0, [r1+FDEC_STRIDE*1-4]
- movd mm4, [r1+FDEC_STRIDE*2-4]
- movd mm5, [r1+FDEC_STRIDE*3-4]
- punpcklbw mm3, mm0
- punpcklbw mm4, mm5
- movq mm5, mm3
- punpckhwd mm5, mm4
- punpckhdq mm5, mm6
- psadbw mm5, mm7
- punpckhbw mm3, mm3
- punpckhbw mm4, mm4
- punpckhwd mm3, mm3
- punpckhwd mm4, mm4
- psraw mm5, 2
- pavgw mm5, mm7
- punpcklbw mm5, mm5
- pshufw mm5, mm5, 0 ;DC prediction
- movq mm6, mm5
- psadbw mm5, mm1
- psadbw mm6, mm2
- psadbw mm1, mm3
- psadbw mm2, mm4
- paddw mm5, mm6
- paddw mm1, mm2
- movd [r2+8], mm5 ;DC prediction cost
- movd [r2+4], mm1 ;H prediction cost
- RET
- ;-----------------------------------------------------------------------------
- ; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
- ;-----------------------------------------------------------------------------
- ;m0 = DC
- ;m6 = V
- ;m7 = H
- ;m1 = DC score
- ;m2 = V score
- ;m3 = H score
- ;m5 = pixel row
- ;m4 = temp
- %macro INTRA_SAD_HVDC_ITER 2
- movq m5, [r0+FENC_STRIDE*%1]
- movq m4, m5
- psadbw m4, m0
- ACCUM paddw, 1, 4, %1
- movq m4, m5
- psadbw m4, m6
- ACCUM paddw, 2, 4, %1
- pshufw m4, m7, %2
- psadbw m5, m4
- ACCUM paddw, 3, 5, %1
- %endmacro
- INIT_MMX
- cglobal intra_sad_x3_8x8_mmx2, 3,3
- movq m7, [r1+7]
- pxor m0, m0
- movq m6, [r1+16] ;V prediction
- pxor m1, m1
- psadbw m0, m7
- psadbw m1, m6
- paddw m0, m1
- paddw m0, [pw_8]
- psrlw m0, 4
- punpcklbw m0, m0
- pshufw m0, m0, q0000 ;DC prediction
- punpckhbw m7, m7
- INTRA_SAD_HVDC_ITER 0, q3333
- INTRA_SAD_HVDC_ITER 1, q2222
- INTRA_SAD_HVDC_ITER 2, q1111
- INTRA_SAD_HVDC_ITER 3, q0000
- movq m7, [r1+7]
- punpcklbw m7, m7
- INTRA_SAD_HVDC_ITER 4, q3333
- INTRA_SAD_HVDC_ITER 5, q2222
- INTRA_SAD_HVDC_ITER 6, q1111
- INTRA_SAD_HVDC_ITER 7, q0000
- movd [r2+0], m2
- movd [r2+4], m3
- movd [r2+8], m1
- RET
- ;-----------------------------------------------------------------------------
- ; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
- ;-----------------------------------------------------------------------------
- %macro INTRA_SAD_HV_ITER 1
- %if cpuflag(ssse3)
- movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
- movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
- pshufb m1, m7
- pshufb m3, m7
- %else
- movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
- movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
- punpckhbw m1, m1
- punpckhbw m3, m3
- pshufw m1, m1, q3333
- pshufw m3, m3, q3333
- %endif
- movq m4, [r0 + FENC_STRIDE*(%1+0)]
- movq m5, [r0 + FENC_STRIDE*(%1+1)]
- psadbw m1, m4
- psadbw m3, m5
- psadbw m4, m6
- psadbw m5, m6
- paddw m1, m3
- paddw m4, m5
- ACCUM paddw, 0, 1, %1
- ACCUM paddw, 2, 4, %1
- %endmacro
- %macro INTRA_SAD_8x8C 0
- cglobal intra_sad_x3_8x8c, 3,3
- movq m6, [r1 - FDEC_STRIDE]
- add r1, FDEC_STRIDE*4
- %if cpuflag(ssse3)
- movq m7, [pb_3]
- %endif
- INTRA_SAD_HV_ITER 0
- INTRA_SAD_HV_ITER 2
- INTRA_SAD_HV_ITER 4
- INTRA_SAD_HV_ITER 6
- movd [r2+4], m0
- movd [r2+8], m2
- pxor m7, m7
- movq m2, [r1 + FDEC_STRIDE*-4 - 8]
- movq m4, [r1 + FDEC_STRIDE*-2 - 8]
- movq m3, [r1 + FDEC_STRIDE* 0 - 8]
- movq m5, [r1 + FDEC_STRIDE* 2 - 8]
- punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
- punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
- punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
- punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
- punpckhbw m2, m4
- punpckhbw m3, m5
- psrlq m2, 32
- psrlq m3, 32
- psadbw m2, m7 ; s2
- psadbw m3, m7 ; s3
- movq m1, m6
- SWAP 0, 6
- punpckldq m0, m7
- punpckhdq m1, m7
- psadbw m0, m7 ; s0
- psadbw m1, m7 ; s1
- punpcklwd m0, m1
- punpcklwd m2, m3
- punpckldq m0, m2 ;s0 s1 s2 s3
- pshufw m3, m0, q3312 ;s2,s1,s3,s3
- pshufw m0, m0, q1310 ;s0,s1,s3,s1
- paddw m0, m3
- psrlw m0, 2
- pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
- %if cpuflag(ssse3)
- movq2dq xmm0, m0
- pshufb xmm0, [pb_shuf8x8c]
- movq xmm1, [r0+FENC_STRIDE*0]
- movq xmm2, [r0+FENC_STRIDE*1]
- movq xmm3, [r0+FENC_STRIDE*2]
- movq xmm4, [r0+FENC_STRIDE*3]
- movhps xmm1, [r0+FENC_STRIDE*4]
- movhps xmm2, [r0+FENC_STRIDE*5]
- movhps xmm3, [r0+FENC_STRIDE*6]
- movhps xmm4, [r0+FENC_STRIDE*7]
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
- MOVHL xmm0, xmm1
- paddw xmm1, xmm0
- movd [r2], xmm1
- %else
- packuswb m0, m0
- punpcklbw m0, m0
- movq m1, m0
- punpcklbw m0, m0 ; 4x dc0 4x dc1
- punpckhbw m1, m1 ; 4x dc2 4x dc3
- movq m2, [r0+FENC_STRIDE*0]
- movq m3, [r0+FENC_STRIDE*1]
- movq m4, [r0+FENC_STRIDE*2]
- movq m5, [r0+FENC_STRIDE*3]
- movq m6, [r0+FENC_STRIDE*4]
- movq m7, [r0+FENC_STRIDE*5]
- psadbw m2, m0
- psadbw m3, m0
- psadbw m4, m0
- psadbw m5, m0
- movq m0, [r0+FENC_STRIDE*6]
- psadbw m6, m1
- psadbw m7, m1
- psadbw m0, m1
- psadbw m1, [r0+FENC_STRIDE*7]
- paddw m2, m3
- paddw m4, m5
- paddw m6, m7
- paddw m0, m1
- paddw m2, m4
- paddw m6, m0
- paddw m2, m6
- movd [r2], m2
- %endif
- RET
- %endmacro
- INIT_MMX mmx2
- INTRA_SAD_8x8C
- INIT_MMX ssse3
- INTRA_SAD_8x8C
- INIT_YMM avx2
- cglobal intra_sad_x3_8x8c, 3,3,7
- vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred
- add r1, FDEC_STRIDE*4-1
- pxor xm5, xm5
- punpckldq xm3, xm2, xm5 ; V0 _ V1 _
- movd xm0, [r1 + FDEC_STRIDE*-1 - 3]
- movd xm1, [r1 + FDEC_STRIDE* 3 - 3]
- pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0
- pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0
- pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1
- pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1
- pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2
- pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2
- punpcklqdq xm0, xm1 ; H0 _ H1 _
- vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1
- pshufb xm0, [hpred_shuf] ; H00224466 H11335577
- psadbw m3, m5 ; s0 s1 s2 s3
- vpermq m4, m3, q3312 ; s2 s1 s3 s3
- vpermq m3, m3, q1310 ; s0 s1 s3 s1
- paddw m3, m4
- psrlw m3, 2
- pavgw m3, m5 ; s0+s2 s1 s3 s1+s3
- pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _
- vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V
- vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V
- vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V
- vpermq m0, m0, q3120 ; H00224466 _ H11335577 _
- movddup m2, [r0+FENC_STRIDE*0]
- movddup m4, [r0+FENC_STRIDE*2]
- pshuflw m3, m0, q0000
- psadbw m3, m2
- psadbw m2, m1
- pshuflw m5, m0, q1111
- psadbw m5, m4
- psadbw m4, m1
- paddw m2, m4
- paddw m3, m5
- movddup m4, [r0+FENC_STRIDE*4]
- pshuflw m5, m0, q2222
- psadbw m5, m4
- psadbw m4, m6
- paddw m2, m4
- paddw m3, m5
- movddup m4, [r0+FENC_STRIDE*6]
- pshuflw m5, m0, q3333
- psadbw m5, m4
- psadbw m4, m6
- paddw m2, m4
- paddw m3, m5
- vextracti128 xm0, m2, 1
- vextracti128 xm1, m3, 1
- paddw xm2, xm0 ; DC V
- paddw xm3, xm1 ; H
- pextrd [r2+8], xm2, 2 ; V
- movd [r2+4], xm3 ; H
- movd [r2+0], xm2 ; DC
- RET
- ;-----------------------------------------------------------------------------
- ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
- ;-----------------------------------------------------------------------------
- ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
- ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
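- ; The DC value is ((sum of the 16 top + 16 left neighbours) + 16) >> 5, broadcast to every byte
- ; lane via the 0x01010101 multiply below.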
- %macro INTRA_SAD16 0
- cglobal intra_sad_x3_16x16, 3,5,8
- pxor mm0, mm0
- pxor mm1, mm1
- psadbw mm0, [r1-FDEC_STRIDE+0]
- psadbw mm1, [r1-FDEC_STRIDE+8]
- paddw mm0, mm1
- movd r3d, mm0
- %if cpuflag(ssse3)
- mova m1, [pb_3]
- %endif
- %assign x 0
- %rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
- %if (x&3)==3 && x!=15
- add r1, FDEC_STRIDE*4
- %endif
- add r3d, r4d
- %assign x x+1
- %endrep
- sub r1, FDEC_STRIDE*12
- add r3d, 16
- shr r3d, 5
- imul r3d, 0x01010101
- movd m7, r3d
- mova m5, [r1-FDEC_STRIDE]
- %if mmsize==16
- pshufd m7, m7, 0
- %else
- mova m1, [r1-FDEC_STRIDE+8]
- punpckldq m7, m7
- %endif
- pxor m4, m4
- pxor m3, m3
- pxor m2, m2
- mov r3d, 15*FENC_STRIDE
- .vloop:
- SPLATB_LOAD m6, r1+r3*2-1, m1
- mova m0, [r0+r3]
- psadbw m0, m7
- paddw m4, m0
- mova m0, [r0+r3]
- psadbw m0, m5
- paddw m2, m0
- %if mmsize==8
- mova m0, [r0+r3]
- psadbw m0, m6
- paddw m3, m0
- mova m0, [r0+r3+8]
- psadbw m0, m7
- paddw m4, m0
- mova m0, [r0+r3+8]
- psadbw m0, m1
- paddw m2, m0
- psadbw m6, [r0+r3+8]
- paddw m3, m6
- %else
- psadbw m6, [r0+r3]
- paddw m3, m6
- %endif
- add r3d, -FENC_STRIDE
- jge .vloop
- %if mmsize==16
- pslldq m3, 4
- por m3, m2
- MOVHL m1, m3
- paddw m3, m1
- movq [r2+0], m3
- MOVHL m1, m4
- paddw m4, m1
- %else
- movd [r2+0], m2
- movd [r2+4], m3
- %endif
- movd [r2+8], m4
- RET
- %endmacro
- INIT_MMX mmx2
- INTRA_SAD16
- INIT_XMM sse2
- INTRA_SAD16
- INIT_XMM ssse3
- INTRA_SAD16
- INIT_YMM avx2
- cglobal intra_sad_x3_16x16, 3,5,6
- pxor xm0, xm0
- psadbw xm0, [r1-FDEC_STRIDE]
- MOVHL xm1, xm0
- paddw xm0, xm1
- movd r3d, xm0
- %assign x 0
- %rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
- %if (x&3)==3 && x!=15
- add r1, FDEC_STRIDE*4
- %endif
- add r3d, r4d
- %assign x x+1
- %endrep
- sub r1, FDEC_STRIDE*12
- add r3d, 16
- shr r3d, 5
- movd xm5, r3d
- vpbroadcastb xm5, xm5
- vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
- pxor m4, m4 ; DC / V accumulator
- pxor xm3, xm3 ; H accumulator
- mov r3d, 15*FENC_STRIDE
- .vloop:
- vpbroadcastb xm2, [r1+r3*2-1]
- vbroadcasti128 m0, [r0+r3]
- psadbw m1, m0, m5
- psadbw xm0, xm2
- paddw m4, m1
- paddw xm3, xm0
- add r3d, -FENC_STRIDE
- jge .vloop
- punpckhqdq m5, m4, m4
- MOVHL xm2, xm3
- paddw m4, m5 ; DC / V
- paddw xm3, xm2 ; H
- vextracti128 xm2, m4, 1
- movd [r2+0], xm2
- movd [r2+4], xm3
- movd [r2+8], xm4
- RET
- ;=============================================================================
- ; SAD x3/x4 MMX
- ;=============================================================================
- %macro SAD_X3_START_1x8P 0
- movq mm3, [r0]
- movq mm0, [r1]
- movq mm1, [r2]
- movq mm2, [r3]
- psadbw mm0, mm3
- psadbw mm1, mm3
- psadbw mm2, mm3
- %endmacro
- %macro SAD_X3_1x8P 2
- movq mm3, [r0+%1]
- movq mm4, [r1+%2]
- movq mm5, [r2+%2]
- movq mm6, [r3+%2]
- psadbw mm4, mm3
- psadbw mm5, mm3
- psadbw mm6, mm3
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- %endmacro
- %macro SAD_X3_START_2x4P 3
- movd mm3, [r0]
- movd %1, [r1]
- movd %2, [r2]
- movd %3, [r3]
- punpckldq mm3, [r0+FENC_STRIDE]
- punpckldq %1, [r1+r4]
- punpckldq %2, [r2+r4]
- punpckldq %3, [r3+r4]
- psadbw %1, mm3
- psadbw %2, mm3
- psadbw %3, mm3
- %endmacro
- %macro SAD_X3_2x16P 1
- %if %1
- SAD_X3_START_1x8P
- %else
- SAD_X3_1x8P 0, 0
- %endif
- SAD_X3_1x8P 8, 8
- SAD_X3_1x8P FENC_STRIDE, r4
- SAD_X3_1x8P FENC_STRIDE+8, r4+8
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X3_2x8P 1
- %if %1
- SAD_X3_START_1x8P
- %else
- SAD_X3_1x8P 0, 0
- %endif
- SAD_X3_1x8P FENC_STRIDE, r4
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X3_2x4P 1
- %if %1
- SAD_X3_START_2x4P mm0, mm1, mm2
- %else
- SAD_X3_START_2x4P mm4, mm5, mm6
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- %endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r4]
- lea r2, [r2+2*r4]
- lea r3, [r3+2*r4]
- %endmacro
- %macro SAD_X4_START_1x8P 0
- movq mm7, [r0]
- movq mm0, [r1]
- movq mm1, [r2]
- movq mm2, [r3]
- movq mm3, [r4]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
- %endmacro
- %macro SAD_X4_1x8P 2
- movq mm7, [r0+%1]
- movq mm4, [r1+%2]
- movq mm5, [r2+%2]
- movq mm6, [r3+%2]
- psadbw mm4, mm7
- psadbw mm5, mm7
- psadbw mm6, mm7
- psadbw mm7, [r4+%2]
- paddw mm0, mm4
- paddw mm1, mm5
- paddw mm2, mm6
- paddw mm3, mm7
- %endmacro
- %macro SAD_X4_START_2x4P 0
- movd mm7, [r0]
- movd mm0, [r1]
- movd mm1, [r2]
- movd mm2, [r3]
- movd mm3, [r4]
- punpckldq mm7, [r0+FENC_STRIDE]
- punpckldq mm0, [r1+r5]
- punpckldq mm1, [r2+r5]
- punpckldq mm2, [r3+r5]
- punpckldq mm3, [r4+r5]
- psadbw mm0, mm7
- psadbw mm1, mm7
- psadbw mm2, mm7
- psadbw mm3, mm7
- %endmacro
- %macro SAD_X4_INC_2x4P 0
- movd mm7, [r0]
- movd mm4, [r1]
- movd mm5, [r2]
- punpckldq mm7, [r0+FENC_STRIDE]
- punpckldq mm4, [r1+r5]
- punpckldq mm5, [r2+r5]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm0, mm4
- paddw mm1, mm5
- movd mm4, [r3]
- movd mm5, [r4]
- punpckldq mm4, [r3+r5]
- punpckldq mm5, [r4+r5]
- psadbw mm4, mm7
- psadbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- %endmacro
- %macro SAD_X4_2x16P 1
- %if %1
- SAD_X4_START_1x8P
- %else
- SAD_X4_1x8P 0, 0
- %endif
- SAD_X4_1x8P 8, 8
- SAD_X4_1x8P FENC_STRIDE, r5
- SAD_X4_1x8P FENC_STRIDE+8, r5+8
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X4_2x8P 1
- %if %1
- SAD_X4_START_1x8P
- %else
- SAD_X4_1x8P 0, 0
- %endif
- SAD_X4_1x8P FENC_STRIDE, r5
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X4_2x4P 1
- %if %1
- SAD_X4_START_2x4P
- %else
- SAD_X4_INC_2x4P
- %endif
- add r0, 2*FENC_STRIDE
- lea r1, [r1+2*r5]
- lea r2, [r2+2*r5]
- lea r3, [r3+2*r5]
- lea r4, [r4+2*r5]
- %endmacro
- %macro SAD_X3_END 0
- %if UNIX64
- movd [r5+0], mm0
- movd [r5+4], mm1
- movd [r5+8], mm2
- %else
- mov r0, r5mp
- movd [r0+0], mm0
- movd [r0+4], mm1
- movd [r0+8], mm2
- %endif
- RET
- %endmacro
- %macro SAD_X4_END 0
- mov r0, r6mp
- movd [r0+0], mm0
- movd [r0+4], mm1
- movd [r0+8], mm2
- movd [r0+12], mm3
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
- ; uint8_t *pix2, intptr_t i_stride, int scores[3] )
- ;-----------------------------------------------------------------------------
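- ; Computes scores[i] = SAD( fenc, pix_i ) for all candidate blocks in one pass; fenc is read with
- ; FENC_STRIDE, the candidates share i_stride. The x4 variants take an extra pix3/scores[4] pair.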
- %macro SAD_X 3
- cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
- SAD_X%1_2x%2P 1
- %rep %3/2-1
- SAD_X%1_2x%2P 0
- %endrep
- SAD_X%1_END
- %endmacro
- INIT_MMX
- SAD_X 3, 16, 16
- SAD_X 3, 16, 8
- SAD_X 3, 8, 16
- SAD_X 3, 8, 8
- SAD_X 3, 8, 4
- SAD_X 3, 4, 8
- SAD_X 3, 4, 4
- SAD_X 4, 16, 16
- SAD_X 4, 16, 8
- SAD_X 4, 8, 16
- SAD_X 4, 8, 8
- SAD_X 4, 8, 4
- SAD_X 4, 4, 8
- SAD_X 4, 4, 4
- ;=============================================================================
- ; SAD x3/x4 XMM
- ;=============================================================================
- %macro SAD_X3_START_1x16P_SSE2 0
- mova m2, [r0]
- %if cpuflag(avx)
- psadbw m0, m2, [r1]
- psadbw m1, m2, [r2]
- psadbw m2, [r3]
- %else
- movu m0, [r1]
- movu m1, [r2]
- movu m3, [r3]
- psadbw m0, m2
- psadbw m1, m2
- psadbw m2, m3
- %endif
- %endmacro
- %macro SAD_X3_1x16P_SSE2 2
- mova m3, [r0+%1]
- %if cpuflag(avx)
- psadbw m4, m3, [r1+%2]
- psadbw m5, m3, [r2+%2]
- psadbw m3, [r3+%2]
- %else
- movu m4, [r1+%2]
- movu m5, [r2+%2]
- movu m6, [r3+%2]
- psadbw m4, m3
- psadbw m5, m3
- psadbw m3, m6
- %endif
- paddw m0, m4
- paddw m1, m5
- paddw m2, m3
- %endmacro
- %if ARCH_X86_64
- DECLARE_REG_TMP 6
- %else
- DECLARE_REG_TMP 5
- %endif
- %macro SAD_X3_4x16P_SSE2 2
- %if %1==0
- lea t0, [r4*3]
- SAD_X3_START_1x16P_SSE2
- %else
- SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
- %endif
- SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
- SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
- SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r4]
- lea r2, [r2+4*r4]
- lea r3, [r3+4*r4]
- %endif
- %endmacro
- %macro SAD_X3_START_2x8P_SSE2 0
- movq m3, [r0]
- movq m0, [r1]
- movq m1, [r2]
- movq m2, [r3]
- movhps m3, [r0+FENC_STRIDE]
- movhps m0, [r1+r4]
- movhps m1, [r2+r4]
- movhps m2, [r3+r4]
- psadbw m0, m3
- psadbw m1, m3
- psadbw m2, m3
- %endmacro
- %macro SAD_X3_2x8P_SSE2 4
- movq m6, [r0+%1]
- movq m3, [r1+%2]
- movq m4, [r2+%2]
- movq m5, [r3+%2]
- movhps m6, [r0+%3]
- movhps m3, [r1+%4]
- movhps m4, [r2+%4]
- movhps m5, [r3+%4]
- psadbw m3, m6
- psadbw m4, m6
- psadbw m5, m6
- paddw m0, m3
- paddw m1, m4
- paddw m2, m5
- %endmacro
- %macro SAD_X4_START_2x8P_SSE2 0
- movq m4, [r0]
- movq m0, [r1]
- movq m1, [r2]
- movq m2, [r3]
- movq m3, [r4]
- movhps m4, [r0+FENC_STRIDE]
- movhps m0, [r1+r5]
- movhps m1, [r2+r5]
- movhps m2, [r3+r5]
- movhps m3, [r4+r5]
- psadbw m0, m4
- psadbw m1, m4
- psadbw m2, m4
- psadbw m3, m4
- %endmacro
- %macro SAD_X4_2x8P_SSE2 4
- movq m6, [r0+%1]
- movq m4, [r1+%2]
- movq m5, [r2+%2]
- movhps m6, [r0+%3]
- movhps m4, [r1+%4]
- movhps m5, [r2+%4]
- psadbw m4, m6
- psadbw m5, m6
- paddw m0, m4
- paddw m1, m5
- movq m4, [r3+%2]
- movq m5, [r4+%2]
- movhps m4, [r3+%4]
- movhps m5, [r4+%4]
- psadbw m4, m6
- psadbw m5, m6
- paddw m2, m4
- paddw m3, m5
- %endmacro
- %macro SAD_X4_START_1x16P_SSE2 0
- mova m3, [r0]
- %if cpuflag(avx)
- psadbw m0, m3, [r1]
- psadbw m1, m3, [r2]
- psadbw m2, m3, [r3]
- psadbw m3, [r4]
- %else
- movu m0, [r1]
- movu m1, [r2]
- movu m2, [r3]
- movu m4, [r4]
- psadbw m0, m3
- psadbw m1, m3
- psadbw m2, m3
- psadbw m3, m4
- %endif
- %endmacro
- %macro SAD_X4_1x16P_SSE2 2
- mova m6, [r0+%1]
- %if cpuflag(avx)
- psadbw m4, m6, [r1+%2]
- psadbw m5, m6, [r2+%2]
- %else
- movu m4, [r1+%2]
- movu m5, [r2+%2]
- psadbw m4, m6
- psadbw m5, m6
- %endif
- paddw m0, m4
- paddw m1, m5
- %if cpuflag(avx)
- psadbw m4, m6, [r3+%2]
- psadbw m5, m6, [r4+%2]
- %else
- movu m4, [r3+%2]
- movu m5, [r4+%2]
- psadbw m4, m6
- psadbw m5, m6
- %endif
- paddw m2, m4
- paddw m3, m5
- %endmacro
- %macro SAD_X4_4x16P_SSE2 2
- %if %1==0
- lea r6, [r5*3]
- SAD_X4_START_1x16P_SSE2
- %else
- SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
- %endif
- SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
- SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
- SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r5]
- lea r2, [r2+4*r5]
- lea r3, [r3+4*r5]
- lea r4, [r4+4*r5]
- %endif
- %endmacro
- %macro SAD_X3_4x8P_SSE2 2
- %if %1==0
- lea t0, [r4*3]
- SAD_X3_START_2x8P_SSE2
- %else
- SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
- %endif
- SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r4]
- lea r2, [r2+4*r4]
- lea r3, [r3+4*r4]
- %endif
- %endmacro
- %macro SAD_X4_4x8P_SSE2 2
- %if %1==0
- lea r6, [r5*3]
- SAD_X4_START_2x8P_SSE2
- %else
- SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
- %endif
- SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r5]
- lea r2, [r2+4*r5]
- lea r3, [r3+4*r5]
- lea r4, [r4+4*r5]
- %endif
- %endmacro
- %macro SAD_X3_END_SSE2 0
- movifnidn r5, r5mp
- %if cpuflag(ssse3)
- packssdw m0, m1
- packssdw m2, m2
- phaddd m0, m2
- mova [r5], m0
- %else
- movhlps m3, m0
- movhlps m4, m1
- movhlps m5, m2
- paddw m0, m3
- paddw m1, m4
- paddw m2, m5
- movd [r5+0], m0
- movd [r5+4], m1
- movd [r5+8], m2
- %endif
- RET
- %endmacro
- %macro SAD_X4_END_SSE2 0
- mov r0, r6mp
- %if cpuflag(ssse3)
- packssdw m0, m1
- packssdw m2, m3
- phaddd m0, m2
- mova [r0], m0
- %else
- psllq m1, 32
- psllq m3, 32
- paddw m0, m1
- paddw m2, m3
- movhlps m1, m0
- movhlps m3, m2
- paddw m0, m1
- paddw m2, m3
- movq [r0+0], m0
- movq [r0+8], m2
- %endif
- RET
- %endmacro
- %macro SAD_X4_START_2x8P_SSSE3 0
- movddup m4, [r0]
- movq m0, [r1]
- movq m1, [r3]
- movhps m0, [r2]
- movhps m1, [r4]
- movddup m5, [r0+FENC_STRIDE]
- movq m2, [r1+r5]
- movq m3, [r3+r5]
- movhps m2, [r2+r5]
- movhps m3, [r4+r5]
- psadbw m0, m4
- psadbw m1, m4
- psadbw m2, m5
- psadbw m3, m5
- paddw m0, m2
- paddw m1, m3
- %endmacro
- %macro SAD_X4_2x8P_SSSE3 4
- movddup m6, [r0+%1]
- movq m2, [r1+%2]
- movq m3, [r3+%2]
- movhps m2, [r2+%2]
- movhps m3, [r4+%2]
- movddup m7, [r0+%3]
- movq m4, [r1+%4]
- movq m5, [r3+%4]
- movhps m4, [r2+%4]
- movhps m5, [r4+%4]
- psadbw m2, m6
- psadbw m3, m6
- psadbw m4, m7
- psadbw m5, m7
- paddw m0, m2
- paddw m1, m3
- paddw m0, m4
- paddw m1, m5
- %endmacro
- %macro SAD_X4_4x8P_SSSE3 2
- %if %1==0
- lea r6, [r5*3]
- SAD_X4_START_2x8P_SSSE3
- %else
- SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
- %endif
- SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r5]
- lea r2, [r2+4*r5]
- lea r3, [r3+4*r5]
- lea r4, [r4+4*r5]
- %endif
- %endmacro
- %macro SAD_X4_END_SSSE3 0
- mov r0, r6mp
- packssdw m0, m1
- mova [r0], m0
- RET
- %endmacro
- %macro SAD_X3_START_2x16P_AVX2 0
- movu m3, [r0] ; assumes FENC_STRIDE == 16
- movu xm0, [r1]
- movu xm1, [r2]
- movu xm2, [r3]
- vinserti128 m0, m0, [r1+r4], 1
- vinserti128 m1, m1, [r2+r4], 1
- vinserti128 m2, m2, [r3+r4], 1
- psadbw m0, m3
- psadbw m1, m3
- psadbw m2, m3
- %endmacro
- %macro SAD_X3_2x16P_AVX2 3
- movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
- movu xm4, [r1+%2]
- movu xm5, [r2+%2]
- movu xm6, [r3+%2]
- vinserti128 m4, m4, [r1+%3], 1
- vinserti128 m5, m5, [r2+%3], 1
- vinserti128 m6, m6, [r3+%3], 1
- psadbw m4, m3
- psadbw m5, m3
- psadbw m6, m3
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- %endmacro
- %macro SAD_X3_4x16P_AVX2 2
- %if %1==0
- lea t0, [r4*3]
- SAD_X3_START_2x16P_AVX2
- %else
- SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
- %endif
- SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r4]
- lea r2, [r2+4*r4]
- lea r3, [r3+4*r4]
- %endif
- %endmacro
- %macro SAD_X4_START_2x16P_AVX2 0
- vbroadcasti128 m4, [r0]
- vbroadcasti128 m5, [r0+FENC_STRIDE]
- movu xm0, [r1]
- movu xm1, [r2]
- movu xm2, [r1+r5]
- movu xm3, [r2+r5]
- vinserti128 m0, m0, [r3], 1
- vinserti128 m1, m1, [r4], 1
- vinserti128 m2, m2, [r3+r5], 1
- vinserti128 m3, m3, [r4+r5], 1
- psadbw m0, m4
- psadbw m1, m4
- psadbw m2, m5
- psadbw m3, m5
- paddw m0, m2
- paddw m1, m3
- %endmacro
- %macro SAD_X4_2x16P_AVX2 4
- vbroadcasti128 m6, [r0+%1]
- vbroadcasti128 m7, [r0+%3]
- movu xm2, [r1+%2]
- movu xm3, [r2+%2]
- movu xm4, [r1+%4]
- movu xm5, [r2+%4]
- vinserti128 m2, m2, [r3+%2], 1
- vinserti128 m3, m3, [r4+%2], 1
- vinserti128 m4, m4, [r3+%4], 1
- vinserti128 m5, m5, [r4+%4], 1
- psadbw m2, m6
- psadbw m3, m6
- psadbw m4, m7
- psadbw m5, m7
- paddw m0, m2
- paddw m1, m3
- paddw m0, m4
- paddw m1, m5
- %endmacro
- %macro SAD_X4_4x16P_AVX2 2
- %if %1==0
- lea r6, [r5*3]
- SAD_X4_START_2x16P_AVX2
- %else
- SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
- %endif
- SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
- %if %1 != %2-1
- %if (%1&1) != 0
- add r0, 8*FENC_STRIDE
- %endif
- lea r1, [r1+4*r5]
- lea r2, [r2+4*r5]
- lea r3, [r3+4*r5]
- lea r4, [r4+4*r5]
- %endif
- %endmacro
- %macro SAD_X3_END_AVX2 0
- movifnidn r5, r5mp
- packssdw m0, m1 ; 0 0 1 1 0 0 1 1
- packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
- phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
- vextracti128 xm1, m0, 1
- paddd xm0, xm1 ; 0 1 2 _
- mova [r5], xm0
- RET
- %endmacro
- %macro SAD_X4_END_AVX2 0
- mov r0, r6mp
- packssdw m0, m1 ; 0 0 1 1 2 2 3 3
- vextracti128 xm1, m0, 1
- phaddd xm0, xm1 ; 0 1 2 3
- mova [r0], xm0
- RET
- %endmacro
- ;-----------------------------------------------------------------------------
- ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
- ; uint8_t *pix2, intptr_t i_stride, int scores[3] )
- ;-----------------------------------------------------------------------------
- %macro SAD_X_SSE2 4
- cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
- %assign x 0
- %rep %3/4
- SAD_X%1_4x%2P_SSE2 x, %3/4
- %assign x x+1
- %endrep
- SAD_X%1_END_SSE2
- %endmacro
- INIT_XMM sse2
- SAD_X_SSE2 3, 16, 16, 7
- SAD_X_SSE2 3, 16, 8, 7
- SAD_X_SSE2 3, 8, 16, 7
- SAD_X_SSE2 3, 8, 8, 7
- SAD_X_SSE2 3, 8, 4, 7
- SAD_X_SSE2 4, 16, 16, 7
- SAD_X_SSE2 4, 16, 8, 7
- SAD_X_SSE2 4, 8, 16, 7
- SAD_X_SSE2 4, 8, 8, 7
- SAD_X_SSE2 4, 8, 4, 7
- INIT_XMM sse3
- SAD_X_SSE2 3, 16, 16, 7
- SAD_X_SSE2 3, 16, 8, 7
- SAD_X_SSE2 4, 16, 16, 7
- SAD_X_SSE2 4, 16, 8, 7
- %macro SAD_X_SSSE3 3
- cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
- %assign x 0
- %rep %3/4
- SAD_X%1_4x%2P_SSSE3 x, %3/4
- %assign x x+1
- %endrep
- SAD_X%1_END_SSSE3
- %endmacro
- INIT_XMM ssse3
- SAD_X_SSE2 3, 16, 16, 7
- SAD_X_SSE2 3, 16, 8, 7
- SAD_X_SSE2 4, 16, 16, 7
- SAD_X_SSE2 4, 16, 8, 7
- SAD_X_SSSE3 4, 8, 16
- SAD_X_SSSE3 4, 8, 8
- SAD_X_SSSE3 4, 8, 4
- INIT_XMM avx
- SAD_X_SSE2 3, 16, 16, 6
- SAD_X_SSE2 3, 16, 8, 6
- SAD_X_SSE2 4, 16, 16, 7
- SAD_X_SSE2 4, 16, 8, 7
- %macro SAD_X_AVX2 4
- cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
- %assign x 0
- %rep %3/4
- SAD_X%1_4x%2P_AVX2 x, %3/4
- %assign x x+1
- %endrep
- SAD_X%1_END_AVX2
- %endmacro
- INIT_YMM avx2
- SAD_X_AVX2 3, 16, 16, 7
- SAD_X_AVX2 3, 16, 8, 7
- SAD_X_AVX2 4, 16, 16, 8
- SAD_X_AVX2 4, 16, 8, 8
- %macro SAD_X_W4_AVX512 2 ; x, h
- cglobal pixel_sad_x%1_4x%2, %1+2,%1+3
- mov t1d, 0xa
- kmovb k1, t1d
- lea t1, [3*t0]
- kaddb k2, k1, k1
- kshiftlb k3, k1, 2
- %assign %%i 0
- %rep %2/4
- movu m6, [r0+%%i*64]
- vmovddup m6 {k1}, [r0+%%i*64+32]
- movd xmm2, [r1]
- movd xmm4, [r1+t0]
- vpbroadcastd xmm2 {k1}, [r1+2*t0]
- vpbroadcastd xmm4 {k1}, [r1+t1]
- vpbroadcastd xmm2 {k2}, [r2+t0]
- vpbroadcastd xmm4 {k2}, [r2]
- vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3
- vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2
- vpmovqd s1, m6 ; s0 s2 s1 s3
- movd xmm3, [r3]
- movd xmm5, [r3+t0]
- vpbroadcastd xmm3 {k1}, [r3+2*t0]
- vpbroadcastd xmm5 {k1}, [r3+t1]
- %if %1 == 4
- vpbroadcastd xmm3 {k2}, [r4+t0]
- vpbroadcastd xmm5 {k2}, [r4]
- vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3
- vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2
- %endif
- %if %%i != %2/4-1
- %assign %%j 1
- %rep %1
- lea r%+%%j, [r%+%%j+4*t0]
- %assign %%j %%j+1
- %endrep
- %endif
- pshufd s2, s1, q1032
- psadbw xmm2, s1
- psadbw xmm4, s2
- psadbw xmm3, s1
- psadbw xmm5, s2
- %if %%i
- paddd xmm0, xmm2
- paddd xmm1, xmm3
- paddd xmm0, xmm4
- paddd xmm1, xmm5
- %else
- paddd xmm0, xmm2, xmm4
- paddd xmm1, xmm3, xmm5
- %endif
- %assign %%i %%i+1
- %endrep
- %if %1 == 4
- movifnidn t2, r6mp
- %else
- movifnidn t2, r5mp
- %endif
- packusdw xmm0, xmm1
- mova [t2], xmm0
- RET
- %endmacro
- %macro SAD_X_W8_AVX512 2 ; x, h
- cglobal pixel_sad_x%1_8x%2, %1+2,%1+3
- kxnorb k3, k3, k3
- lea t1, [3*t0]
- kaddb k1, k3, k3
- kshiftlb k2, k3, 2
- kshiftlb k3, k3, 3
- %assign %%i 0
- %rep %2/4
- movddup m6, [r0+%%i*64] ; s0 s0 s1 s1
- movq xm2, [r1]
- movq xm4, [r1+2*t0]
- vpbroadcastq xm2 {k1}, [r2]
- vpbroadcastq xm4 {k1}, [r2+2*t0]
- vpbroadcastq m2 {k2}, [r1+t0]
- vpbroadcastq m4 {k2}, [r1+t1]
- vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1
- vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3
- movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3
- movq xm3, [r3]
- movq xm5, [r3+2*t0]
- %if %1 == 4
- vpbroadcastq xm3 {k1}, [r4]
- vpbroadcastq xm5 {k1}, [r4+2*t0]
- %endif
- vpbroadcastq m3 {k2}, [r3+t0]
- vpbroadcastq m5 {k2}, [r3+t1]
- %if %1 == 4
- vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1
- vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3
- %endif
- %if %%i != %2/4-1
- %assign %%j 1
- %rep %1
- lea r%+%%j, [r%+%%j+4*t0]
- %assign %%j %%j+1
- %endrep
- %endif
- psadbw m2, m6
- psadbw m4, m7
- psadbw m3, m6
- psadbw m5, m7
- ACCUM paddd, 0, 2, %%i
- ACCUM paddd, 1, 3, %%i
- paddd m0, m4
- paddd m1, m5
- %assign %%i %%i+1
- %endrep
- %if %1 == 4
- movifnidn t2, r6mp
- %else
- movifnidn t2, r5mp
- %endif
- packusdw m0, m1
- vextracti128 xm1, m0, 1
- paddd xm0, xm1
- mova [t2], xm0
- RET
- %endmacro
- %macro SAD_X_W16_AVX512 2 ; x, h
- cglobal pixel_sad_x%1_16x%2, %1+2,%1+3
- lea t1, [3*t0]
- %assign %%i 0
- %rep %2/4
- mova m6, [r0+%%i*64] ; s0 s1 s2 s3
- movu xm2, [r3]
- movu xm4, [r3+t0]
- %if %1 == 4
- vinserti128 ym2, [r4+t0], 1
- vinserti128 ym4, [r4], 1
- %endif
- vinserti32x4 m2, [r1+2*t0], 2
- vinserti32x4 m4, [r1+t1], 2
- vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3
- vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2
- vpermq m7, m6, q1032 ; s1 s0 s3 s2
- movu xm3, [r1]
- movu xm5, [r1+t0]
- vinserti128 ym3, [r2+t0], 1
- vinserti128 ym5, [r2], 1
- vinserti32x4 m3, [r3+2*t0], 2
- vinserti32x4 m5, [r3+t1], 2
- %if %1 == 4
- vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3
- vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2
- %endif
- %if %%i != %2/4-1
- %assign %%j 1
- %rep %1
- lea r%+%%j, [r%+%%j+4*t0]
- %assign %%j %%j+1
- %endrep
- %endif
- psadbw m2, m6
- psadbw m4, m7
- psadbw m3, m6
- psadbw m5, m7
- ACCUM paddd, 0, 2, %%i
- ACCUM paddd, 1, 3, %%i
- paddd m0, m4
- paddd m1, m5
- %assign %%i %%i+1
- %endrep
- %if %1 == 4
- movifnidn t2, r6mp
- %else
- movifnidn t2, r5mp
- %endif
- mov t1d, 0x1111
- kmovw k1, t1d
- vshufi32x4 m0, m0, q1032
- paddd m0, m1
- punpckhqdq m1, m0, m0
- paddd m0, m1
- vpcompressd m0 {k1}{z}, m0
- mova [t2], xm0
- RET
- %endmacro
- ; t0 = stride, t1 = tmp/stride3, t2 = scores
- %if WIN64
- %define s1 xmm16 ; using xmm6 and xmm7 would reduce code size, but
- %define s2 xmm17 ; they're callee-saved on win64
- DECLARE_REG_TMP 4, 6, 0
- %else
- %define s1 xmm6
- %define s2 xmm7
- %if ARCH_X86_64
- DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64
- %else
- DECLARE_REG_TMP 4, 5, 0
- %endif
- %endif
- INIT_YMM avx512
- SAD_X_W4_AVX512 3, 4 ; x3_4x4
- SAD_X_W4_AVX512 3, 8 ; x3_4x8
- SAD_X_W8_AVX512 3, 4 ; x3_8x4
- SAD_X_W8_AVX512 3, 8 ; x3_8x8
- SAD_X_W8_AVX512 3, 16 ; x3_8x16
- INIT_ZMM avx512
- SAD_X_W16_AVX512 3, 8 ; x3_16x8
- SAD_X_W16_AVX512 3, 16 ; x3_16x16
- DECLARE_REG_TMP 5, 6, 0
- INIT_YMM avx512
- SAD_X_W4_AVX512 4, 4 ; x4_4x4
- SAD_X_W4_AVX512 4, 8 ; x4_4x8
- SAD_X_W8_AVX512 4, 4 ; x4_8x4
- SAD_X_W8_AVX512 4, 8 ; x4_8x8
- SAD_X_W8_AVX512 4, 16 ; x4_8x16
- INIT_ZMM avx512
- SAD_X_W16_AVX512 4, 8 ; x4_16x8
- SAD_X_W16_AVX512 4, 16 ; x4_16x16
- ;=============================================================================
- ; SAD cacheline split
- ;=============================================================================
- ; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
- ; unless the unaligned data spans the border between 2 cachelines, in which
- ; case it's really slow. The exact numbers may differ, but all Intel cpus prior
- ; to Nehalem have a large penalty for cacheline splits.
- ; (8-byte alignment exactly half way between two cachelines is ok though.)
- ; LDDQU was supposed to fix this, but it only works on Pentium 4.
- ; So in the split case we load aligned data and explicitly perform the
- ; alignment between registers. Like on archs that have only aligned loads,
- ; except complicated by the fact that PALIGNR takes only an immediate, not
- ; a variable alignment.
- ; It is also possible to hoist the realignment to the macroblock level (keep
- ; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
- ; needed for that method often makes it slower.
- ; sad 16x16 costs on Core2:
- ; good offsets: 49 cycles (50/64 of all mvs)
- ; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
- ; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
- ; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
- ; computed jump assumes this loop is exactly 80 bytes
- %macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
- ALIGN 16
- sad_w16_align%1_sse2:
- movdqa xmm1, [r2+16]
- movdqa xmm2, [r2+r3+16]
- movdqa xmm3, [r2]
- movdqa xmm4, [r2+r3]
- pslldq xmm1, 16-%1
- pslldq xmm2, 16-%1
- psrldq xmm3, %1
- psrldq xmm4, %1
- por xmm1, xmm3
- por xmm2, xmm4
- psadbw xmm1, [r0]
- psadbw xmm2, [r0+r1]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- dec r4
- jg sad_w16_align%1_sse2
- ret
- %endmacro
- ; computed jump assumes this loop is exactly 64 bytes
- %macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
- ALIGN 16
- sad_w16_align%1_ssse3:
- movdqa xmm1, [r2+16]
- movdqa xmm2, [r2+r3+16]
- palignr xmm1, [r2], %1
- palignr xmm2, [r2+r3], %1
- psadbw xmm1, [r0]
- psadbw xmm2, [r0+r1]
- paddw xmm0, xmm1
- paddw xmm0, xmm2
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
- dec r4
- jg sad_w16_align%1_ssse3
- ret
- %endmacro
- %macro SAD16_CACHELINE_FUNC 2 ; cpu, height
- cglobal pixel_sad_16x%2_cache64_%1
- mov eax, r2m
- and eax, 0x37
- cmp eax, 0x30
- jle pixel_sad_16x%2_sse2
- PROLOGUE 4,6
- mov r4d, r2d
- and r4d, 15
- %ifidn %1, ssse3
- shl r4d, 6 ; code size = 64
- %else
- lea r4, [r4*5]
- shl r4d, 4 ; code size = 80
- %endif
- %define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
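- ; sad_w16_addr extrapolates backwards to a virtual align0 entry point (align1 - loop_size; no
- ; align0 loop is emitted since unsplit loads take the early-out above), so adding
- ; r4 = misalignment*loop_size selects the loop for the current alignment.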
- %ifdef PIC
- lea r5, [sad_w16_addr]
- add r5, r4
- %else
- lea r5, [sad_w16_addr + r4]
- %endif
- and r2, ~15
- mov r4d, %2/2
- pxor xmm0, xmm0
- call r5
- MOVHL xmm1, xmm0
- paddw xmm0, xmm1
- movd eax, xmm0
- RET
- %endmacro
- %macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
- mov eax, r2m
- and eax, 0x17|%1|(%4>>1)
- cmp eax, 0x10|%1|(%4>>1)
- jle pixel_sad_%1x%2_mmx2
- and eax, 7
- shl eax, 3
- movd mm6, [sw_64]
- movd mm7, eax
- psubw mm6, mm7
- PROLOGUE 4,5
- and r2, ~7
- mov r4d, %3
- pxor mm0, mm0
- %endmacro
- %macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
- cglobal pixel_sad_16x%1_cache%2_mmx2
- SAD_CACHELINE_START_MMX2 16, %1, %1, %2
- .loop:
- movq mm1, [r2]
- movq mm2, [r2+8]
- movq mm3, [r2+16]
- movq mm4, mm2
- psrlq mm1, mm7
- psllq mm2, mm6
- psllq mm3, mm6
- psrlq mm4, mm7
- por mm1, mm2
- por mm3, mm4
- psadbw mm1, [r0]
- psadbw mm3, [r0+8]
- paddw mm0, mm1
- paddw mm0, mm3
- add r2, r3
- add r0, r1
- dec r4
- jg .loop
- movd eax, mm0
- RET
- %endmacro
- %macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
- cglobal pixel_sad_8x%1_cache%2_mmx2
- SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
- .loop:
- movq mm1, [r2+8]
- movq mm2, [r2+r3+8]
- movq mm3, [r2]
- movq mm4, [r2+r3]
- psllq mm1, mm6
- psllq mm2, mm6
- psrlq mm3, mm7
- psrlq mm4, mm7
- por mm1, mm3
- por mm2, mm4
- psadbw mm1, [r0]
- psadbw mm2, [r0+r1]
- paddw mm0, mm1
- paddw mm0, mm2
- lea r2, [r2+2*r3]
- lea r0, [r0+2*r1]
- dec r4
- jg .loop
- movd eax, mm0
- RET
- %endmacro
- ; sad_x3/x4_cache64: check each mv.
- ; if they're all within a cacheline, use normal sad_x3/x4.
- ; otherwise, send them individually to sad_cache64.
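- ; CHECK_SPLIT branches to .split iff a %2-byte-wide load from %1 would straddle a %3-byte
- ; cacheline boundary at an offset that isn't 8-byte aligned (the 8-aligned case is fine, per the
- ; note above).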
- %macro CHECK_SPLIT 3 ; pix, width, cacheline
- mov eax, %1
- and eax, 0x17|%2|(%3>>1)
- cmp eax, 0x10|%2|(%3>>1)
- jg .split
- %endmacro
- %macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
- cglobal pixel_sad_x3_%1x%2_cache%3_%6
- CHECK_SPLIT r1m, %1, %3
- CHECK_SPLIT r2m, %1, %3
- CHECK_SPLIT r3m, %1, %3
- jmp pixel_sad_x3_%1x%2_%4
- .split:
- %if ARCH_X86_64
- PROLOGUE 6,9
- push r3
- push r2
- %if WIN64
- movsxd r4, r4d
- sub rsp, 40 ; shadow space and alignment
- %endif
- mov r2, r1
- mov r1, FENC_STRIDE
- mov r3, r4
- mov r7, r0
- mov r8, r5
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8], eax
- %if WIN64
- mov r2, [rsp+40+0*8]
- %else
- pop r2
- %endif
- mov r0, r7
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8+4], eax
- %if WIN64
- mov r2, [rsp+40+1*8]
- %else
- pop r2
- %endif
- mov r0, r7
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8+8], eax
- %if WIN64
- add rsp, 40+2*8
- %endif
- RET
- %else
- push edi
- mov edi, [esp+28]
- push dword [esp+24]
- push dword [esp+16]
- push dword 16
- push dword [esp+20]
- call pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+32]
- mov [edi], eax
- mov [esp+8], ecx
- call pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+36]
- mov [edi+4], eax
- mov [esp+8], ecx
- call pixel_sad_%1x%2_cache%3_%5
- mov [edi+8], eax
- add esp, 16
- pop edi
- ret
- %endif
- %endmacro
- %macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
- cglobal pixel_sad_x4_%1x%2_cache%3_%6
- CHECK_SPLIT r1m, %1, %3
- CHECK_SPLIT r2m, %1, %3
- CHECK_SPLIT r3m, %1, %3
- CHECK_SPLIT r4m, %1, %3
- jmp pixel_sad_x4_%1x%2_%4
- .split:
- %if ARCH_X86_64
- PROLOGUE 6,9
- mov r8, r6mp
- push r4
- push r3
- push r2
- %if WIN64
- sub rsp, 32 ; shadow space
- %endif
- mov r2, r1
- mov r1, FENC_STRIDE
- mov r3, r5
- mov r7, r0
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8], eax
- %if WIN64
- mov r2, [rsp+32+0*8]
- %else
- pop r2
- %endif
- mov r0, r7
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8+4], eax
- %if WIN64
- mov r2, [rsp+32+1*8]
- %else
- pop r2
- %endif
- mov r0, r7
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8+8], eax
- %if WIN64
- mov r2, [rsp+32+2*8]
- %else
- pop r2
- %endif
- mov r0, r7
- call pixel_sad_%1x%2_cache%3_%5
- mov [r8+12], eax
- %if WIN64
- add rsp, 32+3*8
- %endif
- RET
- %else
- push edi
- mov edi, [esp+32]
- push dword [esp+28]
- push dword [esp+16]
- push dword 16
- push dword [esp+20]
- call pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+32]
- mov [edi], eax
- mov [esp+8], ecx
- call pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+36]
- mov [edi+4], eax
- mov [esp+8], ecx
- call pixel_sad_%1x%2_cache%3_%5
- mov ecx, [esp+40]
- mov [edi+8], eax
- mov [esp+8], ecx
- call pixel_sad_%1x%2_cache%3_%5
- mov [edi+12], eax
- add esp, 16
- pop edi
- ret
- %endif
- %endmacro
- %macro SADX34_CACHELINE_FUNC 1+
- SADX3_CACHELINE_FUNC %1
- SADX4_CACHELINE_FUNC %1
- %endmacro
- ; instantiate the aligned sads
- INIT_MMX
- %if ARCH_X86_64 == 0
- SAD16_CACHELINE_FUNC_MMX2 8, 32
- SAD16_CACHELINE_FUNC_MMX2 16, 32
- SAD8_CACHELINE_FUNC_MMX2 4, 32
- SAD8_CACHELINE_FUNC_MMX2 8, 32
- SAD8_CACHELINE_FUNC_MMX2 16, 32
- SAD16_CACHELINE_FUNC_MMX2 8, 64
- SAD16_CACHELINE_FUNC_MMX2 16, 64
- %endif ; !ARCH_X86_64
- SAD8_CACHELINE_FUNC_MMX2 4, 64
- SAD8_CACHELINE_FUNC_MMX2 8, 64
- SAD8_CACHELINE_FUNC_MMX2 16, 64
- %if ARCH_X86_64 == 0
- SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
- SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
- SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
- SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2
- SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
- SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2
- %endif ; !ARCH_X86_64
- SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
- SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
- %if ARCH_X86_64 == 0
- SAD16_CACHELINE_FUNC sse2, 8
- SAD16_CACHELINE_FUNC sse2, 16
- %assign i 1
- %rep 15
- SAD16_CACHELINE_LOOP_SSE2 i
- %assign i i+1
- %endrep
- SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
- SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
- %endif ; !ARCH_X86_64
- SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2
- SAD16_CACHELINE_FUNC ssse3, 8
- SAD16_CACHELINE_FUNC ssse3, 16
- %assign i 1
- %rep 15
- SAD16_CACHELINE_LOOP_SSSE3 i
- %assign i i+1
- %endrep
- SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
- SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3
|