;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2018 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION .text
- cextern pw_1
- cextern pw_4
- cextern pw_8
- ;=============================================================================
- ; SAD MMX
- ;=============================================================================
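
; The SAD_INC_*P_MMX macros below each consume one row (1x16P) or two rows
; (2x8P, 2x4P) of 16-bit pixels: load from r0, subtract the reference block
; at r2, take absolute values with ABSW/ABSW2, and accumulate per-lane word
; sums in m0.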
%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro
%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro
%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
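; A scalar sketch of what pixel_sad_NxM computes (argument names assumed for
; illustration; strides are in pixels, which is why the assembly scales them
; by 2 at high bit depth):
;     int sad = 0;
;     for( int y = 0; y < M; y++, pix1 += i_pix1, pix2 += i_pix2 )
;         for( int x = 0; x < N; x++ )
;             sad += abs( pix1[x] - pix2[x] );
;     return sad;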
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
%if %1*%2 == 256
    HADDUW  m0, m1 ; 16x16: per-lane word sums can exceed the signed 16-bit range at 10-bit depth
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro
INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16,  8, 1
SAD_MMX  8, 16, 2
SAD_MMX  8,  8, 2
SAD_MMX  8,  4, 2
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2
INIT_MMX ssse3
SAD_MMX  4,  8, 2
SAD_MMX  4,  4, 2

;=============================================================================
; SAD XMM
;=============================================================================
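
; SAD_INC_2ROW processes two rows per call.  When a row does not fit in one
; register (2*width > mmsize, i.e. 16-wide blocks with xmm) it uses two loads
; per row; otherwise a single register covers the whole row.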
%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endif
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif
    HADDW   m0, m1
    movd    eax, xm0
    RET
%endmacro
INIT_XMM sse2
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
INIT_XMM sse2, aligned
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
INIT_XMM ssse3
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
SAD  8,  4
INIT_XMM ssse3, aligned
SAD 16, 16
SAD 16,  8
SAD  8, 16
SAD  8,  8
INIT_YMM avx2
SAD 16, 16
SAD 16,  8

;=============================================================================
; SAD x3/x4
;=============================================================================
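
; The x3/x4 variants score one block of encoded pixels (r0, fixed FENC_STRIDE)
; against three or four candidate blocks that share a single stride, writing
; one sum per candidate to the scores array.  r0 therefore advances by a
; constant "add" while the candidate pointers advance by the stride register.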
%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
%endmacro
%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    paddw   m0, m3
    paddw   m1, m4
    paddw   m2, m5
%endmacro
%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDW   m0, m3
    HADDW   m1, m4
    HADDW   m2, m5
%endif
%if UNIX64
    movd [r5+0], xm0
    movd [r5+4], xm1
    movd [r5+8], xm2
%else
    mov     r0, r5mp
    movd [r0+0], xm0
    movd [r0+4], xm1
    movd [r0+8], xm2
%endif
    RET
%endmacro
%macro SAD_X4_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
%endmacro

%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    movu    m3, [r4]
    psubw   m0, m4
    psubw   m1, m4
    psubw   m2, m4
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
%endmacro
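
; SAD_X4_ONE has three code paths chosen by register pressure: with more than
; 8 SIMD registers all four candidates are handled in parallel; with only 8
; registers the SSSE3 path reloads the encoded block from memory so m4 can be
; recycled for the fourth candidate, and the generic path splits the work
; into two pairs.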
%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    ABSW2   m7, m8, m7, m8, m9, m10
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m8
%elif cpuflag(ssse3)
    movu    m7, [r3+%2]
    psubw   m5, m4
    psubw   m6, m4
    psubw   m7, m4
    movu    m4, [r4+%2]
    pabsw   m5, m5
    psubw   m4, [r0+%1]
    pabsw   m6, m6
    pabsw   m7, m7
    pabsw   m4, m4
    paddw   m0, m5
    paddw   m1, m6
    paddw   m2, m7
    paddw   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    psubw   m6, m4
    ABSW    m5, m5, m7
    ABSW    m6, m6, m7
    paddw   m0, m5
    paddw   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4
    paddw   m2, m5
    paddw   m3, m6
%endif
%endmacro
%macro SAD_X4_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m4
    HADDUW  m1, m5
    HADDUW  m2, m6
    HADDUW  m3, m7
%else
    HADDW   m0, m4
    HADDW   m1, m5
    HADDW   m2, m6
    HADDW   m3, m7
%endif
    mov     r0, r6mp
    movd [r0+ 0], xm0
    movd [r0+ 4], xm1
    movd [r0+ 8], xm2
    movd [r0+12], xm3
    RET
%endmacro
%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro
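
; pixel_vsad sums the absolute differences between each pair of vertically
; adjacent rows of a 16-pixel-wide block; r2 holds the row count.  A rough
; scalar equivalent (argument names assumed for illustration):
;     int vsad = 0;
;     for( int y = 0; y < height-1; y++, src += i_stride )
;         for( int x = 0; x < 16; x++ )
;             vsad += abs( src[x] - src[x+i_stride] );
;     return vsad;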
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub    r2d, 2
    je .end
.loop:
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m2, m4
    psubw   m3, m5
    psubw   m4, m6
    psubw   m5, m7
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    ABSW    m4, m4, m1
    ABSW    m5, m5, m1
    paddw   m0, m2
    paddw   m0, m3
    paddw   m0, m4
    paddw   m0, m5
    mova    m2, m6
    mova    m3, m7
    sub    r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd   eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD
INIT_YMM avx2
cglobal pixel_vsad, 3,3
    mova    m0, [r0]
    mova    m1, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m0, m1
    pabsw   m0, m0
    sub    r2d, 2
    je .end
.loop:
    mova    m2, [r0]
    mova    m3, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m1, m2
    psubw   m2, m3
    pabsw   m1, m1
    pabsw   m2, m2
    paddw   m0, m1
    paddw   m0, m2
    mova    m1, m3
    sub    r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1
%else
    HADDUW  m0, m1
%endif
    movd   eax, xm0
    RET

;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
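; Assumed scalar behaviour of the x3 form (the x4 form adds pix3/scores[3]):
;     scores[0] = scores[1] = scores[2] = 0;
;     for( int y = 0; y < H; y++ )
;         for( int x = 0; x < W; x++ )
;         {
;             scores[0] += abs( fenc[x] - pix0[x] );
;             scores[1] += abs( fenc[x] - pix1[x] );
;             scores[2] += abs( fenc[x] - pix2[x] );
;         }
;     // fenc advances by FENC_STRIDE per row, pix0..pix2 by i_stride.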
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro
INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_MMX ssse3
SAD_X 3,  4,  8
SAD_X 3,  4,  4
SAD_X 4,  4,  8
SAD_X 4,  4,  4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 11
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_XMM xop
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
SAD_X 3,  8, 16
SAD_X 3,  8,  8
SAD_X 3,  8,  4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8
SAD_X 4,  8, 16
SAD_X 4,  8,  8
SAD_X 4,  8,  4
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16,  8
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16,  8

;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
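; Computes the SAD of the 4x4 fenc block against the vertical, horizontal and
; DC intra predictions built from the reconstructed neighbours in fdec, and
; stores the three costs as res[0]=V, res[1]=H, res[2]=DC.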
%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
%if cpuflag(ssse3)
    movddup   m0, [r1-1*FDEC_STRIDEB]
%else
    movq      m0, [r1-1*FDEC_STRIDEB]
    punpcklqdq m0, m0
%endif
    movq      m1, [r0+0*FENC_STRIDEB]
    movq      m2, [r0+2*FENC_STRIDEB]
    pshuflw   m6, m0, q1032
    paddw     m6, m0
    pshuflw   m5, m6, q2301
    paddw     m6, m5
    punpcklqdq m6, m6 ; A+B+C+D 8 times
    movhps    m1, [r0+1*FENC_STRIDEB]
    movhps    m2, [r0+3*FENC_STRIDEB]
    psubw     m3, m1, m0
    psubw     m0, m2
    ABSW2     m3, m0, m3, m0, m4, m5
    paddw     m0, m3
    movd      m3, [r1+0*FDEC_STRIDEB-4]
    movd      m4, [r1+2*FDEC_STRIDEB-4]
    movhps    m3, [r1+1*FDEC_STRIDEB-8]
    movhps    m4, [r1+3*FDEC_STRIDEB-8]
    pshufhw   m3, m3, q3333
    pshufhw   m4, m4, q3333
    pshuflw   m3, m3, q1111 ; FF FF EE EE
    pshuflw   m4, m4, q1111 ; HH HH GG GG
    paddw     m5, m3, m4
    paddw     m6, [pw_4]
    paddw     m6, m5
    pshufd    m5, m5, q1032
    paddw     m5, m6
    psrlw     m5, 3
    psubw     m6, m5, m2
    psubw     m5, m1
    psubw     m1, m3
    psubw     m2, m4
    ABSW2     m5, m6, m5, m6, m3, m4
    ABSW2     m1, m2, m1, m2, m3, m4
    paddw     m5, m6
    paddw     m1, m2
%if cpuflag(ssse3)
    phaddw    m0, m1
    movhlps   m3, m5
    paddw     m5, m3
    phaddw    m0, m5
    pmaddwd   m0, [pw_1]
    mova    [r2], m0
%else
    HADDW     m0, m3
    HADDW     m1, m3
    HADDW     m5, m3
    movd    [r2], m0 ; V prediction cost
    movd  [r2+4], m1 ; H prediction cost
    movd  [r2+8], m5 ; DC prediction cost
%endif
    RET
%endmacro
INIT_XMM sse2
INTRA_SAD_X3_4x4
INIT_XMM ssse3
INTRA_SAD_X3_4x4
INIT_XMM avx
INTRA_SAD_X3_4x4

;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------
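; The DC prediction is (sum of the 8 left neighbours loaded from edge[7..14]
; plus the 8 top neighbours from edge[16..23] + 8) >> 4, broadcast to all
; lanes; V uses the top row directly and H replicates one left neighbour per
; row.  Costs are written as res[0]=V, res[1]=H, res[2]=DC.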
;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = temp
;m4 = pixel row
%macro INTRA_SAD_HVDC_ITER 2
    mova      m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw     m4, m0
    ABSW      m4, m4, m5
    ACCUM  paddw, 1, 4, %1
    mova      m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw     m4, m6
    ABSW      m4, m4, m5
    ACCUM  paddw, 2, 4, %1
    pshufd    m5, m7, %2
    psubw     m5, [r0+(%1-4)*FENC_STRIDEB]
    ABSW      m5, m5, m4
    ACCUM  paddw, 3, 5, %1
%endmacro
%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
    add       r0, 4*FENC_STRIDEB
    movu      m0, [r1+7*SIZEOF_PIXEL]
    mova      m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    mova      m7, m0
    paddw     m0, m6
    punpckhwd m7, m7
    HADDW     m0, m4
    paddw     m0, [pw_8]
    psrlw     m0, 4
    SPLATW    m0, m0
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    movq      m7, [r1+7*SIZEOF_PIXEL]
    punpcklwd m7, m7
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
%if cpuflag(ssse3)
    phaddw    m2, m3 ; 2 2 2 2 3 3 3 3
    movhlps   m3, m1
    paddw     m1, m3 ; 1 1 1 1 _ _ _ _
    phaddw    m2, m1 ; 2 2 3 3 1 1 _ _
    pmaddwd   m2, [pw_1] ; 2 3 1 _
    mova    [r2], m2
%else
    HADDW     m2, m4
    HADDW     m3, m4
    HADDW     m1, m4
    movd  [r2+0], m2
    movd  [r2+4], m3
    movd  [r2+8], m1
%endif
    RET
%endmacro
INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8
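
; The AVX2 version below evaluates two rows per iteration: the low 128-bit
; lane of m4 holds row %1 of the block and the high lane holds row %1+4, so
; four iterations cover the whole 8x8 block.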
%macro INTRA_SAD_HVDC_ITER_YMM 2
    mova      xm4, [r0+(%1-4)*FENC_STRIDEB]
    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
    pshufd     m5, m7, %2
    psubw      m5, m4
    pabsw      m5, m5
    ACCUM   paddw, 2, 5, %1 ; H
    psubw      m5, m4, m6
    psubw      m4, m0
    pabsw      m5, m5
    pabsw      m4, m4
    ACCUM   paddw, 1, 5, %1 ; V
    ACCUM   paddw, 3, 4, %1 ; DC
%endmacro
INIT_YMM avx2
cglobal intra_sad_x3_8x8, 3,3,8
    add        r0, 4*FENC_STRIDEB
    movu      xm0, [r1+7*SIZEOF_PIXEL]
    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    vpermq     m7, m0, q0011
    paddw     xm0, xm6
    paddw     xm0, [pw_1] ; equal to +8 after HADDW
    HADDW     xm0, xm4
    psrld     xm0, 4
    vpbroadcastw m0, xm0
    punpcklwd  m7, m7
    INTRA_SAD_HVDC_ITER_YMM 0, q3333
    INTRA_SAD_HVDC_ITER_YMM 1, q2222
    INTRA_SAD_HVDC_ITER_YMM 2, q1111
    INTRA_SAD_HVDC_ITER_YMM 3, q0000
    phaddw     m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
    punpckhqdq m2, m3, m3
    paddw      m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
    phaddw     m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
    vextracti128 xm2, m1, 1
    paddw     xm1, xm2 ; 1 1 2 2 3 3 _ _
    pmaddwd   xm1, [pw_1] ; 1 2 3 _
    mova     [r2], xm1
    RET