;*****************************************************************************
;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2018 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 64

; AVX-512 permutation indices are bit-packed to save cache
%if HIGH_BIT_DEPTH
scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3:   4x4_frame
                   dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8:   8x8_frame1
                   dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13:  8x8_frame2
                   dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
                                                                     ; bits 19-23: 8x8_frame4
scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4:   8x8_field1
                   dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9:   8x8_field2
                   dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
                   dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:   interleave1
                   dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9:   interleave2
                   dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
                   dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
dct_avx512:        dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4:      dct8x8_fenc    bits 5-9:      dct8x8_fdec
                   dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13:    dct16x16_fenc  bits 14-18:    dct16x16_fdec
                   dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1  bits(e) 28-31: idct8x8_idct2
                   dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3:   4x4_frame
                   dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9:   8x8_frame1
                   dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
                   dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5:  8x8_field1
                   dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2
                   dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
                   dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5:  interleave1
                   dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2
                   dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
                   dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
%endif

pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15

pb_scan8framet1: SHUFFLE_MASK_W 0, 1, 6, 7, 8, 9, 13, 14
pb_scan8framet2: SHUFFLE_MASK_W 2, 3, 4, 7, 9, 15, 10, 14
pb_scan8framet3: SHUFFLE_MASK_W 0, 1, 5, 6, 8, 11, 12, 13
pb_scan8framet4: SHUFFLE_MASK_W 0, 3, 4, 5, 8, 11, 12, 15
pb_scan8framet5: SHUFFLE_MASK_W 1, 2, 6, 7, 9, 10, 13, 14
pb_scan8framet6: SHUFFLE_MASK_W 0, 3, 4, 5, 10, 11, 12, 15
pb_scan8framet7: SHUFFLE_MASK_W 1, 2, 6, 7, 8, 9, 14, 15
pb_scan8framet8: SHUFFLE_MASK_W 0, 1, 2, 7, 8, 10, 11, 14
pb_scan8framet9: SHUFFLE_MASK_W 1, 4, 5, 7, 8, 13, 14, 15

pb_scan8frame1: SHUFFLE_MASK_W 0, 8, 1, 2, 9, 12, 4, 13
pb_scan8frame2: SHUFFLE_MASK_W 4, 0, 1, 5, 8, 10, 12, 14
pb_scan8frame3: SHUFFLE_MASK_W 12, 10, 8, 6, 2, 3, 7, 9
pb_scan8frame4: SHUFFLE_MASK_W 0, 1, 8, 12, 4, 13, 9, 2
pb_scan8frame5: SHUFFLE_MASK_W 5, 14, 10, 3, 11, 15, 6, 7
pb_scan8frame6: SHUFFLE_MASK_W 6, 8, 12, 13, 9, 7, 5, 3
pb_scan8frame7: SHUFFLE_MASK_W 1, 3, 5, 7, 10, 14, 15, 11
pb_scan8frame8: SHUFFLE_MASK_W 10, 3, 11, 14, 5, 6, 15, 7

pb_scan8field1 : SHUFFLE_MASK_W 0, 1, 2, 8, 9, 3, 4, 10
pb_scan8field2a: SHUFFLE_MASK_W 0x80, 11, 5, 6, 7, 12,0x80,0x80
pb_scan8field2b: SHUFFLE_MASK_W 0,0x80,0x80,0x80,0x80,0x80, 1, 8
pb_scan8field3a: SHUFFLE_MASK_W 10, 5, 6, 7, 11,0x80,0x80,0x80
pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80, 1, 8, 2
pb_scan8field4a: SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80,0x80
pb_scan8field6 : SHUFFLE_MASK_W 4, 5, 6, 7, 11,0x80,0x80, 12
pb_scan8field7 : SHUFFLE_MASK_W 5, 6, 7, 11,0x80,0x80, 12, 13

SECTION .text

cextern pw_32_0
cextern pw_32
cextern pw_512
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
cextern pb_1
cextern pw_1
cextern pd_1
cextern pd_32
cextern pw_ppppmmmm
cextern pw_pmpmpmpm
cextern deinterleave_shufd
cextern pb_unpackbd1
cextern pb_unpackbd2

%macro WALSH4_1D 6
    SUMSUB_BADC %1, %5, %4, %3, %2, %6
    SUMSUB_BADC %1, %5, %3, %4, %2, %6
    SWAP %2, %5, %4
%endmacro
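
; WALSH4_1D performs one 4-point Walsh-Hadamard pass over four registers
; (the trailing SWAP fixes the output register order). A scalar sketch of
; the butterfly it implements, in one possible output ordering (an
; illustrative helper, not x264's reference C code; dctcoef is x264's
; coefficient type, int16_t or int32_t at high bit depth):
;
;   static void walsh4_1d( dctcoef *d0, dctcoef *d1, dctcoef *d2, dctcoef *d3 )
;   {
;       dctcoef s01 = *d0 + *d1, d01 = *d0 - *d1;
;       dctcoef s23 = *d2 + *d3, d23 = *d2 - *d3;
;       *d0 = s01 + s23;
;       *d1 = s01 - s23;
;       *d2 = d01 - d23;
;       *d3 = d01 + d23;
;   }
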
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
    movq m%3, m%4
    pxor m%1, m%4
    psubw m%3, m%2
    pxor m%2, m%4
    pavgw m%3, m%1
    pavgw m%2, m%1
    pxor m%3, m%4
    pxor m%2, m%4
    SWAP %1, %2, %3
%endmacro
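
; SUMSUB_17BIT computes rounded, halved sums and differences of two signed
; 16-bit vectors without overflowing at 17 bits: the xors with 0x8000 bias
; the values into unsigned range, pavgw evaluates (x+y+1)>>1 with an
; internal 17-bit accumulator, and a final xor removes the bias. Per lane,
; roughly (illustrative sketch only):
;
;   static void sumsub_17bit( int16_t *a, int16_t *b )
;   {
;       int sum  = (int)*a + *b;  // may need 17 bits
;       int diff = (int)*a - *b;
;       *a = (sum  + 1) >> 1;     // rounded halving, no wraparound
;       *b = (diff + 1) >> 1;
;   }
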
%macro DCT_UNPACK 3
    punpcklwd %3, %1
    punpckhwd %2, %1
    psrad %3, 16
    psrad %2, 16
    SWAP %1, %3
%endmacro

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
%macro DCT4x4_DC 0
cglobal dct4x4dc, 1,1,5
    mova m0, [r0+ 0]
    mova m1, [r0+16]
    mova m2, [r0+32]
    mova m3, [r0+48]
    WALSH4_1D d, 0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_1]
    WALSH4_1D d, 0,1,2,3,4
    psrad m0, 1
    psrad m1, 1
    psrad m2, 1
    psrad m3, 1
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; DCT4x4_DC

INIT_XMM sse2
DCT4x4_DC
INIT_XMM avx
DCT4x4_DC
%else

INIT_MMX mmx2
cglobal dct4x4dc, 1,1
    movq m3, [r0+24]
    movq m2, [r0+16]
    movq m1, [r0+ 8]
    movq m0, [r0+ 0]
    movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
    WALSH4_1D w, 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 1, 0, 3, 2, 4
    SWAP 0, 1
    SWAP 2, 3
    SUMSUB_17BIT 0,2,4,7
    SUMSUB_17BIT 1,3,5,7
    movq [r0+0], m0
    movq [r0+8], m2
    movq [r0+16], m3
    movq [r0+24], m1
    RET
%endif ; HIGH_BIT_DEPTH
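
; Both bit-depth variants of dct4x4dc implement the same thing: a 4x4
; Hadamard transform of the 16 luma DC coefficients with a rounded >>1
; normalization on the way out. The mmx2 path folds the rounding into
; SUMSUB_17BIT; the sse2/avx path adds pd_1 to row 0 before the second
; pass, which (row 0 having a +1 coefficient in every Hadamard output)
; rounds all 16 results. A scalar sketch using the walsh4_1d helper above
; (illustrative, not the reference implementation):
;
;   static void dct4x4dc_sketch( dctcoef d[4][4] )
;   {
;       for( int i = 0; i < 4; i++ )  // horizontal pass
;           walsh4_1d( &d[i][0], &d[i][1], &d[i][2], &d[i][3] );
;       for( int j = 0; j < 4; j++ )  // vertical pass + rounded >>1
;       {
;           walsh4_1d( &d[0][j], &d[1][j], &d[2][j], &d[3][j] );
;           for( int i = 0; i < 4; i++ )
;               d[i][j] = ( d[i][j] + 1 ) >> 1;
;       }
;   }
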
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
    mova m3, [r0+48]
    mova m2, [r0+32]
    mova m1, [r0+16]
    mova m0, [r0+ 0]
    WALSH4_1D d,0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    WALSH4_1D d,0,1,2,3,4
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; IDCT4x4DC

INIT_XMM sse2
IDCT4x4DC
INIT_XMM avx
IDCT4x4DC
%else
;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal idct4x4dc, 1,1
    movq m3, [r0+24]
    movq m2, [r0+16]
    movq m1, [r0+ 8]
    movq m0, [r0+ 0]
    WALSH4_1D w,0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D w,0,1,2,3,4
    movq [r0+ 0], m0
    movq [r0+ 8], m1
    movq [r0+16], m2
    movq [r0+24], m3
    RET
%endif ; HIGH_BIT_DEPTH
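
; idct4x4dc is the same row/column Hadamard as dct4x4dc but with no rounding
; shift: for the inverse, the normalization is folded into dequantization,
; so the two WALSH4_1D passes here are exact.
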
;-----------------------------------------------------------------------------
; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
;-----------------------------------------------------------------------------
%if WIN64
DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
%else
DECLARE_REG_TMP 2
%endif

%macro INSERT_COEFF 3 ; dst, src, imm
%if %3
%if HIGH_BIT_DEPTH
%if cpuflag(sse4)
    pinsrd %1, %2, %3
%elif %3 == 2
    movd m2, %2
%elif %3 == 1
    punpckldq %1, %2
%else
    punpckldq m2, %2
    punpcklqdq %1, m2
%endif
%else
%if %3 == 2
    punpckldq %1, %2
%else
    pinsrw %1, %2, %3
%endif
%endif
%else
    movd %1, %2
%endif
%if HIGH_BIT_DEPTH
    mov %2, t0d
%else
    mov %2, t0w
%endif
%endmacro

%macro DCT2x4DC 2
cglobal dct2x4dc, 2,3
    xor t0d, t0d
    INSERT_COEFF m0, [r1+0*16*SIZEOF_DCTCOEF], 0
    INSERT_COEFF m0, [r1+1*16*SIZEOF_DCTCOEF], 2
    add r1, 4*16*SIZEOF_DCTCOEF
    INSERT_COEFF m0, [r1-2*16*SIZEOF_DCTCOEF], 1
    INSERT_COEFF m0, [r1-1*16*SIZEOF_DCTCOEF], 3
    INSERT_COEFF m1, [r1+0*16*SIZEOF_DCTCOEF], 0
    INSERT_COEFF m1, [r1+1*16*SIZEOF_DCTCOEF], 2
    INSERT_COEFF m1, [r1+2*16*SIZEOF_DCTCOEF], 1
    INSERT_COEFF m1, [r1+3*16*SIZEOF_DCTCOEF], 3
    SUMSUB_BA %1, 1, 0, 2
    SBUTTERFLY %2, 1, 0, 2
    SUMSUB_BA %1, 0, 1, 2
    SBUTTERFLY %2, 0, 1, 2
    SUMSUB_BA %1, 1, 0, 2
    pshuf%1 m0, m0, q1032
    mova [r0], m1
    mova [r0+mmsize], m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
DCT2x4DC d, dq
INIT_XMM avx
DCT2x4DC d, dq
%else
INIT_MMX mmx2
DCT2x4DC w, wd
%endif
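
; dct2x4dc gathers the DC coefficient of eight 4x4 sub-blocks and applies a
; 2x4 Hadamard. Note the side effect hidden in INSERT_COEFF: t0 is zeroed
; once up front, and each trailing "mov %2, t0" store clears the DC slot of
; the source block right after it is read. A scalar sketch of the
; gather/clear part (illustrative helper, not the reference code):
;
;   static void dct2x4dc_gather( dctcoef dct[8], dctcoef dct4x4[8][16] )
;   {
;       for( int i = 0; i < 8; i++ )
;       {
;           dct[i] = dct4x4[i][0]; // collect the eight DCs...
;           dct4x4[i][0] = 0;      // ...and zero them in place
;       }
;       /* the SUMSUB_BA/SBUTTERFLY sequence above then performs the
;          2x4 Hadamard over dct[0..7] */
;   }
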
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal sub4x4_dct, 3,3
.skip_prologue:
    LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 3, 0, 2, 1
    SUMSUB_BA w, 2, 3, 4
    DCT_UNPACK m2, m4, m5
    DCT_UNPACK m3, m6, m7
    mova [r0+ 0], m2 ; s03 + s12
    mova [r0+ 8], m4
    mova [r0+32], m3 ; s03 - s12
    mova [r0+40], m6
    DCT_UNPACK m0, m2, m4
    DCT_UNPACK m1, m3, m5
    SUMSUB2_AB d, 0, 1, 4
    SUMSUB2_AB d, 2, 3, 5
    mova [r0+16], m0 ; d03*2 + d12
    mova [r0+24], m2
    mova [r0+48], m4 ; d03 - 2*d12
    mova [r0+56], m5
    RET
%else

%macro SUB_DCT4 0
cglobal sub4x4_dct, 3,3
.skip_prologue:
%if cpuflag(ssse3)
    mova m5, [hsub_mul]
%endif
    LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    DCT4_1D 0,1,2,3,4
    movq [r0+ 0], m0
    movq [r0+ 8], m1
    movq [r0+16], m2
    movq [r0+24], m3
    RET
%endmacro

INIT_MMX mmx
SUB_DCT4
INIT_MMX ssse3
SUB_DCT4
%endif ; HIGH_BIT_DEPTH
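
; Both paths above compute the H.264 forward 4x4 core transform of the
; fenc-fdec residual: one DCT4_1D pass per dimension with a transpose in
; between. One 1-D pass in scalar form (the standard butterfly; it matches
; the s03/s12/d03/d12 store comments in the high-bit-depth path):
;
;   static void dct4_1d( dctcoef d[4] )
;   {
;       dctcoef s03 = d[0] + d[3], s12 = d[1] + d[2];
;       dctcoef d03 = d[0] - d[3], d12 = d[1] - d[2];
;       d[0] = s03 + s12;
;       d[1] = 2*d03 + d12;
;       d[2] = s03 - s12;
;       d[3] = d03 - 2*d12;
;   }
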
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
    psrad %1, 6
    psrad %2, 6
    packssdw %1, %2
    movq %3, %5
    movhps %3, %6
    paddsw %1, %3
    CLIPW %1, %4, [pw_pixel_max]
    movq %5, %1
    movhps %6, %1
%endmacro

%macro ADD4x4_IDCT 0
cglobal add4x4_idct, 2,2,6
    add r0, 2*FDEC_STRIDEB
.skip_prologue:
    mova m1, [r1+16]
    mova m3, [r1+48]
    mova m2, [r1+32]
    mova m0, [r1+ 0]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
    STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
    RET
%endmacro

INIT_XMM sse2
ADD4x4_IDCT
INIT_XMM avx
ADD4x4_IDCT
%else ; !HIGH_BIT_DEPTH

INIT_MMX mmx
cglobal add4x4_idct, 2,2
    pxor m7, m7
.skip_prologue:
    movq m1, [r1+ 8]
    movq m3, [r1+24]
    movq m2, [r1+16]
    movq m0, [r1+ 0]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
    RET
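
; add4x4_idct applies the H.264 inverse core transform (one IDCT4_1D pass
; per dimension, with +32 and >>6 rounding after the second pass), then adds
; the result to the predicted pixels with saturation. One 1-D inverse pass
; in scalar form (the standard butterfly; its >>1 terms are what the movsd
; and masked-shift tricks below implement):
;
;   static void idct4_1d( dctcoef d[4] )
;   {
;       dctcoef e0 = d[0] + d[2],        e1 = d[0] - d[2];
;       dctcoef e2 = (d[1] >> 1) - d[3], e3 = d[1] + (d[3] >> 1);
;       d[0] = e0 + e3;
;       d[1] = e1 + e2;
;       d[2] = e1 - e2;
;       d[3] = e0 - e3;
;   }
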
%macro ADD4x4 0
cglobal add4x4_idct, 2,2,6
    mova m1, [r1+0x00] ; row1/row0
    mova m3, [r1+0x10] ; row3/row2
    psraw m0, m1, 1 ; row1>>1/...
    psraw m2, m3, 1 ; row3>>1/...
    movsd m0, m1 ; row1>>1/row0
    movsd m2, m3 ; row3>>1/row2
    psubw m0, m3 ; row1>>1-row3/row0-2
    paddw m2, m1 ; row3>>1+row1/row0+2
    SBUTTERFLY2 wd, 0, 2, 1
    SUMSUB_BA w, 2, 0, 1
    pshuflw m1, m2, q2301
    pshufhw m2, m2, q2301
    punpckldq m1, m0
    punpckhdq m2, m0
    SWAP 0, 1
    mova m1, [pw_32_0]
    paddw m1, m0 ; row1/row0 corrected
    psraw m0, 1 ; row1>>1/...
    psraw m3, m2, 1 ; row3>>1/...
    movsd m0, m1 ; row1>>1/row0
    movsd m3, m2 ; row3>>1/row2
    psubw m0, m2 ; row1>>1-row3/row0-2
    paddw m3, m1 ; row3>>1+row1/row0+2
    SBUTTERFLY2 qdq, 0, 3, 1
    SUMSUB_BA w, 3, 0, 1
    movd m4, [r0+FDEC_STRIDE*0]
    movd m1, [r0+FDEC_STRIDE*1]
    movd m2, [r0+FDEC_STRIDE*2]
    movd m5, [r0+FDEC_STRIDE*3]
    punpckldq m1, m4 ; row0/row1
    pxor m4, m4
    punpckldq m2, m5 ; row3/row2
    punpcklbw m1, m4
    psraw m3, 6
    punpcklbw m2, m4
    psraw m0, 6
    paddsw m3, m1
    paddsw m0, m2
    packuswb m0, m3 ; row0/row1/row3/row2
    pextrd [r0+FDEC_STRIDE*0], m0, 3
    pextrd [r0+FDEC_STRIDE*1], m0, 2
    movd [r0+FDEC_STRIDE*2], m0
    pextrd [r0+FDEC_STRIDE*3], m0, 1
    RET
%endmacro ; ADD4x4

INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4

%macro STOREx2_AVX2 9
    movq xm%3, [r0+%5*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
    movq xm%4, [r0+%7*FDEC_STRIDE]
    vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
    punpcklbw m%3, m%9
    punpcklbw m%4, m%9
    psraw m%1, 6
    psraw m%2, 6
    paddsw m%1, m%3
    paddsw m%2, m%4
    packuswb m%1, m%2
    vextracti128 xm%2, m%1, 1
    movq [r0+%5*FDEC_STRIDE], xm%1
    movq [r0+%6*FDEC_STRIDE], xm%2
    movhps [r0+%7*FDEC_STRIDE], xm%1
    movhps [r0+%8*FDEC_STRIDE], xm%2
%endmacro

INIT_YMM avx2
cglobal add8x8_idct, 2,3,8
    add r0, 4*FDEC_STRIDE
    pxor m7, m7
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    ; TRANSPOSE4x4Q
    mova xm0, [r1+ 0]
    mova xm1, [r1+32]
    mova xm2, [r1+16]
    mova xm3, [r1+48]
    vinserti128 m0, m0, [r1+ 64], 1
    vinserti128 m1, m1, [r1+ 96], 1
    vinserti128 m2, m2, [r1+ 80], 1
    vinserti128 m3, m3, [r1+112], 1
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
    STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
    ret

; 2xdst, 2xtmp, 4xsrcrow, 1xzero
%macro LOAD_DIFF8x2_AVX2 9
    movq xm%1, [r1+%5*FENC_STRIDE]
    movq xm%2, [r1+%6*FENC_STRIDE]
    vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
    vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
    punpcklbw m%1, m%9
    punpcklbw m%2, m%9
    movq xm%3, [r2+(%5-4)*FDEC_STRIDE]
    movq xm%4, [r2+(%6-4)*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
    vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
    punpcklbw m%3, m%9
    punpcklbw m%4, m%9
    psubw m%1, m%3
    psubw m%2, m%4
%endmacro

; 4x src, 1x tmp
%macro STORE8_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+ 0], xm%1
    mova [r0+ 16], xm%3
    mova [r0+ 32], xm%2
    mova [r0+ 48], xm%4
    vextracti128 [r0+ 64], m%1, 1
    vextracti128 [r0+ 80], m%3, 1
    vextracti128 [r0+ 96], m%2, 1
    vextracti128 [r0+112], m%4, 1
%endmacro

%macro STORE16_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+ 0-128], xm%1
    mova [r0+16-128], xm%3
    mova [r0+32-128], xm%2
    mova [r0+48-128], xm%4
    vextracti128 [r0+ 0], m%1, 1
    vextracti128 [r0+16], m%3, 1
    vextracti128 [r0+32], m%2, 1
    vextracti128 [r0+48], m%4, 1
%endmacro

INIT_YMM avx2
cglobal sub8x8_dct, 3,3,7
    pxor m6, m6
    add r2, 4*FDEC_STRIDE
    LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
    LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE8_DCT_AVX2 0, 1, 2, 3, 4
    RET

; 2x dst, 2x tmp, 2x row: 16-wide byte->word load of fenc/fdec rows and
; subtraction (reconstructed definition; the original was elided here, so
; this body is inferred from the macro's uses below)
%macro LOAD_DIFF16x2_AVX2 6
    pmovzxbw m%1, [r1+%5*FENC_STRIDE]
    pmovzxbw m%2, [r1+%6*FENC_STRIDE]
    pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
    pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
    psubw m%1, m%3
    psubw m%2, m%4
%endmacro

INIT_YMM avx2
cglobal sub16x16_dct, 3,3,6
    add r0, 128
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 256-64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    RET
.sub16x4_dct:
    LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE16_DCT_AVX2 0, 1, 2, 3, 4
    ret

%macro DCT4x4_AVX512 0
    psubw m0, m2 ; 0 1
    psubw m1, m3 ; 3 2
    SUMSUB_BA w, 1, 0, 2
    SBUTTERFLY wd, 1, 0, 2
    paddw m2, m1, m0
    psubw m3, m1, m0
    vpaddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
    vpsubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
    shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
    punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1
    SUMSUB_BA w, 1, 2, 3
    shufps m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 c1-c2 d1-d2
    shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
    paddw m2, m1, m3
    psubw m0, m1, m3
    vpaddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
    vpsubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
%endmacro
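
; The merge-masked ops are what make DCT4x4_AVX512 cheap: the unmasked
; paddw/psubw produce the plain sum/difference terms of dct4_1d in every
; word, then a single vpaddw/vpsubw masked with k1 (first pass) or k2
; (second pass, after the shuffle regrouping) overwrites only the lanes
; that need the doubled-weight terms (2a+b-c-2d and a-2b+2c-d), so both
; halves of the butterfly land interleaved in one register with no extra
; shifts or blends.
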
INIT_XMM avx512
cglobal sub4x4_dct
    mov eax, 0xf0aa
    kmovw k1, eax
    PROLOGUE 3,3
    movd m0, [r1+0*FENC_STRIDE]
    movd m2, [r2+0*FDEC_STRIDE]
    vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE]
    vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE]
    movd m1, [r1+3*FENC_STRIDE]
    movd m3, [r2+3*FDEC_STRIDE]
    vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE]
    vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE]
    kshiftrw k2, k1, 8
    pxor m4, m4
    punpcklbw m0, m4
    punpcklbw m2, m4
    punpcklbw m1, m4
    punpcklbw m3, m4
    DCT4x4_AVX512
    mova [r0], m2
    mova [r0+16], m0
    RET

INIT_ZMM avx512
cglobal dct4x4x4_internal
    punpcklbw m0, m1, m4
    punpcklbw m2, m3, m4
    punpckhbw m1, m4
    punpckhbw m3, m4
    DCT4x4_AVX512
    mova m1, m2
    vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0
    vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1
    ret

%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2
    movu %1, [r1+%3*FENC_STRIDE]
    vpermt2d %1, %2, [r1+%4*FENC_STRIDE]
%endmacro

%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2
    movu %1, [r2+(%4 )*FDEC_STRIDE]
    vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE]
    movu %3, [r2+(%5 )*FDEC_STRIDE]
    vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE]
    vpermt2d %1, %2, %3
%endmacro

cglobal sub8x8_dct, 3,3
    mova m0, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
    mov r1d, 0xaaaaaaaa
    kmovd k1, r1d
    psrld m0, 5
    DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4
    mov r1d, 0xf0f0f0f0
    kmovd k2, r1d
    pxor xm4, xm4
    knotw k3, k2
    call dct4x4x4_internal_avx512
    mova [r0], m0
    mova [r0+64], m1
    RET

%macro SUB4x16_DCT_AVX512 2 ; dst, src
    vpermd m1, m5, [r1+1*%2*64]
    mova m3, [r2+2*%2*64]
    vpermt2d m3, m6, [r2+2*%2*64+64]
    call dct4x4x4_internal_avx512
    mova [r0+%1*64 ], m0
    mova [r0+%1*64+128], m1
%endmacro

cglobal sub16x16_dct
    psrld m5, [dct_avx512], 10
    mov eax, 0xaaaaaaaa
    kmovd k1, eax
    mov eax, 0xf0f0f0f0
    kmovd k2, eax
    PROLOGUE 3,3
    pxor xm4, xm4
    knotw k3, k2
    psrld m6, m5, 4
    SUB4x16_DCT_AVX512 0, 0
    SUB4x16_DCT_AVX512 1, 1
    SUB4x16_DCT_AVX512 4, 2
    SUB4x16_DCT_AVX512 5, 3
    RET

cglobal sub8x8_dct_dc, 3,3
    mova m3, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
    mov r1d, 0xaa
    kmovb k1, r1d
    psrld m3, 5
    DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
    pxor xm3, xm3
    psadbw m0, m3
    psadbw m1, m3
    psubw m0, m1
    vpmovqw xmm0, m0
    vprold xmm1, xmm0, 16
    paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
    punpckhqdq xmm2, xmm0, xmm0
    psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
    paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
    punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
    punpcklqdq xmm1, xmm0, xmm0
    vpsubw xmm0 {k1}, xm3, xmm0
    paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
    movhps [r0], xmm0
    RET

cglobal sub8x16_dct_dc, 3,3
    mova m5, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5
    DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
    mov r1d, 0xaa
    kmovb k1, r1d
    psrld m5, 5
    DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8
    DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12
    pxor xm4, xm4
    psadbw m0, m4
    psadbw m1, m4
    psadbw m2, m4
    psadbw m3, m4
    psubw m0, m2
    psubw m1, m3
    SBUTTERFLY qdq, 0, 1, 2
    paddw m0, m1
    vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7
    psrlq xmm2, xmm0, 32
    psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
    paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7
    punpckhdq xmm2, xmm0, xmm1
    punpckldq xmm0, xmm1
    psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
    paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
    punpcklwd xmm0, xmm1
    psrlq xmm2, xmm0, 32
    psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
    paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
    shufps xmm0, xmm1, q0220
    mova [r0], xmm0
    RET

%macro SARSUMSUB 3 ; a, b, tmp
    mova m%3, m%1
    vpsraw m%1 {k1}, 1
    psubw m%1, m%2 ; 0-2 1>>1-3
    vpsraw m%2 {k1}, 1
    paddw m%2, m%3 ; 0+2 1+3>>1
%endmacro
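
; SARSUMSUB is the first stage of idct4_1d on packed rows: k1 masks the
; 4-word groups holding the odd rows, so the merge-masked vpsraw halves
; only row1/row3 data, yielding the (d0 - d2, (d1>>1) - d3) and
; (d0 + d2, d1 + (d3>>1)) pairs of idct4_1d in one register each.
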
cglobal add8x8_idct, 2,2
    mova m1, [r1]
    mova m2, [r1+64]
    mova m3, [dct_avx512]
    vbroadcasti32x4 m4, [pw_32]
    mov r1d, 0xf0f0f0f0
    kxnorb k2, k2, k2
    kmovd k1, r1d
    kmovb k3, k2
    vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d
    vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f
    psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
    vpgatherqq m6 {k2}, [r0+m5]
    SARSUMSUB 0, 1, 2
    SBUTTERFLY wd, 1, 0, 2
    psrlq m7, m3, 28
    SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3
    vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1
    SBUTTERFLY dq, 0, 1, 2
    psrlq m3, 24
    SARSUMSUB 0, 1, 2
    vpermi2q m3, m1, m0
    vpermt2q m1, m7, m0
    paddw m3, m4 ; += 32
    SUMSUB_BA w, 1, 3, 0
    psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
    psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
    pxor xm0, xm0
    SBUTTERFLY bw, 6, 0, 2
    paddsw m1, m6
    paddsw m3, m0
    packuswb m1, m3
    vpscatterqq [r0+m5] {k3}, m1
    RET
%endif ; HIGH_BIT_DEPTH

INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 7
cglobal %1, 3,3,%7
%if HIGH_BIT_DEPTH == 0
%if mmsize == 8
    pxor m7, m7
%else
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul]
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
    call %2.skip_prologue
    add r0, %3
    add r1, %4-%5-%6*FENC_STRIDE
    add r2, %4-%5-%6*FDEC_STRIDE
    call %2.skip_prologue
    add r0, %3
    add r1, (%4-%6)*FENC_STRIDE-%5-%4
    add r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call %2.skip_prologue
    add r0, %3
    add r1, %4-%5-%6*FENC_STRIDE
    add r2, %4-%5-%6*FDEC_STRIDE
    TAIL_CALL %2.skip_prologue, 1
%endmacro
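
; SUB_NxN_DCT builds an NxN DCT out of four calls to the quarter-size
; kernel, visiting the quarters in Z-order. For example, with
; "SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10"
; (%3=128, %4=8, %5=%6=0): r0 advances 128 coefficient bytes per call,
; while r1/r2 step +8 pixels right, then +8*STRIDE-8 (down 8 rows and back
; to the left half), then +8 right again.
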
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%if HIGH_BIT_DEPTH
cglobal %1, 2,2,%7
%if %3==256
    add r1, 128
%endif
%else
cglobal %1, 2,2,11
    pxor m7, m7
%endif
%if mmsize>=16 && %3!=256
    add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
    call %2.skip_prologue
    add r0, %4-%5-%6*FDEC_STRIDE
    add r1, %3
    call %2.skip_prologue
    add r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add r1, %3
    call %2.skip_prologue
    add r0, %4-%5-%6*FDEC_STRIDE
    add r1, %3
    TAIL_CALL %2.skip_prologue, 1
%endmacro

%if HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2, add4x4_idct_sse2, 64, 8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
ADD_NxN_IDCT add8x8_idct_avx, add4x4_idct_avx, 64, 8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 64, 16, 8, 8, 6
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 256, 16, 0, 0, 16
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_sse4.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
%if ARCH_X86_64 == 0
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx, 32, 8, 4, 4
cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx, 128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif

INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
cextern sub8x8_dct_xop.skip_prologue
SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx, 128, 8, 0, 0, 10
SUB_NxN_DCT sub16x16_dct_xop, sub8x8_dct_xop, 128, 8, 0, 0, 10
cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx, 128, 8, 0, 0
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx, 128, 8, 0, 0
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
INIT_YMM
ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
%macro ADD_DC 2
    mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
    mova m1, [%1+FDEC_STRIDEB*1]
    mova m2, [%1+FDEC_STRIDEB*2]
    paddsw m0, %2
    paddsw m1, %2
    paddsw m2, %2
    paddsw %2, [%1+FDEC_STRIDEB*3]
    CLIPW m0, m5, m6
    CLIPW m1, m5, m6
    CLIPW m2, m5, m6
    CLIPW %2, m5, m6
    mova [%1+FDEC_STRIDEB*0], m0
    mova [%1+FDEC_STRIDEB*1], m1
    mova [%1+FDEC_STRIDEB*2], m2
    mova [%1+FDEC_STRIDEB*3], %2
%endmacro

%macro ADD_IDCT_DC 0
cglobal add8x8_idct_dc, 2,2,7
    mova m6, [pw_pixel_max]
    pxor m5, m5
    mova m3, [r1]
    paddd m3, [pd_32]
    psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
    pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
    pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
    pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+FDEC_STRIDEB*4, m3
    RET

cglobal add16x16_idct_dc, 2,3,8
    mov r2, 4
    mova m6, [pw_pixel_max]
    mova m7, [pd_32]
    pxor m5, m5
.loop:
    mova m3, [r1]
    paddd m3, m7
    psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
    pshuflw m4, m3, q2200 ; dc0 dc0 dc1 dc1 _ _ _ _
    pshufhw m3, m3, q2200 ; _ _ _ _ dc2 dc2 dc3 dc3
    pshufd m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+SIZEOF_PIXEL*8, m3
    add r1, 16
    add r0, 4*FDEC_STRIDEB
    dec r2
    jg .loop
    RET
%endmacro ; ADD_IDCT_DC

INIT_XMM sse2
ADD_IDCT_DC
INIT_XMM avx
ADD_IDCT_DC
%else ; !HIGH_BIT_DEPTH

%macro ADD_DC 3
    mova m4, [%3+FDEC_STRIDE*0]
    mova m5, [%3+FDEC_STRIDE*1]
    mova m6, [%3+FDEC_STRIDE*2]
    paddusb m4, %1
    paddusb m5, %1
    paddusb m6, %1
    paddusb %1, [%3+FDEC_STRIDE*3]
    psubusb m4, %2
    psubusb m5, %2
    psubusb m6, %2
    psubusb %1, %2
    mova [%3+FDEC_STRIDE*0], m4
    mova [%3+FDEC_STRIDE*1], m5
    mova [%3+FDEC_STRIDE*2], m6
    mova [%3+FDEC_STRIDE*3], %1
%endmacro

INIT_MMX mmx2
cglobal add8x8_idct_dc, 2,2
    mova m0, [r1]
    pxor m1, m1
    add r0, FDEC_STRIDE*4
    paddw m0, [pw_32]
    psraw m0, 6
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw m2, m0, q3322
    pshufw m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC m0, m1, r0-FDEC_STRIDE*4
    ADD_DC m2, m3, r0
    RET

INIT_XMM ssse3
cglobal add8x8_idct_dc, 2,2
    movh m0, [r1]
    pxor m1, m1
    add r0, FDEC_STRIDE*4
    pmulhrsw m0, [pw_512]
    psubw m1, m0
    mova m5, [pb_unpackbd1]
    packuswb m0, m0
    packuswb m1, m1
    pshufb m0, m5
    pshufb m1, m5
    movh m2, [r0+FDEC_STRIDE*-4]
    movh m3, [r0+FDEC_STRIDE*-3]
    movh m4, [r0+FDEC_STRIDE*-2]
    movh m5, [r0+FDEC_STRIDE*-1]
    movhps m2, [r0+FDEC_STRIDE* 0]
    movhps m3, [r0+FDEC_STRIDE* 1]
    movhps m4, [r0+FDEC_STRIDE* 2]
    movhps m5, [r0+FDEC_STRIDE* 3]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    movh [r0+FDEC_STRIDE*-4], m2
    movh [r0+FDEC_STRIDE*-3], m3
    movh [r0+FDEC_STRIDE*-2], m4
    movh [r0+FDEC_STRIDE*-1], m5
    movhps [r0+FDEC_STRIDE* 0], m2
    movhps [r0+FDEC_STRIDE* 1], m3
    movhps [r0+FDEC_STRIDE* 2], m4
    movhps [r0+FDEC_STRIDE* 3], m5
    RET

INIT_MMX mmx2
cglobal add16x16_idct_dc, 2,3
    mov r2, 4
.loop:
    mova m0, [r1]
    pxor m1, m1
    paddw m0, [pw_32]
    psraw m0, 6
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw m2, m0, q3322
    pshufw m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC m0, m1, r0
    ADD_DC m2, m3, r0+8
    add r1, 8
    add r0, FDEC_STRIDE*4
    dec r2
    jg .loop
    RET

INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add r0, FDEC_STRIDE*4
    movq m0, [r1+0]
    movq m2, [r1+8]
    add r1, 16
    punpcklwd m0, m0
    punpcklwd m2, m2
    pxor m3, m3
    paddw m0, [pw_32]
    paddw m2, [pw_32]
    psraw m0, 6
    psraw m2, 6
    psubw m1, m3, m0
    packuswb m0, m1
    psubw m3, m2
    punpckhbw m1, m0, m0
    packuswb m2, m3
    punpckhbw m3, m2, m2
    punpcklbw m0, m0
    punpcklbw m2, m2
    ADD_DC m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC m2, m3, r0
    ret

%macro ADD16x16 0
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add r0, FDEC_STRIDE*4
    mova m0, [r1]
    add r1, 16
    pxor m1, m1
    pmulhrsw m0, [pw_512]
    psubw m1, m0
    mova m5, [pb_unpackbd1]
    mova m6, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb m2, m0, m6
    pshufb m0, m5
    pshufb m3, m1, m6
    pshufb m1, m5
    ADD_DC m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC m2, m3, r0
    ret
%endmacro ; ADD16x16

INIT_XMM ssse3
ADD16x16
INIT_XMM avx
ADD16x16

%macro ADD_DC_AVX2 3
    mova xm4, [r0+FDEC_STRIDE*0+%3]
    mova xm5, [r0+FDEC_STRIDE*1+%3]
    vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
    vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
    paddusb m4, %1
    paddusb m5, %1
    psubusb m4, %2
    psubusb m5, %2
    mova [r0+FDEC_STRIDE*0+%3], xm4
    mova [r0+FDEC_STRIDE*1+%3], xm5
    vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
    vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
%endmacro

INIT_YMM avx2
cglobal add16x16_idct_dc, 2,3,6
    add r0, FDEC_STRIDE*4
    mova m0, [r1]
    pxor m1, m1
    pmulhrsw m0, [pw_512]
    psubw m1, m0
    mova m4, [pb_unpackbd1]
    mova m5, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb m2, m0, m4 ; row0, row2
    pshufb m3, m1, m4 ; row0, row2
    pshufb m0, m5 ; row1, row3
    pshufb m1, m5 ; row1, row3
    lea r2, [r0+FDEC_STRIDE*8]
    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-4
    ADD_DC_AVX2 m2, m3, FDEC_STRIDE*-2
    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 0
    ADD_DC_AVX2 m0, m1, FDEC_STRIDE* 2
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro DCTDC_2ROW_MMX 4
    mova %1, [r1+FENC_STRIDE*(0+%3)]
    mova m1, [r1+FENC_STRIDE*(1+%3)]
    mova m2, [r2+FDEC_STRIDE*(0+%4)]
    mova m3, [r2+FDEC_STRIDE*(1+%4)]
    mova %2, %1
    punpckldq %1, m1
    punpckhdq %2, m1
    mova m1, m2
    punpckldq m2, m3
    punpckhdq m1, m3
    pxor m3, m3
    psadbw %1, m3
    psadbw %2, m3
    psadbw m2, m3
    psadbw m1, m3
    psubw %1, m2
    psubw %2, m1
%endmacro

%macro DCT2x2 2 ; reg s1/s0, reg s3/s2 (!=m0/m1)
    PSHUFLW m1, %1, q2200 ; s1 s1 s0 s0
    PSHUFLW m0, %2, q2301 ; s3 __ s2 __
    paddw m1, %2 ; s1 s13 s0 s02
    psubw m1, m0 ; d13 s13 d02 s02
    PSHUFLW m0, m1, q1010 ; d02 s02 d02 s02
    psrlq m1, 32 ; __ __ d13 s13
    paddw m0, m1 ; d02 s02 d02+d13 s02+s13
    psllq m1, 32 ; d13 s13
    psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
%endmacro
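
; DCT2x2 is the 2x2 Hadamard of the four per-quadrant DC sums s0..s3
; produced by the DCTDC_2ROW loads. Its four outputs are the sign
; combinations below (lane placement as per the comments above):
;
;   s0+s1+s2+s3,  s0-s1+s2-s3,  s0+s1-s2-s3,  s0-s1-s2+s3
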
%if HIGH_BIT_DEPTH == 0
INIT_MMX mmx2
cglobal sub8x8_dct_dc, 3,3
    DCTDC_2ROW_MMX m0, m4, 0, 0
    DCTDC_2ROW_MMX m5, m6, 2, 2
    paddw m0, m5
    paddw m4, m6
    punpckldq m0, m4
    add r2, FDEC_STRIDE*4
    DCTDC_2ROW_MMX m7, m4, 4, 0
    DCTDC_2ROW_MMX m5, m6, 6, 2
    paddw m7, m5
    paddw m4, m6
    punpckldq m7, m4
    DCT2x2 m0, m7
    mova [r0], m0
    RET

%macro DCTDC_2ROW_SSE2 4
    movh m1, [r1+FENC_STRIDE*(0+%1)]
    movh m2, [r1+FENC_STRIDE*(1+%1)]
    punpckldq m1, m2
    movh m2, [r2+FDEC_STRIDE*(0+%2)]
    punpckldq m2, [r2+FDEC_STRIDE*(1+%2)]
    psadbw m1, m0
    psadbw m2, m0
    ACCUM paddd, %4, 1, %3
    psubd m%4, m2
%endmacro

INIT_XMM sse2
cglobal sub8x8_dct_dc, 3,3
    pxor m0, m0
    DCTDC_2ROW_SSE2 0, 0, 0, 3
    DCTDC_2ROW_SSE2 2, 2, 1, 3
    add r2, FDEC_STRIDE*4
    DCTDC_2ROW_SSE2 4, 0, 0, 4
    DCTDC_2ROW_SSE2 6, 2, 1, 4
    packssdw m3, m3
    packssdw m4, m4
    DCT2x2 m3, m4
    movq [r0], m0
    RET

%macro SUB8x16_DCT_DC 0
cglobal sub8x16_dct_dc, 3,3
    pxor m0, m0
    DCTDC_2ROW_SSE2 0, 0, 0, 3
    DCTDC_2ROW_SSE2 2, 2, 1, 3
    add r1, FENC_STRIDE*8
    add r2, FDEC_STRIDE*8
    DCTDC_2ROW_SSE2 -4, -4, 0, 4
    DCTDC_2ROW_SSE2 -2, -2, 1, 4
    shufps m3, m4, q2020
    DCTDC_2ROW_SSE2 0, 0, 0, 5
    DCTDC_2ROW_SSE2 2, 2, 1, 5
    add r2, FDEC_STRIDE*4
    DCTDC_2ROW_SSE2 4, 0, 0, 4
    DCTDC_2ROW_SSE2 6, 2, 1, 4
    shufps m5, m4, q2020
%if cpuflag(ssse3)
%define %%sign psignw
%else
%define %%sign pmullw
%endif
    SUMSUB_BA d, 5, 3, 0
    packssdw m5, m3
    pshuflw m0, m5, q2301
    pshufhw m0, m0, q2301
    %%sign m5, [pw_pmpmpmpm]
    paddw m0, m5
    pshufd m1, m0, q1320
    pshufd m0, m0, q0231
    %%sign m1, [pw_ppppmmmm]
    paddw m0, m1
    mova [r0], m0
    RET
%endmacro ; SUB8x16_DCT_DC

INIT_XMM sse2
SUB8x16_DCT_DC
INIT_XMM ssse3
SUB8x16_DCT_DC
%endif ; !HIGH_BIT_DEPTH

%macro DCTDC_4ROW_SSE2 2
    mova %1, [r1+FENC_STRIDEB*%2]
    mova m0, [r2+FDEC_STRIDEB*%2]
%assign Y (%2+1)
%rep 3
    paddw %1, [r1+FENC_STRIDEB*Y]
    paddw m0, [r2+FDEC_STRIDEB*Y]
%assign Y (Y+1)
%endrep
    psubw %1, m0
    pshufd m0, %1, q2301
    paddw %1, m0
%endmacro

%if HIGH_BIT_DEPTH
%macro SUB8x8_DCT_DC_10 0
cglobal sub8x8_dct_dc, 3,3,3
    DCTDC_4ROW_SSE2 m1, 0
    DCTDC_4ROW_SSE2 m2, 4
    mova m0, [pw_ppmmmmpp]
    pmaddwd m1, m0
    pmaddwd m2, m0
    pshufd m0, m1, q2200 ; -1 -1 +0 +0
    pshufd m1, m1, q0033 ; +0 +0 +1 +1
    paddd m1, m0
    pshufd m0, m2, q1023 ; -2 +2 -3 +3
    paddd m1, m2
    paddd m1, m0
    mova [r0], m1
    RET
%endmacro

INIT_XMM sse2
SUB8x8_DCT_DC_10

%macro SUB8x16_DCT_DC_10 0
cglobal sub8x16_dct_dc, 3,3,6
    DCTDC_4ROW_SSE2 m1, 0
    DCTDC_4ROW_SSE2 m2, 4
    DCTDC_4ROW_SSE2 m3, 8
    DCTDC_4ROW_SSE2 m4, 12
    mova m0, [pw_ppmmmmpp]
    pmaddwd m1, m0
    pmaddwd m2, m0
    pshufd m5, m1, q2200 ; -1 -1 +0 +0
    pshufd m1, m1, q0033 ; +0 +0 +1 +1
    paddd m1, m5
    pshufd m5, m2, q1023 ; -2 +2 -3 +3
    paddd m1, m2
    paddd m1, m5 ; a6 a2 a4 a0
    pmaddwd m3, m0
    pmaddwd m4, m0
    pshufd m5, m3, q2200
    pshufd m3, m3, q0033
    paddd m3, m5
    pshufd m5, m4, q1023
    paddd m3, m4
    paddd m3, m5 ; a7 a3 a5 a1
    paddd m0, m1, m3
    psubd m1, m3
    pshufd m0, m0, q3120
    pshufd m1, m1, q3120
    punpcklqdq m2, m0, m1
    punpckhqdq m1, m0
    mova [r0+ 0], m2
    mova [r0+16], m1
    RET
%endmacro

INIT_XMM sse2
SUB8x16_DCT_DC_10
INIT_XMM avx
SUB8x16_DCT_DC_10
%endif

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 0
cglobal zigzag_scan_8x8_frame, 2,2,8
    movdqa xmm0, [r1]
    movdqa xmm1, [r1+16]
    movdq2q mm0, xmm0
    PALIGNR xmm1, xmm1, 14, xmm2
    movdq2q mm1, xmm1
    movdqa xmm2, [r1+32]
    movdqa xmm3, [r1+48]
    PALIGNR xmm2, xmm2, 12, xmm4
    movdq2q mm2, xmm2
    PALIGNR xmm3, xmm3, 10, xmm4
    movdq2q mm3, xmm3
    punpckhwd xmm0, xmm1
    punpckhwd xmm2, xmm3
    movq mm4, mm1
    movq mm5, mm1
    movq mm6, mm2
    movq mm7, mm3
    punpckhwd mm1, mm0
    psllq mm0, 16
    psrlq mm3, 16
    punpckhdq mm1, mm1
    punpckhdq mm2, mm0
    punpcklwd mm0, mm4
    punpckhwd mm4, mm3
    punpcklwd mm4, mm2
    punpckhdq mm0, mm2
    punpcklwd mm6, mm3
    punpcklwd mm5, mm7
    punpcklwd mm5, mm6
    movdqa xmm4, [r1+64]
    movdqa xmm5, [r1+80]
    movdqa xmm6, [r1+96]
    movdqa xmm7, [r1+112]
    movq [r0+2*00], mm0
    movq [r0+2*04], mm4
    movd [r0+2*08], mm1
    movq [r0+2*36], mm5
    movq [r0+2*46], mm6
    PALIGNR xmm4, xmm4, 14, xmm3
    movdq2q mm4, xmm4
    PALIGNR xmm5, xmm5, 12, xmm3
    movdq2q mm5, xmm5
    PALIGNR xmm6, xmm6, 10, xmm3
    movdq2q mm6, xmm6
%if cpuflag(ssse3)
    PALIGNR xmm7, xmm7, 8, xmm3
    movdq2q mm7, xmm7
%else
    movhlps xmm3, xmm7
    punpcklqdq xmm7, xmm7
    movdq2q mm7, xmm3
%endif
    punpckhwd xmm4, xmm5
    punpckhwd xmm6, xmm7
    movq mm0, mm4
    movq mm1, mm5
    movq mm3, mm7
    punpcklwd mm7, mm6
    psrlq mm6, 16
    punpcklwd mm4, mm6
    punpcklwd mm5, mm4
    punpckhdq mm4, mm3
    punpcklwd mm3, mm6
    punpckhwd mm3, mm4
    punpckhwd mm0, mm1
    punpckldq mm4, mm0
    punpckhdq mm0, mm6
    pshufw mm4, mm4, q1230
    movq [r0+2*14], mm4
    movq [r0+2*25], mm0
    movd [r0+2*54], mm7
    movq [r0+2*56], mm5
    movq [r0+2*60], mm3
    punpckhdq xmm3, xmm0, xmm2
    punpckldq xmm0, xmm2
    punpckhdq xmm7, xmm4, xmm6
    punpckldq xmm4, xmm6
    pshufhw xmm0, xmm0, q0123
    pshuflw xmm4, xmm4, q0123
    pshufhw xmm3, xmm3, q0123
    pshuflw xmm7, xmm7, q0123
    movlps [r0+2*10], xmm0
    movhps [r0+2*17], xmm0
    movlps [r0+2*21], xmm3
    movlps [r0+2*28], xmm4
    movhps [r0+2*32], xmm3
    movhps [r0+2*39], xmm4
    movlps [r0+2*43], xmm7
    movhps [r0+2*50], xmm7
    RET
%endmacro

%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
SCAN_8x8
INIT_XMM ssse3
SCAN_8x8
%endif

;-----------------------------------------------------------------------------
; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
;  0  8  1  2  9 16 24 17
; 10  3  4 11 18 25 32 40
; 33 26 19 12  5  6 13 20
; 27 34 41 48 56 49 42 35
; 28 21 14  7 15 22 29 36
; 43 50 57 58 51 44 37 30
; 23 31 38 45 52 59 60 53
; 46 39 47 54 61 62 55 63
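
; In scalar terms the scan is a pure gather, level[i] = dct[scan[i]], with
; scan[] being the 64 indices listed above. Sketch (hypothetical table name,
; not x264's actual C reference):
;
;   static void zigzag_scan_8x8_frame_ref( dctcoef level[64], const dctcoef dct[64] )
;   {
;       static const uint8_t scan[64] = { 0, 8, 1, 2, 9, 16, 24, 17, /* ...as listed above */ };
;       for( int i = 0; i < 64; i++ )
;           level[i] = dct[scan[i]];
;   }
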
- %macro SCAN_8x8_FRAME 5
- cglobal zigzag_scan_8x8_frame, 2,2,8
- mova m0, [r1]
- mova m1, [r1+ 8*SIZEOF_DCTCOEF]
- movu m2, [r1+14*SIZEOF_DCTCOEF]
- movu m3, [r1+21*SIZEOF_DCTCOEF]
- mova m4, [r1+28*SIZEOF_DCTCOEF]
- punpckl%4 m5, m0, m1
- psrl%2 m0, %1
- punpckh%4 m6, m1, m0
- punpckl%3 m5, m0
- punpckl%3 m1, m1
- punpckh%4 m1, m3
- mova m7, [r1+52*SIZEOF_DCTCOEF]
- mova m0, [r1+60*SIZEOF_DCTCOEF]
- punpckh%4 m1, m2
- punpckl%4 m2, m4
- punpckh%4 m4, m3
- punpckl%3 m3, m3
- punpckh%4 m3, m2
- mova [r0], m5
- mova [r0+ 4*SIZEOF_DCTCOEF], m1
- mova [r0+ 8*SIZEOF_DCTCOEF], m6
- punpckl%4 m6, m0
- punpckl%4 m6, m7
- mova m1, [r1+32*SIZEOF_DCTCOEF]
- movu m5, [r1+39*SIZEOF_DCTCOEF]
- movu m2, [r1+46*SIZEOF_DCTCOEF]
- movu [r0+35*SIZEOF_DCTCOEF], m3
- movu [r0+47*SIZEOF_DCTCOEF], m4
- punpckh%4 m7, m0
- psll%2 m0, %1
- punpckh%3 m3, m5, m5
- punpckl%4 m5, m1
- punpckh%4 m1, m2
- mova [r0+52*SIZEOF_DCTCOEF], m6
- movu [r0+13*SIZEOF_DCTCOEF], m5
- movu m4, [r1+11*SIZEOF_DCTCOEF]
- movu m6, [r1+25*SIZEOF_DCTCOEF]
- punpckl%4 m5, m7
- punpckl%4 m1, m3
- punpckh%3 m0, m7
- mova m3, [r1+ 4*SIZEOF_DCTCOEF]
- movu m7, [r1+18*SIZEOF_DCTCOEF]
- punpckl%4 m2, m5
- movu [r0+25*SIZEOF_DCTCOEF], m1
- mova m1, m4
- mova m5, m6
- punpckl%4 m4, m3
- punpckl%4 m6, m7
- punpckh%4 m1, m3
- punpckh%4 m5, m7
- punpckh%3 m3, m6, m4
- punpckh%3 m7, m5, m1
- punpckl%3 m6, m4
- punpckl%3 m5, m1
- movu m4, [r1+35*SIZEOF_DCTCOEF]
- movu m1, [r1+49*SIZEOF_DCTCOEF]
- pshuf%5 m6, m6, q0123
- pshuf%5 m5, m5, q0123
- mova [r0+60*SIZEOF_DCTCOEF], m0
- mova [r0+56*SIZEOF_DCTCOEF], m2
- movu m0, [r1+42*SIZEOF_DCTCOEF]
- mova m2, [r1+56*SIZEOF_DCTCOEF]
- movu [r0+17*SIZEOF_DCTCOEF], m3
- mova [r0+32*SIZEOF_DCTCOEF], m7
- movu [r0+10*SIZEOF_DCTCOEF], m6
- movu [r0+21*SIZEOF_DCTCOEF], m5
- punpckh%4 m3, m0, m4
- punpckh%4 m7, m2, m1
- punpckl%4 m0, m4
- punpckl%4 m2, m1
- punpckl%3 m4, m2, m0
- punpckl%3 m1, m7, m3
- punpckh%3 m2, m0
- punpckh%3 m7, m3
- pshuf%5 m2, m2, q0123
- pshuf%5 m7, m7, q0123
- mova [r0+28*SIZEOF_DCTCOEF], m4
- movu [r0+43*SIZEOF_DCTCOEF], m1
- movu [r0+39*SIZEOF_DCTCOEF], m2
- movu [r0+50*SIZEOF_DCTCOEF], m7
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- SCAN_8x8_FRAME 4 , dq, qdq, dq, d
- INIT_XMM avx
- SCAN_8x8_FRAME 4 , dq, qdq, dq, d
- %else
- INIT_MMX mmx2
- SCAN_8x8_FRAME 16, q , dq , wd, w
- %endif
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
- ;-----------------------------------------------------------------------------
- %macro SCAN_4x4 4
- cglobal zigzag_scan_4x4_frame, 2,2,6
- mova m0, [r1+ 0*SIZEOF_DCTCOEF]
- mova m1, [r1+ 4*SIZEOF_DCTCOEF]
- mova m2, [r1+ 8*SIZEOF_DCTCOEF]
- mova m3, [r1+12*SIZEOF_DCTCOEF]
- punpckl%4 m4, m0, m1
- psrl%2 m0, %1
- punpckl%3 m4, m0
- mova [r0+ 0*SIZEOF_DCTCOEF], m4
- punpckh%4 m0, m2
- punpckh%4 m4, m2, m3
- psll%2 m3, %1
- punpckl%3 m2, m2
- punpckl%4 m5, m1, m3
- punpckh%3 m1, m1
- punpckh%4 m5, m2
- punpckl%4 m1, m0
- punpckh%3 m3, m4
- mova [r0+ 4*SIZEOF_DCTCOEF], m5
- mova [r0+ 8*SIZEOF_DCTCOEF], m1
- mova [r0+12*SIZEOF_DCTCOEF], m3
- RET
- %endmacro
- %if HIGH_BIT_DEPTH
- INIT_XMM sse2
- SCAN_4x4 4, dq, qdq, dq
- INIT_XMM avx
- SCAN_4x4 4, dq, qdq, dq
- %else
- INIT_MMX mmx
- SCAN_4x4 16, q , dq , wd
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
- ;-----------------------------------------------------------------------------
- %macro SCAN_4x4_FRAME 0
- cglobal zigzag_scan_4x4_frame, 2,2
- mova m1, [r1+16]
- mova m0, [r1+ 0]
- pshufb m1, [pb_scan4frameb]
- pshufb m0, [pb_scan4framea]
- psrldq m2, m1, 6
- palignr m1, m0, 6
- pslldq m0, 10
- palignr m2, m0, 10
- mova [r0+ 0], m1
- mova [r0+16], m2
- RET
- %endmacro
- INIT_XMM ssse3
- SCAN_4x4_FRAME
- INIT_XMM avx
- SCAN_4x4_FRAME
- INIT_XMM xop
- cglobal zigzag_scan_4x4_frame, 2,2
- mova m0, [r1+ 0]
- mova m1, [r1+16]
- vpperm m2, m0, m1, [pb_scan4frame2a]
- vpperm m1, m0, m1, [pb_scan4frame2b]
- mova [r0+ 0], m2
- mova [r0+16], m1
- RET
- %endif ; !HIGH_BIT_DEPTH
- %if HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
- ;-----------------------------------------------------------------------------
- INIT_XMM sse2
- cglobal zigzag_scan_4x4_field, 2,2
- movu m0, [r1+ 8]
- pshufd m0, m0, q3102
- mova m1, [r1+32]
- mova m2, [r1+48]
- movu [r0+ 8], m0
- mova [r0+32], m1
- mova [r0+48], m2
- movq mm0, [r1]
- movq [r0], mm0
- movq mm0, [r1+24]
- movq [r0+24], mm0
- RET
- %else
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
- ;-----------------------------------------------------------------------------
- INIT_XMM sse
- cglobal zigzag_scan_4x4_field, 2,2
- mova m0, [r1]
- mova m1, [r1+16]
- pshufw mm0, [r1+4], q3102
- mova [r0], m0
- mova [r0+16], m1
- movq [r0+4], mm0
- RET
- %endif ; HIGH_BIT_DEPTH
- ;-----------------------------------------------------------------------------
- ; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
- ;-----------------------------------------------------------------------------
- ; Output order:
- ; 0 1 2 8 9 3 4 10
- ; 16 11 5 6 7 12 17 24
- ; 18 13 14 15 19 25 32 26
- ; 20 21 22 23 27 33 40 34
- ; 28 29 30 31 35 41 48 42
- ; 36 37 38 39 43 49 50 44
- ; 45 46 47 51 56 57 52 53
- ; 54 55 58 59 60 61 62 63
- %undef SCAN_8x8
- %macro SCAN_8x8 5
- cglobal zigzag_scan_8x8_field, 2,3,8
- mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
- mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
- mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
- pshuf%1 m3, m0, q3333 ; 03 03 03 03
- movd r2d, m2 ; 09 08
- pshuf%1 m2, m2, q0321 ; 08 11 10 09
- punpckl%2 m3, m1 ; 05 03 04 03
- pinsr%1 m0, r2d, 3 ; 08 02 01 00
- punpckl%2 m4, m2, m3 ; 04 10 03 09
- pshuf%1 m4, m4, q2310 ; 10 04 03 09
- mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
- mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
- mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
- mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
- punpckl%3 m6, m5 ; 17 16 XX XX
- psrl%4 m1, %5 ; XX 07 06 05
- punpckh%2 m6, m2 ; 08 17 11 16
- punpckl%3 m6, m1 ; 06 05 11 16
- mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
- psrl%4 m1, %5 ; XX XX 07 06
- punpckl%2 m1, m5 ; 17 07 16 06
- mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
- mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
- punpckh%3 m1, m1 ; 17 07 17 07
- punpckl%2 m6, m3, m2 ; 25 13 24 12
- pextr%1 r2d, m5, 2
- mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
- punpckl%2 m1, m6 ; 24 17 12 07
- mova [r0+12*SIZEOF_DCTCOEF], m1
- pinsr%1 m3, r2d, 0 ; 15 14 13 18
- mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
- mova m7, [r1+28*SIZEOF_DCTCOEF]
- mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
- psrl%4 m5, %5*3 ; XX XX XX 19
- pshuf%1 m1, m2, q3321 ; 27 27 26 25
- punpckl%2 m5, m0 ; 33 XX 32 19
- psrl%4 m2, %5*3 ; XX XX XX 27
- punpckl%2 m5, m1 ; 26 32 25 19
- mova [r0+32*SIZEOF_DCTCOEF], m7
- mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
- mova m7, [r1+36*SIZEOF_DCTCOEF]
- mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
- pshuf%1 m3, m0, q3321 ; 35 35 34 33
- punpckl%2 m2, m1 ; 41 XX 40 27
- mova [r0+40*SIZEOF_DCTCOEF], m7
- punpckl%2 m2, m3 ; 34 40 33 27
- mova [r0+28*SIZEOF_DCTCOEF], m2
- mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
- mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
- psrl%4 m0, %5*3 ; XX XX XX 35
- punpckl%2 m0, m2 ; 49 XX 48 35
- pshuf%1 m3, m1, q3321 ; 43 43 42 41
- punpckl%2 m0, m3 ; 42 48 41 35
- mova [r0+36*SIZEOF_DCTCOEF], m0
- pextr%1 r2d, m2, 3 ; 51
- psrl%4 m1, %5*3 ; XX XX XX 43
- punpckl%2 m1, m7 ; 45 XX 44 43
- psrl%4 m2, %5 ; XX 51 50 49
- punpckl%2 m1, m2 ; 50 44 49 43
- pshuf%1 m1, m1, q2310 ; 44 50 49 43
- mova [r0+44*SIZEOF_DCTCOEF], m1
- psrl%4 m7, %5 ; XX 47 46 45
- pinsr%1 m7, r2d, 3 ; 51 47 46 45
- mova [r0+48*SIZEOF_DCTCOEF], m7
- mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
- mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
- mova m7, [r1+60*SIZEOF_DCTCOEF]
- punpckl%3 m2, m0, m1 ; 53 52 57 56
- punpckh%3 m1, m0 ; 59 58 55 54
- mova [r0+52*SIZEOF_DCTCOEF], m2
- mova [r0+56*SIZEOF_DCTCOEF], m1
- mova [r0+60*SIZEOF_DCTCOEF], m7
- RET
- %endmacro
%if HIGH_BIT_DEPTH
INIT_XMM sse4
SCAN_8x8 d, dq, qdq, dq, 4
INIT_XMM avx
SCAN_8x8 d, dq, qdq, dq, 4
%else
INIT_MMX mmx2
SCAN_8x8 w, wd, dq , q , 16
%endif

;-----------------------------------------------------------------------------
; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
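; Roughly: zigzag-scan the residual src-dst into level, copy src into the
; reconstruction dst, and return whether any level is nonzero. A C sketch of
; the assumed semantics (illustrative, not the exact C reference;
; FENC_STRIDE/FDEC_STRIDE as elsewhere in x264):
;     int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;     {
;         static const uint8_t zigzag[16] = { 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 };
;         int nz = 0;
;         for( int i = 0; i < 16; i++ )
;         {
;             int x = zigzag[i] & 3, y = zigzag[i] >> 2;
;             level[i] = src[x + y*FENC_STRIDE] - dst[x + y*FDEC_STRIDE];
;             dst[x + y*FDEC_STRIDE] = src[x + y*FENC_STRIDE];
;             nz |= level[i];
;         }
;         return !!nz;
;     }
; The macro below also generates the field variants (a different pb_sub4 scan
; constant) and the ac variants, which store level[0] to *dc and zero it.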
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
cglobal zigzag_sub_4x4%1_%2, 4,4,8
%else
cglobal zigzag_sub_4x4%1_%2, 3,3,8
%endif
    movd      m0, [r1+0*FENC_STRIDE]
    movd      m1, [r1+1*FENC_STRIDE]
    movd      m2, [r1+2*FENC_STRIDE]
    movd      m3, [r1+3*FENC_STRIDE]
    movd      m4, [r2+0*FDEC_STRIDE]
    movd      m5, [r2+1*FDEC_STRIDE]
    movd      m6, [r2+2*FDEC_STRIDE]
    movd      m7, [r2+3*FDEC_STRIDE]
    movd      [r2+0*FDEC_STRIDE], m0
    movd      [r2+1*FDEC_STRIDE], m1
    movd      [r2+2*FDEC_STRIDE], m2
    movd      [r2+3*FDEC_STRIDE], m3
    punpckldq  m0, m1
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpckldq  m6, m7
    punpcklqdq m0, m2
    punpcklqdq m4, m6
    mova      m7, [pb_sub4%2]
    pshufb    m0, m7
    pshufb    m4, m7
    mova      m7, [hsub_mul]
    punpckhbw m1, m0, m4
    punpcklbw m0, m4
    pmaddubsw m1, m7
    pmaddubsw m0, m7
%ifidn %1, ac
    movd     r2d, m0
    pand      m0, [pb_subacmask]
%endif
    mova  [r0+ 0], m0
    por       m0, m1
    pxor      m2, m2
    mova  [r0+16], m1
    pcmpeqb   m0, m2
    pmovmskb eax, m0
%ifidn %1, ac
    mov     [r3], r2w
%endif
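    ; eax == 0xffff iff every coefficient byte was zero, so this computes
    ; the return value !!nz without a branch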
    sub      eax, 0xffff
    shr      eax, 31
    RET
%endmacro

%if HIGH_BIT_DEPTH == 0
INIT_XMM ssse3
ZIGZAG_SUB_4x4   , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4   , field
ZIGZAG_SUB_4x4 ac, field
INIT_XMM avx
ZIGZAG_SUB_4x4   , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4   , field
ZIGZAG_SUB_4x4 ac, field
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
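; vpperm is XOP's two-source byte permute: each selector byte picks one of 32
; source bytes, and selector ops can force a lane to zero, so scan fragments
; taken from different register pairs can simply be OR'd together. The
; pb_scan8field*/pb_scan8frame* selectors are defined with the other constants
; in this file.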
INIT_XMM xop
cglobal zigzag_scan_8x8_field, 2,3,7
    lea        r2, [pb_scan8field1]
%define off(m) (r2+m-pb_scan8field1)
    mova       m0, [r1+  0]
    mova       m1, [r1+ 16]
    vpperm     m5, m0, m1, [off(pb_scan8field1)]
    mova  [r0+  0], m5
    vpperm     m0, m0, m1, [off(pb_scan8field2a)]
    mova       m2, [r1+ 32]
    mova       m3, [r1+ 48]
    vpperm     m5, m2, m3, [off(pb_scan8field2b)]
    por        m5, m0
    mova  [r0+ 16], m5
    mova       m4, [off(pb_scan8field3b)]
    vpperm     m1, m1, m2, [off(pb_scan8field3a)]
    mova       m0, [r1+ 64]
    vpperm     m5, m3, m0, m4
    por        m5, m1
    mova  [r0+ 32], m5
    ; 4b, 5b are the same as pb_scan8field3b.
    ; 5a is the same as pb_scan8field4a.
    mova       m5, [off(pb_scan8field4a)]
    vpperm     m2, m2, m3, m5
    mova       m1, [r1+ 80]
    vpperm     m6, m0, m1, m4
    por        m6, m2
    mova  [r0+ 48], m6
    vpperm     m3, m3, m0, m5
    mova       m2, [r1+ 96]
    vpperm     m5, m1, m2, m4
    por        m5, m3
    mova  [r0+ 64], m5
    vpperm     m5, m0, m1, [off(pb_scan8field6)]
    mova  [r0+ 80], m5
    vpperm     m5, m1, m2, [off(pb_scan8field7)]
    mov       r2d, [r1+ 98]
    mov   [r0+ 90], r2d
    mova  [r0+ 96], m5
    mova       m3, [r1+112]
    movd [r0+104], m3
    mov       r2d, [r1+108]
    mova [r0+112], m3
    mov  [r0+112], r2d
%undef off
    RET

cglobal zigzag_scan_8x8_frame, 2,3,8
    lea        r2, [pb_scan8frame1]
%define off(m) (r2+m-pb_scan8frame1)
    mova       m7, [r1+ 16]
    mova       m3, [r1+ 32]
    vpperm     m7, m7, m3, [off(pb_scan8framet1)] ;  8  9 14 15 16 17 21 22
    mova       m2, [r1+ 48]
    vpperm     m0, m3, m2, [off(pb_scan8framet2)] ; 18 19 20 23 25 31 26 30
    mova       m1, [r1+ 80]
    mova       m4, [r1+ 64]
    vpperm     m3, m4, m1, [off(pb_scan8framet3)] ; 32 33 37 38 40 43 44 45
    vpperm     m6, m0, m3, [off(pb_scan8framet4)] ; 18 23 25 31 32 38 40 45
    vpperm     m5, m0, m3, [off(pb_scan8framet5)] ; 19 20 26 30 33 37 43 44
    vpperm     m3, m2, m4, [off(pb_scan8framet6)] ; 24 27 28 29 34 35 36 39
    mova       m4, [r1+ 96]
    vpperm     m4, m1, m4, [off(pb_scan8framet7)] ; 41 42 46 47 48 49 54 55
    mova       m1, [r1+  0]
    vpperm     m2, m1, m3, [off(pb_scan8framet8)] ;  0  1  2  7 24 28 29 36
    vpperm     m1, m2, m7, [off(pb_scan8frame1)]  ;  0  8  1  2  9 16 24 17
    mova  [r0+  0], m1
    movh       m0, [r1+  6]
    movhps     m0, [r1+ 20]                       ;  3  4  5  6 10 11 12 13
    vpperm     m1, m0, m6, [off(pb_scan8frame2)]  ; 10  3  4 11 18 25 32 40
    mova  [r0+ 16], m1
    vpperm     m1, m0, m5, [off(pb_scan8frame3)]  ; 33 26 19 12  5  6 13 20
    mova  [r0+ 32], m1
    vpperm     m1, m2, m7, [off(pb_scan8frame5)]  ; 28 21 14  7 15 22 29 36
    mova  [r0+ 64], m1
    movh       m0, [r1+100]
    movhps     m0, [r1+114]                       ; 50 51 52 53 57 58 59 60
    vpperm     m1, m5, m0, [off(pb_scan8frame6)]  ; 43 50 57 58 51 44 37 30
    mova  [r0+ 80], m1
    vpperm     m1, m6, m0, [off(pb_scan8frame7)]  ; 23 31 38 45 52 59 60 53
    mova  [r0+ 96], m1
    mova       m1, [r1+112]
    vpperm     m0, m3, m1, [off(pb_scan8framet9)] ; 27 34 35 39 56 61 62 63
    vpperm     m1, m0, m4, [off(pb_scan8frame4)]  ; 27 34 41 48 56 49 42 35
    mova  [r0+ 48], m1
    vpperm     m1, m0, m4, [off(pb_scan8frame8)]  ; 46 39 47 54 61 62 55 63
    mova [r0+112], m1
%undef off
    RET
%endif

;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
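; CAVLC codes an 8x8 block as four interleaved 4x4 blocks, so this routine
; deinterleaves the coefficients and records a nonzero flag per 4x4 block.
; A C sketch of the assumed semantics (the nnz offsets 0/1/8/9 match the
; stores below):
;     void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;     {
;         for( int i = 0; i < 4; i++ )
;         {
;             int nz = 0;
;             for( int j = 0; j < 16; j++ )
;             {
;                 nz |= src[i + j*4];
;                 dst[i*16 + j] = src[i + j*4];
;             }
;             nnz[(i&1) + (i>>1)*8] = !!nz;
;         }
;     }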
%macro INTERLEAVE 2
    mova     m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
    mova     m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
    mova     m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
    mova     m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
    TRANSPOSE4x4%2 0,1,2,3,4
    mova     [r0+(%1+ 0)*SIZEOF_PIXEL], m0
    mova     [r0+(%1+32)*SIZEOF_PIXEL], m1
    mova     [r0+(%1+64)*SIZEOF_PIXEL], m2
    mova     [r0+(%1+96)*SIZEOF_PIXEL], m3
    packsswb m0, m1
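    ; ACCUM x, a, b, i expands to "mova m_a, m_b" when i==0 and "x m_a, m_b"
    ; otherwise: the first sub-block initializes the nonzero accumulators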
    ACCUM    por, 6, 2, %1
    ACCUM    por, 7, 3, %1
    ACCUM    por, 5, 0, %1
%endmacro

%macro ZIGZAG_8x8_CAVLC 1
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
    INTERLEAVE  0, %1
    INTERLEAVE  8, %1
    INTERLEAVE 16, %1
    INTERLEAVE 24, %1
    packsswb m6, m7
    packsswb m5, m6
    packsswb m5, m5
    pxor     m0, m0
%if HIGH_BIT_DEPTH
    packsswb m5, m5
%endif
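    ; pcmpeqb yields 0xff for zero bytes; adding 1 wraps those to 0 and turns
    ; nonzero bytes into 1, giving the 0/1 nnz bytes stored below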
    pcmpeqb  m5, m0
    paddb    m5, [pb_1]
    movd    r0d, m5
    mov   [r2+0], r0w
    shr     r0d, 16
    mov   [r2+8], r0w
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
ZIGZAG_8x8_CAVLC D
INIT_XMM avx
ZIGZAG_8x8_CAVLC D
%else
INIT_MMX mmx
ZIGZAG_8x8_CAVLC W
%endif

%macro INTERLEAVE_XMM 1
    mova     m0, [r1+%1*4+ 0]
    mova     m1, [r1+%1*4+16]
    mova     m4, [r1+%1*4+32]
    mova     m5, [r1+%1*4+48]
    SBUTTERFLY wd, 0, 1, 6
    SBUTTERFLY wd, 4, 5, 7
    SBUTTERFLY wd, 0, 1, 6
    SBUTTERFLY wd, 4, 5, 7
    movh     [r0+%1+  0], m0
    movhps   [r0+%1+ 32], m0
    movh     [r0+%1+ 64], m1
    movhps   [r0+%1+ 96], m1
    movh     [r0+%1+  8], m4
    movhps   [r0+%1+ 40], m4
    movh     [r0+%1+ 72], m5
    movhps   [r0+%1+104], m5
    ACCUM    por, 2, 0, %1
    ACCUM    por, 3, 1, %1
    por      m2, m4
    por      m3, m5
%endmacro

%if HIGH_BIT_DEPTH == 0
%macro ZIGZAG_8x8_CAVLC 0
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
    INTERLEAVE_XMM  0
    INTERLEAVE_XMM 16
    packsswb m2, m3
    pxor     m5, m5
    packsswb m2, m2
    packsswb m2, m2
    pcmpeqb  m5, m2
    paddb    m5, [pb_1]
    movd    r0d, m5
    mov   [r2+0], r0w
    shr     r0d, 16
    mov   [r2+8], r0w
    RET
%endmacro

INIT_XMM sse2
ZIGZAG_8x8_CAVLC
INIT_XMM avx
ZIGZAG_8x8_CAVLC

INIT_YMM avx2
cglobal zigzag_interleave_8x8_cavlc, 3,3,6
    mova     m0, [r1+ 0]
    mova     m1, [r1+32]
    mova     m2, [r1+64]
    mova     m3, [r1+96]
    mova     m5, [deinterleave_shufd]
    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4
    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4
    vpermd   m0, m5, m0
    vpermd   m1, m5, m1
    vpermd   m2, m5, m2
    vpermd   m3, m5, m3
    mova  [r0+  0], xm0
    mova  [r0+ 16], xm2
    vextracti128 [r0+ 32], m0, 1
    vextracti128 [r0+ 48], m2, 1
    mova  [r0+ 64], xm1
    mova  [r0+ 80], xm3
    vextracti128 [r0+ 96], m1, 1
    vextracti128 [r0+112], m3, 1
    packsswb m0, m2        ; nnz0, nnz1
    packsswb m1, m3        ; nnz2, nnz3
    packsswb m0, m1        ; {nnz0,nnz2}, {nnz1,nnz3}
    vpermq   m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3}
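    ; each qword now holds one sub-block's packed coefficients: pcmpeqq turns
    ; all-zero sub-blocks into all-ones, and NOT + 0x01010101 leaves one 0/1
    ; byte per sub-block, stored pairwise at nnz[0..1] and nnz[8..9]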
    pxor     m5, m5
    pcmpeqq  m0, m5
    pmovmskb r0d, m0
    not      r0d
    and      r0d, 0x01010101
    mov   [r2+0], r0w
    shr      r0d, 16
    mov   [r2+8], r0w
    RET
%endif ; !HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
INIT_ZMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
    mova      m0, [scan_frame_avx512]
    vpermd    m0, m0, [r1]
    mova    [r0], m0
    RET

cglobal zigzag_scan_4x4_field, 2,2
    mova      m0, [r1]
    pshufd  xmm1, [r1+8], q3102
    mova    [r0], m0
    movu  [r0+8], xmm1
    RET

cglobal zigzag_scan_8x8_frame, 2,2
    psrld     m0, [scan_frame_avx512], 4
    mova      m1, [r1+0*64]
    mova      m2, [r1+1*64]
    mova      m3, [r1+2*64]
    mova      m4, [r1+3*64]
    mov      r1d, 0x01fe7f80
    kmovd     k1, r1d
    kshiftrd  k2, k1, 16
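    ; one constant supplies all four merge masks: qword-masked moves consume
    ; only 8 mask bits and dword-masked moves 16, so k1 acts as 0x80 (qword)
    ; or 0x7f80 (dword) and k2 = k1>>16 as 0xfe (qword) or 0x01fe (dword)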
    vpermd    m5, m0, m3   ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
    psrld     m6, m0, 5
    vpermi2d  m0, m1, m2   ;  0  8  1  2  9 16 24 17 10  3  4 11 18 25 __ __
    vmovdqa64 m0 {k1}, m5
    mova [r0+0*64], m0
    mova      m5, m1
    vpermt2d  m1, m6, m2   ; __ 26 19 12  5  6 13 20 27 __ __ __ __ __ __ __
    psrld     m0, m6, 5
    vpermi2d  m6, m3, m4   ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
    vmovdqa32 m6 {k2}, m1
    mova [r0+1*64], m6
    vpermt2d  m5, m0, m2   ; 28 21 14  7 15 22 29 __ __ __ __ __ __ __ __ 30
    psrld     m1, m0, 5
    vpermi2d  m0, m3, m4   ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
    vmovdqa32 m5 {k1}, m0
    mova [r0+2*64], m5
    vpermt2d  m3, m1, m4   ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
    vpermd    m2, m1, m2   ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
    vmovdqa64 m2 {k2}, m3
    mova [r0+3*64], m2
    RET

cglobal zigzag_scan_8x8_field, 2,2
    mova      m0, [scan_field_avx512]
    mova      m1, [r1+0*64]
    mova      m2, [r1+1*64]
    mova      m3, [r1+2*64]
    mova      m4, [r1+3*64]
    mov      r1d, 0x3f
    kmovb     k1, r1d
    psrld     m5, m0, 5
    vpermi2d  m0, m1, m2
    vmovdqa64 m1 {k1}, m3  ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15
    vpermt2d  m1, m5, m2
    psrld     m5, 5
    vmovdqa64 m2 {k1}, m4  ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31
    vpermt2d  m2, m5, m3
    psrld     m5, 5
    vpermt2d  m3, m5, m4
    mova [r0+0*64], m0
    mova [r0+1*64], m1
    mova [r0+2*64], m2
    mova [r0+3*64], m3
    RET

cglobal zigzag_interleave_8x8_cavlc, 3,3
    mova      m0, [cavlc_shuf_avx512]
    mova      m1, [r1+0*64]
    mova      m2, [r1+1*64]
    mova      m3, [r1+2*64]
    mova      m4, [r1+3*64]
    kxnorb    k1, k1, k1
    por       m7, m1, m2
    psrld     m5, m0, 5
    vpermi2d  m0, m1, m2         ; a0 a1 b0 b1
    vpternlogd m7, m3, m4, 0xfe  ; m1|m2|m3|m4
    psrld     m6, m5, 5
    vpermi2d  m5, m3, m4         ; b2 b3 a2 a3
    vptestmd  k0, m7, m7
    vpermt2d  m1, m6, m2         ; c0 c1 d0 d1
    psrld     m6, 5
    vpermt2d  m3, m6, m4         ; d2 d3 c2 c3
    vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3
    vmovdqa32 m5 {k1}, m0        ; a0 a1 a2 a3
    vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3
    vmovdqa32 m3 {k1}, m1        ; c0 c1 c2 c3
    mova [r0+0*64], m5
    mova [r0+1*64], m2
    mova [r0+2*64], m3
    mova [r0+3*64], m4
    kmovw    r1d, k0
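    ; k0 has one bit per dword lane of the OR of all inputs; sub-block j owns
    ; lanes j, j+4, j+8, j+12, so each test isolates one block's nnz flag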
    test     r1d, 0x1111
    setnz   [r2]
    test     r1d, 0x2222
    setnz   [r2+1]
    test     r1d, 0x4444
    setnz   [r2+8]
    test     r1d, 0x8888
    setnz   [r2+9]
    RET
%else ; !HIGH_BIT_DEPTH

INIT_YMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
    mova      m0, [scan_frame_avx512]
    vpermw    m0, m0, [r1]
    mova    [r0], m0
    RET

cglobal zigzag_scan_4x4_field, 2,2
    mova      m0, [r1]
    pshuflw xmm1, [r1+4], q3102
    mova    [r0], m0
    movq  [r0+4], xmm1
    RET

INIT_ZMM avx512
cglobal zigzag_scan_8x8_frame, 2,2
    psrlw     m0, [scan_frame_avx512], 4
scan8_avx512:
    mova      m1, [r1]
    mova      m2, [r1+64]
    psrlw     m3, m0, 6
    vpermi2w  m0, m1, m2
    vpermt2w  m1, m3, m2
    mova    [r0], m0
    mova [r0+64], m1
    RET

cglobal zigzag_scan_8x8_field, 2,2
    mova      m0, [scan_field_avx512]
    jmp scan8_avx512

cglobal zigzag_interleave_8x8_cavlc, 3,3
    mova      m0, [cavlc_shuf_avx512]
    mova      m1, [r1]
    mova      m2, [r1+64]
    psrlw     m3, m0, 6
    vpermi2w  m0, m1, m2
    vpermt2w  m1, m3, m2
    kxnorb    k2, k2, k2
    vptestmd  k0, m0, m0
    vptestmd  k1, m1, m1
    mova    [r0], m0
    mova [r0+64], m1
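    ; k2 = 0xff, so each ktestw checks both halves of a test mask at once:
    ; ZF=1 iff the low 8 bits (one 4x4 block) are all zero, CF=1 iff the
    ; remaining bits (the other block) are all zero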
    ktestw    k2, k0
    setnz   [r2]
    setnc   [r2+1]
    ktestw    k2, k1
    setnz   [r2+8]
    setnc   [r2+9]
    RET
%endif ; !HIGH_BIT_DEPTH