  1. ;*****************************************************************************
  2. ;* pixel.asm: x86 pixel metrics
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2003-2018 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Holger Lubitz <holger@lubitz.org>
  8. ;* Laurent Aimar <fenrir@via.ecp.fr>
  9. ;* Alex Izvorski <aizvorksi@gmail.com>
  10. ;* Fiona Glaser <fiona@x264.com>
  11. ;* Oskar Arvidsson <oskar@irock.se>
  12. ;*
  13. ;* This program is free software; you can redistribute it and/or modify
  14. ;* it under the terms of the GNU General Public License as published by
  15. ;* the Free Software Foundation; either version 2 of the License, or
  16. ;* (at your option) any later version.
  17. ;*
  18. ;* This program is distributed in the hope that it will be useful,
  19. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  21. ;* GNU General Public License for more details.
  22. ;*
  23. ;* You should have received a copy of the GNU General Public License
  24. ;* along with this program; if not, write to the Free Software
  25. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  26. ;*
  27. ;* This program is also available under a commercial proprietary license.
  28. ;* For more information, contact us at licensing@x264.com.
  29. ;*****************************************************************************
  30. %include "x86inc.asm"
  31. %include "x86util.asm"
  32. SECTION_RODATA 32
  33. var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
  34. db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
  35. hmul_16p: times 16 db 1
  36. times 8 db 1, -1
  37. hmul_8p: times 8 db 1
  38. times 4 db 1, -1
  39. times 8 db 1
  40. times 4 db 1, -1
  41. mask_ff: times 16 db 0xff
  42. times 16 db 0
  43. mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
  44. mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
  45. mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
  46. %if HIGH_BIT_DEPTH
  47. ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
  48. %endif
  49. %if BIT_DEPTH == 10
  50. ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
  51. ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
  52. pf_64: times 4 dd 64.0
  53. pf_128: times 4 dd 128.0
  54. %elif BIT_DEPTH == 9
  55. ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
  56. ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
  57. %else ; 8-bit
  58. ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
  59. ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
  60. %endif
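; The constants above are the SSIM stabilizers C1 = (K1*L)^2 and C2 = (K2*L)^2
; (K1 = .01, K2 = .03, L = max pixel value), pre-multiplied by the 64 and
; 64*63 factors noted in the comments. Worked check of the 8-bit values,
; spelled out only so the comments above are easy to verify:
;   .01*.01*255*255*64    = 416.16    -> 416
;   .03*.03*255*255*64*63 = 235962.72 -> 235963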
  61. hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
  62. mask_10: times 4 dw 0, -1
  63. mask_1100: times 2 dd 0, -1
  64. pb_pppm: times 4 db 1,1,1,-1
  65. deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
  66. intrax3_shuf: db 7,6,7,6,5,4,5,4,3,2,3,2,1,0,1,0
  67. intrax9a_ddlr1: db 6, 7, 8, 9, 7, 8, 9,10, 4, 5, 6, 7, 3, 4, 5, 6
  68. intrax9a_ddlr2: db 8, 9,10,11, 9,10,11,12, 2, 3, 4, 5, 1, 2, 3, 4
  69. intrax9a_hdu1: db 15, 4, 5, 6,14, 3,15, 4,14, 2,13, 1,13, 1,12, 0
  70. intrax9a_hdu2: db 13, 2,14, 3,12, 1,13, 2,12, 0,11,11,11,11,11,11
  71. intrax9a_vrl1: db 10,11,12,13, 3, 4, 5, 6,11,12,13,14, 5, 6, 7, 8
  72. intrax9a_vrl2: db 2,10,11,12, 1, 3, 4, 5,12,13,14,15, 6, 7, 8, 9
  73. intrax9a_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 4, 4, 4, 3, 3, 3, 3
  74. intrax9a_vh2: db 6, 7, 8, 9, 6, 7, 8, 9, 2, 2, 2, 2, 1, 1, 1, 1
  75. intrax9a_dc: db 1, 2, 3, 4, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1
  76. intrax9a_lut: db 0x60,0x68,0x80,0x00,0x08,0x20,0x40,0x28,0x48,0,0,0,0,0,0,0
  77. pw_s01234567: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8005,0x8006,0x8007
  78. pw_s01234657: dw 0x8000,0x8001,0x8002,0x8003,0x8004,0x8006,0x8005,0x8007
  79. intrax9_edge: db 0, 0, 1, 2, 3, 7, 8, 9,10,11,12,13,14,15,15,15
  80. intrax9b_ddlr1: db 6, 7, 8, 9, 4, 5, 6, 7, 7, 8, 9,10, 3, 4, 5, 6
  81. intrax9b_ddlr2: db 8, 9,10,11, 2, 3, 4, 5, 9,10,11,12, 1, 2, 3, 4
  82. intrax9b_hdu1: db 15, 4, 5, 6,14, 2,13, 1,14, 3,15, 4,13, 1,12, 0
  83. intrax9b_hdu2: db 13, 2,14, 3,12, 0,11,11,12, 1,13, 2,11,11,11,11
  84. intrax9b_vrl1: db 10,11,12,13,11,12,13,14, 3, 4, 5, 6, 5, 6, 7, 8
  85. intrax9b_vrl2: db 2,10,11,12,12,13,14,15, 1, 3, 4, 5, 6, 7, 8, 9
  86. intrax9b_vh1: db 6, 7, 8, 9, 4, 4, 4, 4, 6, 7, 8, 9, 3, 3, 3, 3
  87. intrax9b_vh2: db 6, 7, 8, 9, 2, 2, 2, 2, 6, 7, 8, 9, 1, 1, 1, 1
  88. intrax9b_edge2: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
  89. intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
  90. intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
  91. intrax9b_lut: db 0x60,0x64,0x80,0x00,0x04,0x20,0x40,0x24,0x44,0,0,0,0,0,0,0
  92. ALIGN 32
  93. intra8x9_h1: db 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5
  94. intra8x9_h2: db 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4
  95. intra8x9_h3: db 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1
  96. intra8x9_h4: db 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0
  97. intra8x9_ddl1: db 1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9,10
  98. intra8x9_ddl2: db 2, 3, 4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9,10,11
  99. intra8x9_ddl3: db 5, 6, 7, 8, 9,10,11,12, 7, 8, 9,10,11,12,13,14
  100. intra8x9_ddl4: db 6, 7, 8, 9,10,11,12,13, 8, 9,10,11,12,13,14,15
  101. intra8x9_vl1: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
  102. intra8x9_vl2: db 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5, 6, 7, 8, 9
  103. intra8x9_vl3: db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9,10
  104. intra8x9_vl4: db 3, 4, 5, 6, 7, 8, 9,10, 4, 5, 6, 7, 8, 9,10,11
  105. intra8x9_ddr1: db 8, 9,10,11,12,13,14,15, 6, 7, 8, 9,10,11,12,13
  106. intra8x9_ddr2: db 7, 8, 9,10,11,12,13,14, 5, 6, 7, 8, 9,10,11,12
  107. intra8x9_ddr3: db 4, 5, 6, 7, 8, 9,10,11, 2, 3, 4, 5, 6, 7, 8, 9
  108. intra8x9_ddr4: db 3, 4, 5, 6, 7, 8, 9,10, 1, 2, 3, 4, 5, 6, 7, 8
  109. intra8x9_vr1: db 8, 9,10,11,12,13,14,15, 7, 8, 9,10,11,12,13,14
  110. intra8x9_vr2: db 8, 9,10,11,12,13,14,15, 6, 8, 9,10,11,12,13,14
  111. intra8x9_vr3: db 5, 7, 8, 9,10,11,12,13, 3, 5, 7, 8, 9,10,11,12
  112. intra8x9_vr4: db 4, 6, 8, 9,10,11,12,13, 2, 4, 6, 8, 9,10,11,12
  113. intra8x9_hd1: db 3, 8, 9,10,11,12,13,14, 1, 6, 2, 7, 3, 8, 9,10
  114. intra8x9_hd2: db 2, 7, 3, 8, 9,10,11,12, 0, 5, 1, 6, 2, 7, 3, 8
  115. intra8x9_hd3: db 7, 8, 9,10,11,12,13,14, 3, 4, 5, 6, 7, 8, 9,10
  116. intra8x9_hd4: db 5, 6, 7, 8, 9,10,11,12, 1, 2, 3, 4, 5, 6, 7, 8
  117. intra8x9_hu1: db 13,12,11,10, 9, 8, 7, 6, 9, 8, 7, 6, 5, 4, 3, 2
  118. intra8x9_hu2: db 11,10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4, 3, 2, 1, 0
  119. intra8x9_hu3: db 5, 4, 3, 2, 1, 0,15,15, 1, 0,15,15,15,15,15,15
  120. intra8x9_hu4: db 3, 2, 1, 0,15,15,15,15,15,15,15,15,15,15,15,15
  121. pw_s00112233: dw 0x8000,0x8000,0x8001,0x8001,0x8002,0x8002,0x8003,0x8003
  122. pw_s00001111: dw 0x8000,0x8000,0x8000,0x8000,0x8001,0x8001,0x8001,0x8001
  123. transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
  124. transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
  125. sw_f0: dq 0xfff0, 0
  126. pd_f0: times 4 dd 0xffff0000
  127. pd_2: times 4 dd 2
  128. pw_76543210: dw 0, 1, 2, 3, 4, 5, 6, 7
  129. ads_mvs_shuffle:
  130. %macro ADS_MVS_SHUFFLE 8
  131. %assign y x
  132. %rep 8
  133. %rep 7
  134. %rotate (~y)&1
  135. %assign y y>>((~y)&1)
  136. %endrep
  137. db %1*2, %1*2+1
  138. %rotate 1
  139. %assign y y>>1
  140. %endrep
  141. %endmacro
  142. %assign x 0
  143. %rep 256
  144. ADS_MVS_SHUFFLE 0, 1, 2, 3, 4, 5, 6, 7
  145. %assign x x+1
  146. %endrep
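; The %rep block above expands to a 256-entry x 16-byte pshufb table: entry m
; holds the byte pairs (2*i, 2*i+1) of every set bit i of the 8-bit mask m,
; packed to the front (the remaining bytes emitted by the macro are don't-care
; padding). A rough C equivalent of the generator, for illustration only
; (hypothetical code, not part of x264):
;
;     static uint8_t ads_mvs_shuffle[256][16];
;     for( int m = 0; m < 256; m++ )
;         for( int i = 0, k = 0; i < 8; i++ )
;             if( m & (1 << i) )
;             {
;                 ads_mvs_shuffle[m][k++] = 2*i;
;                 ads_mvs_shuffle[m][k++] = 2*i+1;
;             }
;     /* only the first 2*popcount(m) bytes of each entry matter */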
  147. SECTION .text
  148. cextern pb_0
  149. cextern pb_1
  150. cextern pw_1
  151. cextern pw_8
  152. cextern pw_16
  153. cextern pw_32
  154. cextern pw_00ff
  155. cextern pw_ppppmmmm
  156. cextern pw_ppmmppmm
  157. cextern pw_pmpmpmpm
  158. cextern pw_pmmpzzzz
  159. cextern pd_1
  160. cextern hsub_mul
  161. cextern popcnt_table
  162. ;=============================================================================
  163. ; SSD
  164. ;=============================================================================
  165. %if HIGH_BIT_DEPTH
  166. ;-----------------------------------------------------------------------------
  167. ; int pixel_ssd_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t )
  168. ;-----------------------------------------------------------------------------
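; For reference, the operation vectorized below is the plain sum of squared
; differences over a WxH block of 16-bit pixels. A scalar sketch (W and H
; stand for the macro parameters; illustrative only, not the C reference from
; pixel.c):
;
;     static int ssd_WxH( uint16_t *pix1, intptr_t stride1,
;                         uint16_t *pix2, intptr_t stride2 )
;     {
;         int ssd = 0;
;         for( int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2 )
;             for( int x = 0; x < W; x++ )
;             {
;                 int d = pix1[x] - pix2[x];
;                 ssd += d * d;
;             }
;         return ssd;
;     }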
  169. %macro SSD_ONE 2
  170. cglobal pixel_ssd_%1x%2, 4,7,6
  171. FIX_STRIDES r1, r3
  172. %if mmsize == %1*2
  173. %define offset0_1 r1
  174. %define offset0_2 r1*2
  175. %define offset0_3 r5
  176. %define offset1_1 r3
  177. %define offset1_2 r3*2
  178. %define offset1_3 r6
  179. lea r5, [3*r1]
  180. lea r6, [3*r3]
  181. %elif mmsize == %1
  182. %define offset0_1 mmsize
  183. %define offset0_2 r1
  184. %define offset0_3 r1+mmsize
  185. %define offset1_1 mmsize
  186. %define offset1_2 r3
  187. %define offset1_3 r3+mmsize
  188. %elif mmsize == %1/2
  189. %define offset0_1 mmsize
  190. %define offset0_2 mmsize*2
  191. %define offset0_3 mmsize*3
  192. %define offset1_1 mmsize
  193. %define offset1_2 mmsize*2
  194. %define offset1_3 mmsize*3
  195. %endif
  196. %assign %%n %2/(2*mmsize/%1)
  197. %if %%n > 1
  198. mov r4d, %%n
  199. %endif
  200. pxor m0, m0
  201. .loop:
  202. mova m1, [r0]
  203. mova m2, [r0+offset0_1]
  204. mova m3, [r0+offset0_2]
  205. mova m4, [r0+offset0_3]
  206. psubw m1, [r2]
  207. psubw m2, [r2+offset1_1]
  208. psubw m3, [r2+offset1_2]
  209. psubw m4, [r2+offset1_3]
  210. %if %%n > 1
  211. lea r0, [r0+r1*(%2/%%n)]
  212. lea r2, [r2+r3*(%2/%%n)]
  213. %endif
  214. pmaddwd m1, m1
  215. pmaddwd m2, m2
  216. pmaddwd m3, m3
  217. pmaddwd m4, m4
  218. paddd m1, m2
  219. paddd m3, m4
  220. paddd m0, m1
  221. paddd m0, m3
  222. %if %%n > 1
  223. dec r4d
  224. jg .loop
  225. %endif
  226. HADDD m0, m5
  227. movd eax, xm0
  228. RET
  229. %endmacro
  230. INIT_MMX mmx2
  231. SSD_ONE 4, 4
  232. SSD_ONE 4, 8
  233. SSD_ONE 4, 16
  234. SSD_ONE 8, 4
  235. SSD_ONE 8, 8
  236. SSD_ONE 8, 16
  237. SSD_ONE 16, 8
  238. SSD_ONE 16, 16
  239. INIT_XMM sse2
  240. SSD_ONE 8, 4
  241. SSD_ONE 8, 8
  242. SSD_ONE 8, 16
  243. SSD_ONE 16, 8
  244. SSD_ONE 16, 16
  245. INIT_YMM avx2
  246. SSD_ONE 16, 8
  247. SSD_ONE 16, 16
  248. %endif ; HIGH_BIT_DEPTH
  249. %if HIGH_BIT_DEPTH == 0
  250. %macro SSD_LOAD_FULL 5
  251. mova m1, [t0+%1]
  252. mova m2, [t2+%2]
  253. mova m3, [t0+%3]
  254. mova m4, [t2+%4]
  255. %if %5==1
  256. add t0, t1
  257. add t2, t3
  258. %elif %5==2
  259. lea t0, [t0+2*t1]
  260. lea t2, [t2+2*t3]
  261. %endif
  262. %endmacro
  263. %macro LOAD 5
  264. movh m%1, %3
  265. movh m%2, %4
  266. %if %5
  267. lea t0, [t0+2*t1]
  268. %endif
  269. %endmacro
  270. %macro JOIN 7
  271. movh m%3, %5
  272. movh m%4, %6
  273. %if %7
  274. lea t2, [t2+2*t3]
  275. %endif
  276. punpcklbw m%1, m7
  277. punpcklbw m%3, m7
  278. psubw m%1, m%3
  279. punpcklbw m%2, m7
  280. punpcklbw m%4, m7
  281. psubw m%2, m%4
  282. %endmacro
  283. %macro JOIN_SSE2 7
  284. movh m%3, %5
  285. movh m%4, %6
  286. %if %7
  287. lea t2, [t2+2*t3]
  288. %endif
  289. punpcklqdq m%1, m%2
  290. punpcklqdq m%3, m%4
  291. DEINTB %2, %1, %4, %3, 7
  292. psubw m%2, m%4
  293. psubw m%1, m%3
  294. %endmacro
  295. %macro JOIN_SSSE3 7
  296. movh m%3, %5
  297. movh m%4, %6
  298. %if %7
  299. lea t2, [t2+2*t3]
  300. %endif
  301. punpcklbw m%1, m%3
  302. punpcklbw m%2, m%4
  303. %endmacro
  304. %macro LOAD_AVX2 5
  305. mova xm%1, %3
  306. vinserti128 m%1, m%1, %4, 1
  307. %if %5
  308. lea t0, [t0+2*t1]
  309. %endif
  310. %endmacro
  311. %macro JOIN_AVX2 7
  312. mova xm%2, %5
  313. vinserti128 m%2, m%2, %6, 1
  314. %if %7
  315. lea t2, [t2+2*t3]
  316. %endif
  317. SBUTTERFLY bw, %1, %2, %3
  318. %endmacro
  319. %macro SSD_LOAD_HALF 5
  320. LOAD 1, 2, [t0+%1], [t0+%3], 1
  321. JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
  322. LOAD 3, 4, [t0+%1], [t0+%3], %5
  323. JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
  324. %endmacro
  325. %macro SSD_CORE 7-8
  326. %ifidn %8, FULL
  327. mova m%6, m%2
  328. mova m%7, m%4
  329. psubusb m%2, m%1
  330. psubusb m%4, m%3
  331. psubusb m%1, m%6
  332. psubusb m%3, m%7
  333. por m%1, m%2
  334. por m%3, m%4
  335. punpcklbw m%2, m%1, m%5
  336. punpckhbw m%1, m%5
  337. punpcklbw m%4, m%3, m%5
  338. punpckhbw m%3, m%5
  339. %endif
  340. pmaddwd m%1, m%1
  341. pmaddwd m%2, m%2
  342. pmaddwd m%3, m%3
  343. pmaddwd m%4, m%4
  344. %endmacro
  345. %macro SSD_CORE_SSE2 7-8
  346. %ifidn %8, FULL
  347. DEINTB %6, %1, %7, %2, %5
  348. psubw m%6, m%7
  349. psubw m%1, m%2
  350. SWAP %6, %2, %1
  351. DEINTB %6, %3, %7, %4, %5
  352. psubw m%6, m%7
  353. psubw m%3, m%4
  354. SWAP %6, %4, %3
  355. %endif
  356. pmaddwd m%1, m%1
  357. pmaddwd m%2, m%2
  358. pmaddwd m%3, m%3
  359. pmaddwd m%4, m%4
  360. %endmacro
  361. %macro SSD_CORE_SSSE3 7-8
  362. %ifidn %8, FULL
  363. punpckhbw m%6, m%1, m%2
  364. punpckhbw m%7, m%3, m%4
  365. punpcklbw m%1, m%2
  366. punpcklbw m%3, m%4
  367. SWAP %6, %2, %3
  368. SWAP %7, %4
  369. %endif
  370. pmaddubsw m%1, m%5
  371. pmaddubsw m%2, m%5
  372. pmaddubsw m%3, m%5
  373. pmaddubsw m%4, m%5
  374. pmaddwd m%1, m%1
  375. pmaddwd m%2, m%2
  376. pmaddwd m%3, m%3
  377. pmaddwd m%4, m%4
  378. %endmacro
  379. %macro SSD_ITER 6
  380. SSD_LOAD_%1 %2,%3,%4,%5,%6
  381. SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
  382. paddd m1, m2
  383. paddd m3, m4
  384. paddd m0, m1
  385. paddd m0, m3
  386. %endmacro
  387. ;-----------------------------------------------------------------------------
  388. ; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
  389. ;-----------------------------------------------------------------------------
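; Note on the SSD macro below: al is loaded with the loop count
; (%1*%2/mmsize/2, i.e. every SSD_ITER consumes 2*mmsize pixels), and the
; non-square sizes avoid duplicating the loop body by tail-jumping into the
; .startloop label of the matching square version.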
  390. %macro SSD 2
  391. %if %1 != %2
  392. %assign function_align 8
  393. %else
  394. %assign function_align 16
  395. %endif
  396. cglobal pixel_ssd_%1x%2, 0,0,0
  397. mov al, %1*%2/mmsize/2
  398. %if %1 != %2
  399. jmp mangle(private_prefix %+ _pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop)
  400. %else
  401. .startloop:
  402. %if ARCH_X86_64
  403. DECLARE_REG_TMP 0,1,2,3
  404. PROLOGUE 0,0,8
  405. %else
  406. PROLOGUE 0,5
  407. DECLARE_REG_TMP 1,2,3,4
  408. mov t0, r0m
  409. mov t1, r1m
  410. mov t2, r2m
  411. mov t3, r3m
  412. %endif
  413. %if cpuflag(ssse3)
  414. mova m7, [hsub_mul]
  415. %elifidn cpuname, sse2
  416. mova m7, [pw_00ff]
  417. %elif %1 >= mmsize
  418. pxor m7, m7
  419. %endif
  420. pxor m0, m0
  421. ALIGN 16
  422. .loop:
  423. %if %1 > mmsize
  424. SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
  425. %elif %1 == mmsize
  426. SSD_ITER FULL, 0, 0, t1, t3, 2
  427. %else
  428. SSD_ITER HALF, 0, 0, t1, t3, 2
  429. %endif
  430. dec al
  431. jg .loop
  432. %if mmsize==32
  433. vextracti128 xm1, m0, 1
  434. paddd xm0, xm1
  435. HADDD xm0, xm1
  436. movd eax, xm0
  437. %else
  438. HADDD m0, m1
  439. movd eax, m0
  440. %endif
  441. RET
  442. %endif
  443. %endmacro
  444. INIT_MMX mmx
  445. SSD 16, 16
  446. SSD 16, 8
  447. SSD 8, 8
  448. SSD 8, 16
  449. SSD 4, 4
  450. SSD 8, 4
  451. SSD 4, 8
  452. SSD 4, 16
  453. INIT_XMM sse2slow
  454. SSD 16, 16
  455. SSD 8, 8
  456. SSD 16, 8
  457. SSD 8, 16
  458. SSD 8, 4
  459. INIT_XMM sse2
  460. %define SSD_CORE SSD_CORE_SSE2
  461. %define JOIN JOIN_SSE2
  462. SSD 16, 16
  463. SSD 8, 8
  464. SSD 16, 8
  465. SSD 8, 16
  466. SSD 8, 4
  467. INIT_XMM ssse3
  468. %define SSD_CORE SSD_CORE_SSSE3
  469. %define JOIN JOIN_SSSE3
  470. SSD 16, 16
  471. SSD 8, 8
  472. SSD 16, 8
  473. SSD 8, 16
  474. SSD 8, 4
  475. INIT_XMM avx
  476. SSD 16, 16
  477. SSD 8, 8
  478. SSD 16, 8
  479. SSD 8, 16
  480. SSD 8, 4
  481. INIT_MMX ssse3
  482. SSD 4, 4
  483. SSD 4, 8
  484. SSD 4, 16
  485. INIT_XMM xop
  486. SSD 16, 16
  487. SSD 8, 8
  488. SSD 16, 8
  489. SSD 8, 16
  490. SSD 8, 4
  491. %define LOAD LOAD_AVX2
  492. %define JOIN JOIN_AVX2
  493. INIT_YMM avx2
  494. SSD 16, 16
  495. SSD 16, 8
  496. %assign function_align 16
  497. %endif ; !HIGH_BIT_DEPTH
  498. ;-----------------------------------------------------------------------------
  499. ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
  500. ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
  501. ;
  502. ; The maximum width this function can handle without risk of overflow is given
  503. ; in the following equation: (mmsize in bits)
  504. ;
  505. ; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
  506. ;
  507. ; For 10-bit XMM this means width >= 32832. At sane distortion levels
  508. ; it will take much more than that though.
  509. ;-----------------------------------------------------------------------------
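; Evaluating the equation above for BIT_DEPTH == 10 (arithmetic shown for
; convenience only):
;   XMM: 2 * 128/32 * (2^32-1) / 1023^2 = 32832
;   YMM: 2 * 256/32 * (2^32-1) / 1023^2 = 65664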
  510. %if HIGH_BIT_DEPTH
  511. %macro SSD_NV12 0
  512. cglobal pixel_ssd_nv12_core, 6,7,7
  513. shl r4d, 2
  514. FIX_STRIDES r1, r3
  515. add r0, r4
  516. add r2, r4
  517. neg r4
  518. pxor m4, m4
  519. pxor m5, m5
  520. %if mmsize == 32
  521. vbroadcasti128 m6, [ssd_nv12_shuf]
  522. %endif
  523. .loopy:
  524. mov r6, r4
  525. pxor m2, m2
  526. pxor m3, m3
  527. .loopx:
  528. mova m0, [r0+r6]
  529. mova m1, [r0+r6+mmsize]
  530. psubw m0, [r2+r6]
  531. psubw m1, [r2+r6+mmsize]
  532. %if mmsize == 32
  533. pshufb m0, m6
  534. pshufb m1, m6
  535. %else
  536. SBUTTERFLY wd, 0, 1, 6
  537. %endif
  538. %if cpuflag(xop)
  539. pmadcswd m2, m0, m0, m2
  540. pmadcswd m3, m1, m1, m3
  541. %else
  542. pmaddwd m0, m0
  543. pmaddwd m1, m1
  544. paddd m2, m0
  545. paddd m3, m1
  546. %endif
  547. add r6, 2*mmsize
  548. jl .loopx
  549. %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
  550. jz .no_overread
  551. psubd m3, m1
  552. .no_overread:
  553. %endif
  554. punpckhdq m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the
  555. punpckhdq m1, m3, m5 ; equation above, putting the width limit at 8208
  556. punpckldq m2, m5
  557. punpckldq m3, m5
  558. paddq m0, m1
  559. paddq m2, m3
  560. paddq m4, m0
  561. paddq m4, m2
  562. add r0, r1
  563. add r2, r3
  564. dec r5d
  565. jg .loopy
  566. mov r0, r6m
  567. mov r1, r7m
  568. %if mmsize == 32
  569. vextracti128 xm0, m4, 1
  570. paddq xm4, xm0
  571. %endif
  572. movq [r0], xm4
  573. movhps [r1], xm4
  574. RET
  575. %endmacro ; SSD_NV12
  576. %else ; !HIGH_BIT_DEPTH
  577. ;-----------------------------------------------------------------------------
  578. ; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
  579. ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
  580. ;
  581. ; This implementation can potentially overflow on image widths >= 11008 (or
  582. ; 6604 if interlaced), since it is called on blocks of height up to 12 (resp
  583. ; 20). At sane distortion levels it will take much more than that though.
  584. ;-----------------------------------------------------------------------------
  585. %macro SSD_NV12 0
  586. cglobal pixel_ssd_nv12_core, 6,7
  587. add r4d, r4d
  588. add r0, r4
  589. add r2, r4
  590. neg r4
  591. pxor m3, m3
  592. pxor m4, m4
  593. mova m5, [pw_00ff]
  594. .loopy:
  595. mov r6, r4
  596. .loopx:
  597. %if mmsize == 32 ; only 16-byte alignment is guaranteed
  598. movu m2, [r0+r6]
  599. movu m1, [r2+r6]
  600. %else
  601. mova m2, [r0+r6]
  602. mova m1, [r2+r6]
  603. %endif
  604. psubusb m0, m2, m1
  605. psubusb m1, m2
  606. por m0, m1
  607. psrlw m2, m0, 8
  608. pand m0, m5
  609. %if cpuflag(xop)
  610. pmadcswd m4, m2, m2, m4
  611. pmadcswd m3, m0, m0, m3
  612. %else
  613. pmaddwd m2, m2
  614. pmaddwd m0, m0
  615. paddd m4, m2
  616. paddd m3, m0
  617. %endif
  618. add r6, mmsize
  619. jl .loopx
  620. %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
  621. jz .no_overread
  622. pcmpeqb xm1, xm1
  623. pandn m0, m1, m0 ; zero the lower half
  624. pandn m2, m1, m2
  625. psubd m3, m0
  626. psubd m4, m2
  627. .no_overread:
  628. %endif
  629. add r0, r1
  630. add r2, r3
  631. dec r5d
  632. jg .loopy
  633. mov r0, r6m
  634. mov r1, r7m
  635. %if cpuflag(ssse3)
  636. phaddd m3, m4
  637. %else
  638. SBUTTERFLY qdq, 3, 4, 0
  639. paddd m3, m4
  640. %endif
  641. %if mmsize == 32
  642. vextracti128 xm4, m3, 1
  643. paddd xm3, xm4
  644. %endif
  645. psllq xm4, xm3, 32
  646. paddd xm3, xm4
  647. psrlq xm3, 32
  648. movq [r0], xm3
  649. movhps [r1], xm3
  650. RET
  651. %endmacro ; SSD_NV12
  652. %endif ; !HIGH_BIT_DEPTH
  653. INIT_XMM sse2
  654. SSD_NV12
  655. INIT_XMM avx
  656. SSD_NV12
  657. INIT_XMM xop
  658. SSD_NV12
  659. INIT_YMM avx2
  660. SSD_NV12
  661. ;=============================================================================
  662. ; variance
  663. ;=============================================================================
  664. %macro VAR_START 1
  665. pxor m5, m5 ; sum
  666. pxor m6, m6 ; sum squared
  667. %if HIGH_BIT_DEPTH == 0
  668. %if %1
  669. mova m7, [pw_00ff]
  670. %elif mmsize == 16
  671. pxor m7, m7 ; zero
  672. %endif
  673. %endif ; !HIGH_BIT_DEPTH
  674. %endmacro
  675. %macro VAR_END 0
  676. pmaddwd m5, [pw_1]
  677. SBUTTERFLY dq, 5, 6, 0
  678. paddd m5, m6
  679. %if mmsize == 32
  680. vextracti128 xm6, m5, 1
  681. paddd xm5, xm6
  682. %endif
  683. MOVHL xm6, xm5
  684. paddd xm5, xm6
  685. %if ARCH_X86_64
  686. movq rax, xm5
  687. %else
  688. movd eax, xm5
  689. %if cpuflag(avx)
  690. pextrd edx, xm5, 1
  691. %else
  692. pshuflw xm5, xm5, q1032
  693. movd edx, xm5
  694. %endif
  695. %endif
  696. RET
  697. %endmacro
  698. %macro VAR_CORE 0
  699. paddw m5, m0
  700. paddw m5, m3
  701. paddw m5, m1
  702. paddw m5, m4
  703. pmaddwd m0, m0
  704. pmaddwd m3, m3
  705. pmaddwd m1, m1
  706. pmaddwd m4, m4
  707. paddd m6, m0
  708. paddd m6, m3
  709. paddd m6, m1
  710. paddd m6, m4
  711. %endmacro
  712. ;-----------------------------------------------------------------------------
  713. ; int pixel_var_wxh( uint8_t *, intptr_t )
  714. ;-----------------------------------------------------------------------------
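; The return value packs both accumulators into one 64-bit result: sum of
; pixels in the low 32 bits, sum of squares in the high 32 bits (VAR_END
; interleaves the two before the final movq). Rough scalar equivalent,
; assuming that packing convention (sketch only):
;
;     static uint64_t var_WxH( pixel *pix, intptr_t stride )
;     {
;         uint32_t sum = 0, sqr = 0;
;         for( int y = 0; y < H; y++, pix += stride )
;             for( int x = 0; x < W; x++ )
;             {
;                 sum += pix[x];
;                 sqr += pix[x] * pix[x];
;             }
;         return sum + ((uint64_t)sqr << 32);
;     }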
  715. %if HIGH_BIT_DEPTH
  716. %macro VAR 0
  717. cglobal pixel_var_16x16, 2,3,8
  718. FIX_STRIDES r1
  719. VAR_START 0
  720. mov r2d, 8
  721. .loop:
  722. mova m0, [r0]
  723. mova m1, [r0+mmsize]
  724. mova m3, [r0+r1]
  725. mova m4, [r0+r1+mmsize]
  726. lea r0, [r0+r1*2]
  727. VAR_CORE
  728. dec r2d
  729. jg .loop
  730. VAR_END
  731. cglobal pixel_var_8x8, 2,3,8
  732. lea r2, [r1*3]
  733. VAR_START 0
  734. mova m0, [r0]
  735. mova m1, [r0+r1*2]
  736. mova m3, [r0+r1*4]
  737. mova m4, [r0+r2*2]
  738. lea r0, [r0+r1*8]
  739. VAR_CORE
  740. mova m0, [r0]
  741. mova m1, [r0+r1*2]
  742. mova m3, [r0+r1*4]
  743. mova m4, [r0+r2*2]
  744. VAR_CORE
  745. VAR_END
  746. %endmacro ; VAR
  747. INIT_XMM sse2
  748. VAR
  749. INIT_XMM avx
  750. VAR
  751. %else ; HIGH_BIT_DEPTH == 0
  752. %macro VAR 0
  753. cglobal pixel_var_16x16, 2,3,8
  754. VAR_START 1
  755. mov r2d, 8
  756. .loop:
  757. mova m0, [r0]
  758. mova m3, [r0+r1]
  759. DEINTB 1, 0, 4, 3, 7
  760. lea r0, [r0+r1*2]
  761. VAR_CORE
  762. dec r2d
  763. jg .loop
  764. VAR_END
  765. cglobal pixel_var_8x8, 2,4,8
  766. VAR_START 1
  767. mov r2d, 2
  768. lea r3, [r1*3]
  769. .loop:
  770. movh m0, [r0]
  771. movh m3, [r0+r1]
  772. movhps m0, [r0+r1*2]
  773. movhps m3, [r0+r3]
  774. DEINTB 1, 0, 4, 3, 7
  775. lea r0, [r0+r1*4]
  776. VAR_CORE
  777. dec r2d
  778. jg .loop
  779. VAR_END
  780. cglobal pixel_var_8x16, 2,4,8
  781. VAR_START 1
  782. mov r2d, 4
  783. lea r3, [r1*3]
  784. .loop:
  785. movh m0, [r0]
  786. movh m3, [r0+r1]
  787. movhps m0, [r0+r1*2]
  788. movhps m3, [r0+r3]
  789. DEINTB 1, 0, 4, 3, 7
  790. lea r0, [r0+r1*4]
  791. VAR_CORE
  792. dec r2d
  793. jg .loop
  794. VAR_END
  795. %endmacro ; VAR
  796. INIT_XMM sse2
  797. VAR
  798. INIT_XMM avx
  799. VAR
  800. %endif ; !HIGH_BIT_DEPTH
  801. INIT_YMM avx2
  802. cglobal pixel_var_16x16, 2,4,7
  803. FIX_STRIDES r1
  804. VAR_START 0
  805. mov r2d, 4
  806. lea r3, [r1*3]
  807. .loop:
  808. %if HIGH_BIT_DEPTH
  809. mova m0, [r0]
  810. mova m3, [r0+r1]
  811. mova m1, [r0+r1*2]
  812. mova m4, [r0+r3]
  813. %else
  814. pmovzxbw m0, [r0]
  815. pmovzxbw m3, [r0+r1]
  816. pmovzxbw m1, [r0+r1*2]
  817. pmovzxbw m4, [r0+r3]
  818. %endif
  819. lea r0, [r0+r1*4]
  820. VAR_CORE
  821. dec r2d
  822. jg .loop
  823. VAR_END
  824. %macro VAR_AVX512_CORE 1 ; accum
  825. %if %1
  826. paddw m0, m2
  827. pmaddwd m2, m2
  828. paddw m0, m3
  829. pmaddwd m3, m3
  830. paddd m1, m2
  831. paddd m1, m3
  832. %else
  833. paddw m0, m2, m3
  834. pmaddwd m2, m2
  835. pmaddwd m3, m3
  836. paddd m1, m2, m3
  837. %endif
  838. %endmacro
  839. %macro VAR_AVX512_CORE_16x16 1 ; accum
  840. %if HIGH_BIT_DEPTH
  841. mova ym2, [r0]
  842. vinserti64x4 m2, [r0+r1], 1
  843. mova ym3, [r0+2*r1]
  844. vinserti64x4 m3, [r0+r3], 1
  845. %else
  846. vbroadcasti64x2 ym2, [r0]
  847. vbroadcasti64x2 m2 {k1}, [r0+r1]
  848. vbroadcasti64x2 ym3, [r0+2*r1]
  849. vbroadcasti64x2 m3 {k1}, [r0+r3]
  850. pshufb m2, m4
  851. pshufb m3, m4
  852. %endif
  853. VAR_AVX512_CORE %1
  854. %endmacro
  855. %macro VAR_AVX512_CORE_8x8 1 ; accum
  856. %if HIGH_BIT_DEPTH
  857. mova xm2, [r0]
  858. mova xm3, [r0+r1]
  859. %else
  860. movq xm2, [r0]
  861. movq xm3, [r0+r1]
  862. %endif
  863. vinserti128 ym2, [r0+2*r1], 1
  864. vinserti128 ym3, [r0+r2], 1
  865. lea r0, [r0+4*r1]
  866. vinserti32x4 m2, [r0], 2
  867. vinserti32x4 m3, [r0+r1], 2
  868. vinserti32x4 m2, [r0+2*r1], 3
  869. vinserti32x4 m3, [r0+r2], 3
  870. %if HIGH_BIT_DEPTH == 0
  871. punpcklbw m2, m4
  872. punpcklbw m3, m4
  873. %endif
  874. VAR_AVX512_CORE %1
  875. %endmacro
  876. INIT_ZMM avx512
  877. cglobal pixel_var_16x16, 2,4
  878. FIX_STRIDES r1
  879. mov r2d, 0xf0
  880. lea r3, [3*r1]
  881. %if HIGH_BIT_DEPTH == 0
  882. vbroadcasti64x4 m4, [var_shuf_avx512]
  883. kmovb k1, r2d
  884. %endif
  885. VAR_AVX512_CORE_16x16 0
  886. .loop:
  887. lea r0, [r0+4*r1]
  888. VAR_AVX512_CORE_16x16 1
  889. sub r2d, 0x50
  890. jg .loop
  891. %if ARCH_X86_64 == 0
  892. pop r3d
  893. %assign regs_used 3
  894. %endif
  895. var_avx512_end:
  896. vbroadcasti32x4 m2, [pw_1]
  897. pmaddwd m0, m2
  898. SBUTTERFLY dq, 0, 1, 2
  899. paddd m0, m1
  900. vextracti32x8 ym1, m0, 1
  901. paddd ym0, ym1
  902. vextracti128 xm1, ym0, 1
  903. paddd xmm0, xm0, xm1
  904. punpckhqdq xmm1, xmm0, xmm0
  905. paddd xmm0, xmm1
  906. %if ARCH_X86_64
  907. movq rax, xmm0
  908. %else
  909. movd eax, xmm0
  910. pextrd edx, xmm0, 1
  911. %endif
  912. RET
  913. %if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
  914. cglobal pixel_var_8x8, 2,3
  915. lea r2, [3*r1]
  916. pxor xm4, xm4
  917. VAR_AVX512_CORE_8x8 0
  918. jmp var_avx512_end
  919. %endif
  920. cglobal pixel_var_8x16, 2,3
  921. FIX_STRIDES r1
  922. lea r2, [3*r1]
  923. %if HIGH_BIT_DEPTH == 0
  924. pxor xm4, xm4
  925. %endif
  926. VAR_AVX512_CORE_8x8 0
  927. lea r0, [r0+4*r1]
  928. VAR_AVX512_CORE_8x8 1
  929. jmp var_avx512_end
  930. ;-----------------------------------------------------------------------------
  931. ; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] )
  932. ;-----------------------------------------------------------------------------
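; var2 works on the two 8-pixel-wide chroma planes stored side by side in
; x264's fixed-stride fenc/fdec buffers, stores the per-plane SSD of
; fenc-fdec into ssd[2], and returns the sum over both planes of
; sqr - (sum*sum >> shift), with shift = 6 for 8x8 and 7 for 8x16 (see the
; VAR2_END arguments below). Per-plane scalar sketch, assuming that layout
; (illustrative only):
;
;     int sum = 0, sqr = 0;
;     for( int y = 0; y < h; y++ )
;         for( int x = 0; x < 8; x++ )
;         {
;             int d = fenc[x + y*FENC_STRIDE] - fdec[x + y*FDEC_STRIDE];
;             sum += d;
;             sqr += d * d;
;         }
;     ssd[plane] = sqr;
;     /* plane's contribution to the return value: sqr - (sum*sum >> shift) */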
  933. %if ARCH_X86_64
  934. DECLARE_REG_TMP 6
  935. %else
  936. DECLARE_REG_TMP 2
  937. %endif
  938. %macro VAR2_END 3 ; src, tmp, shift
  939. movifnidn r2, r2mp
  940. pshufd %2, %1, q3331
  941. pmuludq %1, %1
  942. movq [r2], %2 ; sqr_u sqr_v
  943. psrld %1, %3
  944. psubd %2, %1 ; sqr - (sum * sum >> shift)
  945. MOVHL %1, %2
  946. paddd %1, %2
  947. movd eax, %1
  948. RET
  949. %endmacro
  950. %macro VAR2_8x8_SSE2 2
  951. %if HIGH_BIT_DEPTH
  952. cglobal pixel_var2_8x%1, 2,3,6
  953. pxor m4, m4
  954. pxor m5, m5
  955. %define %%sum2 m4
  956. %define %%sqr2 m5
  957. %else
  958. cglobal pixel_var2_8x%1, 2,3,7
  959. mova m6, [pw_00ff]
  960. %define %%sum2 m0
  961. %define %%sqr2 m1
  962. %endif
  963. pxor m0, m0 ; sum
  964. pxor m1, m1 ; sqr
  965. mov t0d, (%1-1)*FENC_STRIDEB
  966. .loop:
  967. %if HIGH_BIT_DEPTH
  968. mova m2, [r0+1*t0]
  969. psubw m2, [r1+2*t0]
  970. mova m3, [r0+1*t0+16]
  971. psubw m3, [r1+2*t0+32]
  972. %else
  973. mova m3, [r0+1*t0]
  974. movq m5, [r1+2*t0]
  975. punpcklqdq m5, [r1+2*t0+16]
  976. DEINTB 2, 3, 4, 5, 6
  977. psubw m2, m4
  978. psubw m3, m5
  979. %endif
  980. paddw m0, m2
  981. pmaddwd m2, m2
  982. paddw %%sum2, m3
  983. pmaddwd m3, m3
  984. paddd m1, m2
  985. paddd %%sqr2, m3
  986. sub t0d, FENC_STRIDEB
  987. jge .loop
  988. %if HIGH_BIT_DEPTH
  989. SBUTTERFLY dq, 0, 4, 2
  990. paddw m0, m4 ; sum_u sum_v
  991. pmaddwd m0, [pw_1]
  992. SBUTTERFLY dq, 1, 5, 2
  993. paddd m1, m5 ; sqr_u sqr_v
  994. SBUTTERFLY dq, 0, 1, 2
  995. paddd m0, m1
  996. %else
  997. pmaddwd m0, [pw_1]
  998. shufps m2, m0, m1, q2020
  999. shufps m0, m1, q3131
  1000. paddd m0, m2
  1001. pshufd m0, m0, q3120 ; sum_u sqr_u sum_v sqr_v
  1002. %endif
  1003. VAR2_END m0, m1, %2
  1004. %endmacro
  1005. INIT_XMM sse2
  1006. VAR2_8x8_SSE2 8, 6
  1007. VAR2_8x8_SSE2 16, 7
  1008. %macro VAR2_CORE 3 ; src1, src2, accum
  1009. %if %3
  1010. paddw m0, %1
  1011. pmaddwd %1, %1
  1012. paddw m0, %2
  1013. pmaddwd %2, %2
  1014. paddd m1, %1
  1015. paddd m1, %2
  1016. %else
  1017. paddw m0, %1, %2
  1018. pmaddwd %1, %1
  1019. pmaddwd %2, %2
  1020. paddd m1, %1, %2
  1021. %endif
  1022. %endmacro
  1023. %if HIGH_BIT_DEPTH == 0
  1024. INIT_XMM ssse3
  1025. cglobal pixel_var2_internal
  1026. pxor m0, m0 ; sum
  1027. pxor m1, m1 ; sqr
  1028. .loop:
  1029. movq m2, [r0+1*t0]
  1030. punpcklbw m2, [r1+2*t0]
  1031. movq m3, [r0+1*t0-1*FENC_STRIDE]
  1032. punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE]
  1033. movq m4, [r0+1*t0-2*FENC_STRIDE]
  1034. punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE]
  1035. movq m5, [r0+1*t0-3*FENC_STRIDE]
  1036. punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE]
  1037. pmaddubsw m2, m7
  1038. pmaddubsw m3, m7
  1039. pmaddubsw m4, m7
  1040. pmaddubsw m5, m7
  1041. VAR2_CORE m2, m3, 1
  1042. VAR2_CORE m4, m5, 1
  1043. sub t0d, 4*FENC_STRIDE
  1044. jg .loop
  1045. pmaddwd m0, [pw_1]
  1046. ret
  1047. %macro VAR2_8x8_SSSE3 2
  1048. cglobal pixel_var2_8x%1, 2,3,8
  1049. mova m7, [hsub_mul]
  1050. mov t0d, (%1-1)*FENC_STRIDE
  1051. call pixel_var2_internal_ssse3 ; u
  1052. add r0, 8
  1053. add r1, 16
  1054. SBUTTERFLY qdq, 0, 1, 6
  1055. paddd m1, m0
  1056. mov t0d, (%1-1)*FENC_STRIDE
  1057. call pixel_var2_internal_ssse3 ; v
  1058. SBUTTERFLY qdq, 0, 6, 2
  1059. paddd m0, m6
  1060. phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v
  1061. VAR2_END m1, m0, %2
  1062. %endmacro
  1063. VAR2_8x8_SSSE3 8, 6
  1064. VAR2_8x8_SSSE3 16, 7
  1065. %endif ; !HIGH_BIT_DEPTH
  1066. %macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset
  1067. %if HIGH_BIT_DEPTH
  1068. %if mmsize == 64
  1069. mova m2, [r1+2*%1+%2*FDEC_STRIDEB]
  1070. vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020
  1071. mova m3, [r1+2*%1+%3*FDEC_STRIDEB]
  1072. vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020
  1073. %else
  1074. mova xm2, [r1+2*%1+%2*FDEC_STRIDEB]
  1075. vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1
  1076. mova xm3, [r1+2*%1+%3*FDEC_STRIDEB]
  1077. vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1
  1078. %endif
  1079. psubw m2, [r0+1*%1+%2*FENC_STRIDEB]
  1080. psubw m3, [r0+1*%1+%3*FENC_STRIDEB]
  1081. %else
  1082. pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE]
  1083. mova m4, [r1+2*%1+%2*FDEC_STRIDE]
  1084. pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE]
  1085. mova m5, [r1+2*%1+%3*FDEC_STRIDE]
  1086. punpcklbw m4, m6
  1087. punpcklbw m5, m6
  1088. psubw m2, m4
  1089. psubw m3, m5
  1090. %endif
  1091. %endmacro
  1092. %macro VAR2_8x8_AVX2 2
  1093. %if HIGH_BIT_DEPTH
  1094. cglobal pixel_var2_8x%1, 2,3,4
  1095. %else
  1096. cglobal pixel_var2_8x%1, 2,3,7
  1097. pxor m6, m6
  1098. %endif
  1099. mov t0d, (%1-3)*FENC_STRIDEB
  1100. VAR2_AVX2_LOAD t0, 2, 1
  1101. VAR2_CORE m2, m3, 0
  1102. .loop:
  1103. VAR2_AVX2_LOAD t0, 0, -1
  1104. VAR2_CORE m2, m3, 1
  1105. sub t0d, 2*FENC_STRIDEB
  1106. jg .loop
  1107. pmaddwd m0, [pw_1]
  1108. SBUTTERFLY qdq, 0, 1, 2
  1109. paddd m0, m1
  1110. vextracti128 xm1, m0, 1
  1111. phaddd xm0, xm1
  1112. VAR2_END xm0, xm1, %2
  1113. %endmacro
  1114. INIT_YMM avx2
  1115. VAR2_8x8_AVX2 8, 6
  1116. VAR2_8x8_AVX2 16, 7
  1117. %macro VAR2_AVX512_END 1 ; shift
  1118. vbroadcasti32x4 m2, [pw_1]
  1119. pmaddwd m0, m2
  1120. SBUTTERFLY qdq, 0, 1, 2
  1121. paddd m0, m1
  1122. vextracti32x8 ym1, m0, 1
  1123. paddd ym0, ym1
  1124. psrlq ym1, ym0, 32
  1125. paddd ym0, ym1
  1126. vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v
  1127. VAR2_END xmm0, xmm1, %1
  1128. %endmacro
  1129. INIT_ZMM avx512
  1130. cglobal pixel_var2_8x8, 2,3
  1131. %if HIGH_BIT_DEPTH == 0
  1132. pxor xm6, xm6
  1133. %endif
  1134. VAR2_AVX2_LOAD 0, 0, 2
  1135. VAR2_CORE m2, m3, 0
  1136. VAR2_AVX2_LOAD 0, 4, 6
  1137. VAR2_CORE m2, m3, 1
  1138. VAR2_AVX512_END 6
  1139. cglobal pixel_var2_8x16, 2,3
  1140. %if HIGH_BIT_DEPTH == 0
  1141. pxor xm6, xm6
  1142. %endif
  1143. mov t0d, 10*FENC_STRIDEB
  1144. VAR2_AVX2_LOAD 0, 14, 12
  1145. VAR2_CORE m2, m3, 0
  1146. .loop:
  1147. VAR2_AVX2_LOAD t0, 0, -2
  1148. VAR2_CORE m2, m3, 1
  1149. sub t0d, 4*FENC_STRIDEB
  1150. jg .loop
  1151. VAR2_AVX512_END 7
  1152. ;=============================================================================
  1153. ; SATD
  1154. ;=============================================================================
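; SATD here is the sum of absolute values of the 4x4 Hadamard transform of
; the residual, halved (x264's usual convention). A naive scalar sketch of a
; single 4x4 block, for orientation only -- hadamard4_rows/hadamard4_cols are
; hypothetical helpers, and the real kernels below operate on packed words
; and larger blocks:
;
;     int d[4][4], satd = 0;
;     for( int i = 0; i < 4; i++ )
;         for( int j = 0; j < 4; j++ )
;             d[i][j] = pix1[j + i*stride1] - pix2[j + i*stride2];
;     hadamard4_rows( d );   /* 1-D 4-point Hadamard on each row    */
;     hadamard4_cols( d );   /* ...then on each column              */
;     for( int i = 0; i < 4; i++ )
;         for( int j = 0; j < 4; j++ )
;             satd += abs( d[i][j] );
;     return satd >> 1;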
  1155. %macro JDUP 2
  1156. %if cpuflag(sse4)
  1157. ; just use shufps on anything post conroe
  1158. shufps %1, %2, 0
  1159. %elif cpuflag(ssse3) && notcpuflag(atom)
  1160. ; join 2x 32 bit and duplicate them
  1161. ; emulating shufps is faster on conroe
  1162. punpcklqdq %1, %2
  1163. movsldup %1, %1
  1164. %else
  1165. ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
  1166. punpckldq %1, %2
  1167. %endif
  1168. %endmacro
  1169. %macro HSUMSUB 5
  1170. pmaddubsw m%2, m%5
  1171. pmaddubsw m%1, m%5
  1172. pmaddubsw m%4, m%5
  1173. pmaddubsw m%3, m%5
  1174. %endmacro
  1175. %macro DIFF_UNPACK_SSE2 5
  1176. punpcklbw m%1, m%5
  1177. punpcklbw m%2, m%5
  1178. punpcklbw m%3, m%5
  1179. punpcklbw m%4, m%5
  1180. psubw m%1, m%2
  1181. psubw m%3, m%4
  1182. %endmacro
  1183. %macro DIFF_SUMSUB_SSSE3 5
  1184. HSUMSUB %1, %2, %3, %4, %5
  1185. psubw m%1, m%2
  1186. psubw m%3, m%4
  1187. %endmacro
  1188. %macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
  1189. movd %1, %3
  1190. movd %2, %4
  1191. JDUP %1, %2
  1192. %endmacro
  1193. %macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
  1194. movddup m%3, %6
  1195. movddup m%4, %8
  1196. movddup m%1, %5
  1197. movddup m%2, %7
  1198. %endmacro
  1199. %macro LOAD_DUP_4x8P_PENRYN 8
  1200. ; penryn and nehalem run punpcklqdq and movddup in different units
  1201. movh m%3, %6
  1202. movh m%4, %8
  1203. punpcklqdq m%3, m%3
  1204. movddup m%1, %5
  1205. punpcklqdq m%4, m%4
  1206. movddup m%2, %7
  1207. %endmacro
  1208. %macro LOAD_SUMSUB_8x2P 9
  1209. LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
  1210. DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
  1211. %endmacro
  1212. %macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
  1213. ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
  1214. LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
  1215. LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
  1216. %if %10
  1217. lea %8, [%8+4*r1]
  1218. lea %9, [%9+4*r3]
  1219. %endif
  1220. %endmacro
  1221. %macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
  1222. movddup m%1, [%7]
  1223. movddup m%2, [%7+8]
  1224. mova m%4, [%6]
  1225. movddup m%3, m%4
  1226. punpckhqdq m%4, m%4
  1227. DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
  1228. %endmacro
  1229. %macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
  1230. movu m%4, [%7]
  1231. mova m%2, [%6]
  1232. DEINTB %1, %2, %3, %4, %5
  1233. psubw m%1, m%3
  1234. psubw m%2, m%4
  1235. SUMSUB_BA w, %1, %2, %3
  1236. %endmacro
  1237. %macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
  1238. ; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
  1239. LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
  1240. LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
  1241. LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
  1242. LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
  1243. %endmacro
  1244. %macro LOAD_SUMSUB_16x2P_AVX2 9
  1245. ; 2*dst, 2*tmp, mul, 4*ptr
  1246. vbroadcasti128 m%1, [%6]
  1247. vbroadcasti128 m%3, [%7]
  1248. vbroadcasti128 m%2, [%8]
  1249. vbroadcasti128 m%4, [%9]
  1250. DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
  1251. %endmacro
  1252. %macro LOAD_SUMSUB_16x4P_AVX2 7-11 r0, r2, 0, 0
  1253. ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
  1254. LOAD_SUMSUB_16x2P_AVX2 %1, %2, %5, %6, %7, %8, %9, %8+r1, %9+r3
  1255. LOAD_SUMSUB_16x2P_AVX2 %3, %4, %5, %6, %7, %8+2*r1, %9+2*r3, %8+r4, %9+r5
  1256. %if %10
  1257. lea %8, [%8+4*r1]
  1258. lea %9, [%9+4*r3]
  1259. %endif
  1260. %endmacro
  1261. %macro LOAD_DUP_4x16P_AVX2 8 ; 4*dst, 4*pointer
  1262. mova xm%3, %6
  1263. mova xm%4, %8
  1264. mova xm%1, %5
  1265. mova xm%2, %7
  1266. vpermq m%3, m%3, q0011
  1267. vpermq m%4, m%4, q0011
  1268. vpermq m%1, m%1, q0011
  1269. vpermq m%2, m%2, q0011
  1270. %endmacro
  1271. %macro LOAD_SUMSUB8_16x2P_AVX2 9
  1272. ; 2*dst, 2*tmp, mul, 4*ptr
  1273. LOAD_DUP_4x16P_AVX2 %1, %2, %3, %4, %6, %7, %8, %9
  1274. DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
  1275. %endmacro
  1276. %macro LOAD_SUMSUB8_16x4P_AVX2 7-11 r0, r2, 0, 0
  1277. ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
  1278. LOAD_SUMSUB8_16x2P_AVX2 %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
  1279. LOAD_SUMSUB8_16x2P_AVX2 %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
  1280. %if %10
  1281. lea %8, [%8+4*r1]
  1282. lea %9, [%9+4*r3]
  1283. %endif
  1284. %endmacro
  1285. ; in: r4=3*stride1, r5=3*stride2
  1286. ; in: %2 = horizontal offset
  1287. ; in: %3 = whether we need to increment pix1 and pix2
  1288. ; clobber: m3..m7
  1289. ; out: %1 = satd
  1290. %macro SATD_4x4_MMX 3
  1291. %xdefine %%n nn%1
  1292. %assign offset %2*SIZEOF_PIXEL
  1293. LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset]
  1294. LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset]
  1295. LOAD_DIFF m6, m3, none, [r0+2*r1+offset], [r2+2*r3+offset]
  1296. LOAD_DIFF m7, m3, none, [r0+ r4+offset], [r2+ r5+offset]
  1297. %if %3
  1298. lea r0, [r0+4*r1]
  1299. lea r2, [r2+4*r3]
  1300. %endif
  1301. HADAMARD4_2D 4, 5, 6, 7, 3, %%n
  1302. paddw m4, m6
  1303. SWAP %%n, 4
  1304. %endmacro
  1305. ; in: %1 = horizontal if 0, vertical if 1
  1306. %macro SATD_8x4_SSE 8-9
  1307. %if %1
  1308. HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
  1309. %else
  1310. HADAMARD4_V %2, %3, %4, %5, %6
  1311. ; doing the abs first is a slight advantage
  1312. ABSW2 m%2, m%4, m%2, m%4, m%6, m%7
  1313. ABSW2 m%3, m%5, m%3, m%5, m%6, m%7
  1314. HADAMARD 1, max, %2, %4, %6, %7
  1315. %endif
  1316. %ifnidn %9, swap
  1317. paddw m%8, m%2
  1318. %else
  1319. SWAP %8, %2
  1320. %endif
  1321. %if %1
  1322. paddw m%8, m%4
  1323. %else
  1324. HADAMARD 1, max, %3, %5, %6, %7
  1325. paddw m%8, m%3
  1326. %endif
  1327. %endmacro
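; Rough guide to the two SATD_8x4_SSE modes: with %1==1 (the "vertical", plain
; sse2 arrangement) the whole transform is done here via HADAMARD4_2D_SSE.
; With %1==0 the loads already folded the first horizontal sum/diff stage into
; the data using pmaddubsw against the hmul_* +/-1 constants, so only
; HADAMARD4_V plus the abs/max steps that finish the horizontal half remain.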
  1328. %macro SATD_START_MMX 0
  1329. FIX_STRIDES r1, r3
  1330. lea r4, [3*r1] ; 3*stride1
  1331. lea r5, [3*r3] ; 3*stride2
  1332. %endmacro
  1333. %macro SATD_END_MMX 0
  1334. %if HIGH_BIT_DEPTH
  1335. HADDUW m0, m1
  1336. movd eax, m0
  1337. %else ; !HIGH_BIT_DEPTH
  1338. pshufw m1, m0, q1032
  1339. paddw m0, m1
  1340. pshufw m1, m0, q2301
  1341. paddw m0, m1
  1342. movd eax, m0
  1343. and eax, 0xffff
  1344. %endif ; HIGH_BIT_DEPTH
  1345. RET
  1346. %endmacro
; FIXME avoid the spilling of regs to hold 3*stride.
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
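; Informal overview of the mmx2 satd family below: every block size is built
; by summing the 16x4 / 8x8 / 8x4 internal helpers, and the partial sums live
; in 16-bit lanes. That is why the HIGH_BIT_DEPTH wrappers (SATD_MxN_MMX)
; flush to 32 bits with HADDUW after each internal call -- high-bit-depth
; inputs would overflow a 16-bit accumulator over a whole 16x16 block.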
  1352. INIT_MMX mmx2
  1353. cglobal pixel_satd_16x4_internal
  1354. SATD_4x4_MMX m2, 0, 0
  1355. SATD_4x4_MMX m1, 4, 0
  1356. paddw m0, m2
  1357. SATD_4x4_MMX m2, 8, 0
  1358. paddw m0, m1
  1359. SATD_4x4_MMX m1, 12, 0
  1360. paddw m0, m2
  1361. paddw m0, m1
  1362. ret
  1363. cglobal pixel_satd_8x8_internal
  1364. SATD_4x4_MMX m2, 0, 0
  1365. SATD_4x4_MMX m1, 4, 1
  1366. paddw m0, m2
  1367. paddw m0, m1
  1368. pixel_satd_8x4_internal_mmx2:
  1369. SATD_4x4_MMX m2, 0, 0
  1370. SATD_4x4_MMX m1, 4, 0
  1371. paddw m0, m2
  1372. paddw m0, m1
  1373. ret
  1374. %if HIGH_BIT_DEPTH
  1375. %macro SATD_MxN_MMX 3
  1376. cglobal pixel_satd_%1x%2, 4,7
  1377. SATD_START_MMX
  1378. pxor m0, m0
  1379. call pixel_satd_%1x%3_internal_mmx2
  1380. HADDUW m0, m1
  1381. movd r6d, m0
  1382. %rep %2/%3-1
  1383. pxor m0, m0
  1384. lea r0, [r0+4*r1]
  1385. lea r2, [r2+4*r3]
  1386. call pixel_satd_%1x%3_internal_mmx2
  1387. movd m2, r4
  1388. HADDUW m0, m1
  1389. movd r4, m0
  1390. add r6, r4
  1391. movd r4, m2
  1392. %endrep
  1393. movifnidn eax, r6d
  1394. RET
  1395. %endmacro
  1396. SATD_MxN_MMX 16, 16, 4
  1397. SATD_MxN_MMX 16, 8, 4
  1398. SATD_MxN_MMX 8, 16, 8
  1399. %endif ; HIGH_BIT_DEPTH
  1400. %if HIGH_BIT_DEPTH == 0
  1401. cglobal pixel_satd_16x16, 4,6
  1402. SATD_START_MMX
  1403. pxor m0, m0
  1404. %rep 3
  1405. call pixel_satd_16x4_internal_mmx2
  1406. lea r0, [r0+4*r1]
  1407. lea r2, [r2+4*r3]
  1408. %endrep
  1409. call pixel_satd_16x4_internal_mmx2
  1410. HADDUW m0, m1
  1411. movd eax, m0
  1412. RET
  1413. cglobal pixel_satd_16x8, 4,6
  1414. SATD_START_MMX
  1415. pxor m0, m0
  1416. call pixel_satd_16x4_internal_mmx2
  1417. lea r0, [r0+4*r1]
  1418. lea r2, [r2+4*r3]
  1419. call pixel_satd_16x4_internal_mmx2
  1420. SATD_END_MMX
  1421. cglobal pixel_satd_8x16, 4,6
  1422. SATD_START_MMX
  1423. pxor m0, m0
  1424. call pixel_satd_8x8_internal_mmx2
  1425. lea r0, [r0+4*r1]
  1426. lea r2, [r2+4*r3]
  1427. call pixel_satd_8x8_internal_mmx2
  1428. SATD_END_MMX
  1429. %endif ; !HIGH_BIT_DEPTH
  1430. cglobal pixel_satd_8x8, 4,6
  1431. SATD_START_MMX
  1432. pxor m0, m0
  1433. call pixel_satd_8x8_internal_mmx2
  1434. SATD_END_MMX
  1435. cglobal pixel_satd_8x4, 4,6
  1436. SATD_START_MMX
  1437. pxor m0, m0
  1438. call pixel_satd_8x4_internal_mmx2
  1439. SATD_END_MMX
  1440. cglobal pixel_satd_4x16, 4,6
  1441. SATD_START_MMX
  1442. SATD_4x4_MMX m0, 0, 1
  1443. SATD_4x4_MMX m1, 0, 1
  1444. paddw m0, m1
  1445. SATD_4x4_MMX m1, 0, 1
  1446. paddw m0, m1
  1447. SATD_4x4_MMX m1, 0, 0
  1448. paddw m0, m1
  1449. SATD_END_MMX
  1450. cglobal pixel_satd_4x8, 4,6
  1451. SATD_START_MMX
  1452. SATD_4x4_MMX m0, 0, 1
  1453. SATD_4x4_MMX m1, 0, 0
  1454. paddw m0, m1
  1455. SATD_END_MMX
  1456. cglobal pixel_satd_4x4, 4,6
  1457. SATD_START_MMX
  1458. SATD_4x4_MMX m0, 0, 0
  1459. SATD_END_MMX
  1460. %macro SATD_START_SSE2 2-3 0
  1461. FIX_STRIDES r1, r3
  1462. %if HIGH_BIT_DEPTH && %3
  1463. pxor %2, %2
  1464. %elif cpuflag(ssse3) && notcpuflag(atom)
  1465. %if mmsize==32
  1466. mova %2, [hmul_16p]
  1467. %else
  1468. mova %2, [hmul_8p]
  1469. %endif
  1470. %endif
  1471. lea r4, [3*r1]
  1472. lea r5, [3*r3]
  1473. pxor %1, %1
  1474. %endmacro
  1475. %macro SATD_END_SSE2 1-2
  1476. %if HIGH_BIT_DEPTH
  1477. HADDUW %1, xm0
  1478. %if %0 == 2
  1479. paddd %1, %2
  1480. %endif
  1481. %else
  1482. HADDW %1, xm7
  1483. %endif
  1484. movd eax, %1
  1485. RET
  1486. %endmacro
  1487. %macro SATD_ACCUM 3
  1488. %if HIGH_BIT_DEPTH
  1489. HADDUW %1, %2
  1490. paddd %3, %1
  1491. pxor %1, %1
  1492. %endif
  1493. %endmacro
  1494. %macro BACKUP_POINTERS 0
  1495. %if ARCH_X86_64
  1496. %if WIN64
  1497. PUSH r7
  1498. %endif
  1499. mov r6, r0
  1500. mov r7, r2
  1501. %endif
  1502. %endmacro
  1503. %macro RESTORE_AND_INC_POINTERS 0
  1504. %if ARCH_X86_64
  1505. lea r0, [r6+8*SIZEOF_PIXEL]
  1506. lea r2, [r7+8*SIZEOF_PIXEL]
  1507. %if WIN64
  1508. POP r7
  1509. %endif
  1510. %else
  1511. mov r0, r0mp
  1512. mov r2, r2mp
  1513. add r0, 8*SIZEOF_PIXEL
  1514. add r2, 8*SIZEOF_PIXEL
  1515. %endif
  1516. %endmacro
  1517. %macro SATD_4x8_SSE 3
  1518. %if HIGH_BIT_DEPTH
  1519. movh m0, [r0+0*r1]
  1520. movh m4, [r2+0*r3]
  1521. movh m1, [r0+1*r1]
  1522. movh m5, [r2+1*r3]
  1523. movhps m0, [r0+4*r1]
  1524. movhps m4, [r2+4*r3]
  1525. movh m2, [r0+2*r1]
  1526. movh m6, [r2+2*r3]
  1527. psubw m0, m4
  1528. movh m3, [r0+r4]
  1529. movh m4, [r2+r5]
  1530. lea r0, [r0+4*r1]
  1531. lea r2, [r2+4*r3]
  1532. movhps m1, [r0+1*r1]
  1533. movhps m5, [r2+1*r3]
  1534. movhps m2, [r0+2*r1]
  1535. movhps m6, [r2+2*r3]
  1536. psubw m1, m5
  1537. movhps m3, [r0+r4]
  1538. movhps m4, [r2+r5]
  1539. psubw m2, m6
  1540. psubw m3, m4
  1541. %else ; !HIGH_BIT_DEPTH
  1542. movd m4, [r2]
  1543. movd m5, [r2+r3]
  1544. movd m6, [r2+2*r3]
  1545. add r2, r5
  1546. movd m0, [r0]
  1547. movd m1, [r0+r1]
  1548. movd m2, [r0+2*r1]
  1549. add r0, r4
  1550. movd m3, [r2+r3]
  1551. JDUP m4, m3
  1552. movd m3, [r0+r1]
  1553. JDUP m0, m3
  1554. movd m3, [r2+2*r3]
  1555. JDUP m5, m3
  1556. movd m3, [r0+2*r1]
  1557. JDUP m1, m3
  1558. %if %1==0 && %2==1
  1559. mova m3, [hmul_4p]
  1560. DIFFOP 0, 4, 1, 5, 3
  1561. %else
  1562. DIFFOP 0, 4, 1, 5, 7
  1563. %endif
  1564. movd m5, [r2]
  1565. add r2, r5
  1566. movd m3, [r0]
  1567. add r0, r4
  1568. movd m4, [r2]
  1569. JDUP m6, m4
  1570. movd m4, [r0]
  1571. JDUP m2, m4
  1572. movd m4, [r2+r3]
  1573. JDUP m5, m4
  1574. movd m4, [r0+r1]
  1575. JDUP m3, m4
  1576. %if %1==0 && %2==1
  1577. mova m4, [hmul_4p]
  1578. DIFFOP 2, 6, 3, 5, 4
  1579. %else
  1580. DIFFOP 2, 6, 3, 5, 7
  1581. %endif
  1582. %endif ; HIGH_BIT_DEPTH
  1583. SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
  1584. %endmacro
  1585. ;-----------------------------------------------------------------------------
  1586. ; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
  1587. ;-----------------------------------------------------------------------------
  1588. %macro SATDS_SSE2 0
  1589. %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
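; "vertical" == 1 selects the plain sse2 arrangement (full transform done in
; SATD_8x4_SSE). It is 0 only on ssse3+ (but not atom) at 8-bit depth, where
; the loads use pmaddubsw with the hmul_* +/-1 constants so part of the
; horizontal transform comes for free.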
  1590. %if cpuflag(ssse3) && (vertical==0 || HIGH_BIT_DEPTH)
  1591. cglobal pixel_satd_4x4, 4, 6, 6
  1592. SATD_START_MMX
  1593. mova m4, [hmul_4p]
  1594. LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
  1595. LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
  1596. LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
  1597. LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
  1598. DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
  1599. HADAMARD 0, sumsub, 0, 1, 2, 3
  1600. HADAMARD 4, sumsub, 0, 1, 2, 3
  1601. HADAMARD 1, amax, 0, 1, 2, 3
  1602. HADDW m0, m1
  1603. movd eax, m0
  1604. RET
  1605. %endif
  1606. cglobal pixel_satd_4x8, 4, 6, 8
  1607. SATD_START_MMX
  1608. %if vertical==0
  1609. mova m7, [hmul_4p]
  1610. %endif
  1611. SATD_4x8_SSE vertical, 0, swap
  1612. HADDW m7, m1
  1613. movd eax, m7
  1614. RET
  1615. cglobal pixel_satd_4x16, 4, 6, 8
  1616. SATD_START_MMX
  1617. %if vertical==0
  1618. mova m7, [hmul_4p]
  1619. %endif
  1620. SATD_4x8_SSE vertical, 0, swap
  1621. lea r0, [r0+r1*2*SIZEOF_PIXEL]
  1622. lea r2, [r2+r3*2*SIZEOF_PIXEL]
  1623. SATD_4x8_SSE vertical, 1, add
  1624. HADDW m7, m1
  1625. movd eax, m7
  1626. RET
  1627. cglobal pixel_satd_8x8_internal
  1628. LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
  1629. SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
  1630. %%pixel_satd_8x4_internal:
  1631. LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
  1632. SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
  1633. ret
  1634. ; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
  1635. ; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
  1636. %if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
  1637. cglobal pixel_satd_16x4_internal
  1638. LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
  1639. lea r2, [r2+4*r3]
  1640. lea r0, [r0+4*r1]
  1641. ; always use horizontal mode here
  1642. SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
  1643. SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
  1644. ret
  1645. cglobal pixel_satd_16x8, 4,6,12
  1646. SATD_START_SSE2 m10, m7
  1647. %if vertical
  1648. mova m7, [pw_00ff]
  1649. %endif
  1650. jmp %%pixel_satd_16x8_internal
  1651. cglobal pixel_satd_16x16, 4,6,12
  1652. SATD_START_SSE2 m10, m7
  1653. %if vertical
  1654. mova m7, [pw_00ff]
  1655. %endif
  1656. call pixel_satd_16x4_internal
  1657. call pixel_satd_16x4_internal
  1658. %%pixel_satd_16x8_internal:
  1659. call pixel_satd_16x4_internal
  1660. call pixel_satd_16x4_internal
  1661. SATD_END_SSE2 m10
  1662. %else
  1663. cglobal pixel_satd_16x8, 4,6,8
  1664. SATD_START_SSE2 m6, m7
  1665. BACKUP_POINTERS
  1666. call pixel_satd_8x8_internal
  1667. RESTORE_AND_INC_POINTERS
  1668. call pixel_satd_8x8_internal
  1669. SATD_END_SSE2 m6
  1670. cglobal pixel_satd_16x16, 4,6,8
  1671. SATD_START_SSE2 m6, m7, 1
  1672. BACKUP_POINTERS
  1673. call pixel_satd_8x8_internal
  1674. call pixel_satd_8x8_internal
  1675. SATD_ACCUM m6, m0, m7
  1676. RESTORE_AND_INC_POINTERS
  1677. call pixel_satd_8x8_internal
  1678. call pixel_satd_8x8_internal
  1679. SATD_END_SSE2 m6, m7
  1680. %endif
  1681. cglobal pixel_satd_8x16, 4,6,8
  1682. SATD_START_SSE2 m6, m7
  1683. call pixel_satd_8x8_internal
  1684. call pixel_satd_8x8_internal
  1685. SATD_END_SSE2 m6
  1686. cglobal pixel_satd_8x8, 4,6,8
  1687. SATD_START_SSE2 m6, m7
  1688. call pixel_satd_8x8_internal
  1689. SATD_END_SSE2 m6
  1690. cglobal pixel_satd_8x4, 4,6,8
  1691. SATD_START_SSE2 m6, m7
  1692. call %%pixel_satd_8x4_internal
  1693. SATD_END_SSE2 m6
  1694. %endmacro ; SATDS_SSE2
  1695. %macro SA8D_INTER 0
  1696. %if ARCH_X86_64
  1697. %define lh m10
  1698. %define rh m0
  1699. %else
  1700. %define lh m0
  1701. %define rh [esp+48]
  1702. %endif
  1703. %if HIGH_BIT_DEPTH
  1704. HADDUW m0, m1
  1705. paddd lh, rh
  1706. %else
  1707. paddusw lh, rh
  1708. %endif ; HIGH_BIT_DEPTH
  1709. %endmacro
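; sa8d uses one 8x8 Hadamard instead of four 4x4 ones, which models an 8x8
; transform better than satd does. The raw 8x8 transform comes out at roughly
; twice the scale of the 4x4 satd, which is why every sa8d epilogue below ends
; with a rounded halving (add eax, 1 / shr eax, 1) to keep the two metrics
; comparable.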
  1710. %macro SA8D 0
  1711. ; sse2 doesn't seem to like the horizontal way of doing things
  1712. %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
  1713. %if ARCH_X86_64
  1714. ;-----------------------------------------------------------------------------
  1715. ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
  1716. ;-----------------------------------------------------------------------------
  1717. cglobal pixel_sa8d_8x8_internal
  1718. lea r6, [r0+4*r1]
  1719. lea r7, [r2+4*r3]
  1720. LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
  1721. LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
  1722. %if vertical
  1723. HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
  1724. %else ; non-sse2
  1725. HADAMARD8_2D_HMUL 0, 1, 2, 8, 4, 5, 3, 9, 6, 11
  1726. %endif
  1727. paddw m0, m1
  1728. paddw m0, m2
  1729. paddw m0, m8
  1730. SAVE_MM_PERMUTATION
  1731. ret
  1732. cglobal pixel_sa8d_8x8, 4,8,12
  1733. FIX_STRIDES r1, r3
  1734. lea r4, [3*r1]
  1735. lea r5, [3*r3]
  1736. %if vertical == 0
  1737. mova m7, [hmul_8p]
  1738. %endif
  1739. call pixel_sa8d_8x8_internal
  1740. %if HIGH_BIT_DEPTH
  1741. HADDUW m0, m1
  1742. %else
  1743. HADDW m0, m1
  1744. %endif ; HIGH_BIT_DEPTH
  1745. movd eax, m0
  1746. add eax, 1
  1747. shr eax, 1
  1748. RET
  1749. cglobal pixel_sa8d_16x16, 4,8,12
  1750. FIX_STRIDES r1, r3
  1751. lea r4, [3*r1]
  1752. lea r5, [3*r3]
  1753. %if vertical == 0
  1754. mova m7, [hmul_8p]
  1755. %endif
  1756. call pixel_sa8d_8x8_internal ; pix[0]
  1757. add r2, 8*SIZEOF_PIXEL
  1758. add r0, 8*SIZEOF_PIXEL
  1759. %if HIGH_BIT_DEPTH
  1760. HADDUW m0, m1
  1761. %endif
  1762. mova m10, m0
  1763. call pixel_sa8d_8x8_internal ; pix[8]
  1764. lea r2, [r2+8*r3]
  1765. lea r0, [r0+8*r1]
  1766. SA8D_INTER
  1767. call pixel_sa8d_8x8_internal ; pix[8*stride+8]
  1768. sub r2, 8*SIZEOF_PIXEL
  1769. sub r0, 8*SIZEOF_PIXEL
  1770. SA8D_INTER
  1771. call pixel_sa8d_8x8_internal ; pix[8*stride]
  1772. SA8D_INTER
  1773. SWAP 0, 10
  1774. %if HIGH_BIT_DEPTH == 0
  1775. HADDUW m0, m1
  1776. %endif
  1777. movd eax, m0
  1778. add eax, 1
  1779. shr eax, 1
  1780. RET
  1781. %else ; ARCH_X86_32
  1782. %if mmsize == 16
  1783. cglobal pixel_sa8d_8x8_internal
  1784. %define spill0 [esp+4]
  1785. %define spill1 [esp+20]
  1786. %define spill2 [esp+36]
  1787. %if vertical
  1788. LOAD_DIFF_8x4P 0, 1, 2, 3, 4, 5, 6, r0, r2, 1
  1789. HADAMARD4_2D 0, 1, 2, 3, 4
  1790. movdqa spill0, m3
  1791. LOAD_DIFF_8x4P 4, 5, 6, 7, 3, 3, 2, r0, r2, 1
  1792. HADAMARD4_2D 4, 5, 6, 7, 3
  1793. HADAMARD2_2D 0, 4, 1, 5, 3, qdq, amax
  1794. movdqa m3, spill0
  1795. paddw m0, m1
  1796. HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
  1798. mova m7, [hmul_8p]
  1799. LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
  1800. ; could do first HADAMARD4_V here to save spilling later
  1801. ; surprisingly, not a win on conroe or even p4
  1802. mova spill0, m2
  1803. mova spill1, m3
  1804. mova spill2, m1
  1805. SWAP 1, 7
  1806. LOAD_SUMSUB_8x4P 4, 5, 6, 7, 2, 3, 1, r0, r2, 1
  1807. HADAMARD4_V 4, 5, 6, 7, 3
  1808. mova m1, spill2
  1809. mova m2, spill0
  1810. mova m3, spill1
  1811. mova spill0, m6
  1812. mova spill1, m7
  1813. HADAMARD4_V 0, 1, 2, 3, 7
  1814. SUMSUB_BADC w, 0, 4, 1, 5, 7
  1815. HADAMARD 2, sumsub, 0, 4, 7, 6
  1816. HADAMARD 2, sumsub, 1, 5, 7, 6
  1817. HADAMARD 1, amax, 0, 4, 7, 6
  1818. HADAMARD 1, amax, 1, 5, 7, 6
  1819. mova m6, spill0
  1820. mova m7, spill1
  1821. paddw m0, m1
  1822. SUMSUB_BADC w, 2, 6, 3, 7, 4
  1823. HADAMARD 2, sumsub, 2, 6, 4, 5
  1824. HADAMARD 2, sumsub, 3, 7, 4, 5
  1825. HADAMARD 1, amax, 2, 6, 4, 5
  1826. HADAMARD 1, amax, 3, 7, 4, 5
  1827. %endif ; sse2/non-sse2
  1828. paddw m0, m2
  1829. paddw m0, m3
  1830. SAVE_MM_PERMUTATION
  1831. ret
  1832. %endif ; ifndef mmx2
  1833. cglobal pixel_sa8d_8x8, 4,7
  1834. FIX_STRIDES r1, r3
  1835. mov r6, esp
  1836. and esp, ~15
  1837. sub esp, 48
  1838. lea r4, [3*r1]
  1839. lea r5, [3*r3]
  1840. call pixel_sa8d_8x8_internal
  1841. %if HIGH_BIT_DEPTH
  1842. HADDUW m0, m1
  1843. %else
  1844. HADDW m0, m1
  1845. %endif ; HIGH_BIT_DEPTH
  1846. movd eax, m0
  1847. add eax, 1
  1848. shr eax, 1
  1849. mov esp, r6
  1850. RET
  1851. cglobal pixel_sa8d_16x16, 4,7
  1852. FIX_STRIDES r1, r3
  1853. mov r6, esp
  1854. and esp, ~15
  1855. sub esp, 64
  1856. lea r4, [3*r1]
  1857. lea r5, [3*r3]
  1858. call pixel_sa8d_8x8_internal
  1859. %if mmsize == 8
  1860. lea r0, [r0+4*r1]
  1861. lea r2, [r2+4*r3]
  1862. %endif
  1863. %if HIGH_BIT_DEPTH
  1864. HADDUW m0, m1
  1865. %endif
  1866. mova [esp+48], m0
  1867. call pixel_sa8d_8x8_internal
  1868. mov r0, [r6+20]
  1869. mov r2, [r6+28]
  1870. add r0, 8*SIZEOF_PIXEL
  1871. add r2, 8*SIZEOF_PIXEL
  1872. SA8D_INTER
  1873. mova [esp+48], m0
  1874. call pixel_sa8d_8x8_internal
  1875. %if mmsize == 8
  1876. lea r0, [r0+4*r1]
  1877. lea r2, [r2+4*r3]
  1878. %else
  1879. SA8D_INTER
  1880. %endif
  1881. mova [esp+64-mmsize], m0
  1882. call pixel_sa8d_8x8_internal
  1883. %if HIGH_BIT_DEPTH
  1884. SA8D_INTER
  1885. %else ; !HIGH_BIT_DEPTH
  1886. paddusw m0, [esp+64-mmsize]
  1887. %if mmsize == 16
  1888. HADDUW m0, m1
  1889. %else
  1890. mova m2, [esp+48]
  1891. pxor m7, m7
  1892. mova m1, m0
  1893. mova m3, m2
  1894. punpcklwd m0, m7
  1895. punpckhwd m1, m7
  1896. punpcklwd m2, m7
  1897. punpckhwd m3, m7
  1898. paddd m0, m1
  1899. paddd m2, m3
  1900. paddd m0, m2
  1901. HADDD m0, m1
  1902. %endif
  1903. %endif ; HIGH_BIT_DEPTH
  1904. movd eax, m0
  1905. add eax, 1
  1906. shr eax, 1
  1907. mov esp, r6
  1908. RET
  1909. %endif ; !ARCH_X86_64
  1910. %endmacro ; SA8D
;=============================================================================
; SA8D_SATD
;=============================================================================
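; These kernels compute sa8d and satd of the same block in one pass, sharing
; the loads and the vertical transform work. The 16x16 entry point returns
; both packed into a single uint64_t: satd in the upper 32 bits and the
; rounded sa8d, (sa8d+1)>>1, in the lower 32 (see the shl/or at the end of
; pixel_sa8d_satd_16x16).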
; %1: vertical/horizontal mode
; %2-%5: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
; m10: satd result
; m6, m11-15: tmp regs
  1918. %macro SA8D_SATD_8x4 5
  1919. %if %1
  1920. LOAD_DIFF_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
  1921. HADAMARD 0, sumsub, %2, %3, 6
  1922. HADAMARD 0, sumsub, %4, %5, 6
  1923. SBUTTERFLY wd, %2, %3, 6
  1924. SBUTTERFLY wd, %4, %5, 6
  1925. HADAMARD2_2D %2, %4, %3, %5, 6, dq
  1926. mova m12, m%2
  1927. mova m13, m%3
  1928. mova m14, m%4
  1929. mova m15, m%5
  1930. HADAMARD 0, sumsub, %2, %3, 6
  1931. HADAMARD 0, sumsub, %4, %5, 6
  1932. SBUTTERFLY qdq, 12, 13, 6
  1933. HADAMARD 0, amax, 12, 13, 6
  1934. SBUTTERFLY qdq, 14, 15, 6
  1935. paddw m10, m12
  1936. HADAMARD 0, amax, 14, 15, 6
  1937. paddw m10, m14
  1938. %else
  1939. LOAD_SUMSUB_8x4P %2, %3, %4, %5, 6, 11, 7, r0, r2, 1
  1940. HADAMARD4_V %2, %3, %4, %5, 6
  1941. pabsw m12, m%2 ; doing the abs first is a slight advantage
  1942. pabsw m14, m%4
  1943. pabsw m13, m%3
  1944. pabsw m15, m%5
  1945. HADAMARD 1, max, 12, 14, 6, 11
  1946. paddw m10, m12
  1947. HADAMARD 1, max, 13, 15, 6, 11
  1948. paddw m10, m13
  1949. %endif
  1950. %endmacro ; SA8D_SATD_8x4
  1951. ; %1: add spilled regs?
  1952. ; %2: spill regs?
  1953. %macro SA8D_SATD_ACCUM 2
  1954. %if HIGH_BIT_DEPTH
  1955. pmaddwd m10, [pw_1]
  1956. HADDUWD m0, m1
  1957. %if %1
  1958. paddd m10, temp1
  1959. paddd m0, temp0
  1960. %endif
  1961. %if %2
  1962. mova temp1, m10
  1963. pxor m10, m10
  1964. %endif
  1965. %elif %1
  1966. paddw m0, temp0
  1967. %endif
  1968. %if %2
  1969. mova temp0, m0
  1970. %endif
  1971. %endmacro
  1972. %macro SA8D_SATD 0
  1973. %define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
  1974. cglobal pixel_sa8d_satd_8x8_internal
  1975. SA8D_SATD_8x4 vertical, 0, 1, 2, 3
  1976. SA8D_SATD_8x4 vertical, 4, 5, 8, 9
  1977. %if vertical ; sse2-style
  1978. HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
  1979. HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
  1980. %else ; complete sa8d
  1981. SUMSUB_BADC w, 0, 4, 1, 5, 12
  1982. HADAMARD 2, sumsub, 0, 4, 12, 11
  1983. HADAMARD 2, sumsub, 1, 5, 12, 11
  1984. SUMSUB_BADC w, 2, 8, 3, 9, 12
  1985. HADAMARD 2, sumsub, 2, 8, 12, 11
  1986. HADAMARD 2, sumsub, 3, 9, 12, 11
  1987. HADAMARD 1, amax, 0, 4, 12, 11
  1988. HADAMARD 1, amax, 1, 5, 12, 4
  1989. HADAMARD 1, amax, 2, 8, 12, 4
  1990. HADAMARD 1, amax, 3, 9, 12, 4
  1991. %endif
  1992. ; create sa8d sub results
  1993. paddw m1, m2
  1994. paddw m0, m3
  1995. paddw m0, m1
  1996. SAVE_MM_PERMUTATION
  1997. ret
  1998. ;-------------------------------------------------------------------------------
  1999. ; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
  2000. ;-------------------------------------------------------------------------------
  2001. cglobal pixel_sa8d_satd_16x16, 4,8-(mmsize/32),16,SIZEOF_PIXEL*mmsize
  2002. %define temp0 [rsp+0*mmsize]
  2003. %define temp1 [rsp+1*mmsize]
  2004. FIX_STRIDES r1, r3
  2005. %if vertical==0
  2006. mova m7, [hmul_8p]
  2007. %endif
  2008. lea r4, [3*r1]
  2009. lea r5, [3*r3]
  2010. pxor m10, m10
  2011. %if mmsize==32
  2012. call pixel_sa8d_satd_8x8_internal
  2013. SA8D_SATD_ACCUM 0, 1
  2014. call pixel_sa8d_satd_8x8_internal
  2015. SA8D_SATD_ACCUM 1, 0
  2016. vextracti128 xm1, m0, 1
  2017. vextracti128 xm2, m10, 1
  2018. paddw xm0, xm1
  2019. paddw xm10, xm2
  2020. %else
  2021. lea r6, [r2+8*SIZEOF_PIXEL]
  2022. lea r7, [r0+8*SIZEOF_PIXEL]
  2023. call pixel_sa8d_satd_8x8_internal
  2024. SA8D_SATD_ACCUM 0, 1
  2025. call pixel_sa8d_satd_8x8_internal
  2026. SA8D_SATD_ACCUM 1, 1
  2027. mov r0, r7
  2028. mov r2, r6
  2029. call pixel_sa8d_satd_8x8_internal
  2030. SA8D_SATD_ACCUM 1, 1
  2031. call pixel_sa8d_satd_8x8_internal
  2032. SA8D_SATD_ACCUM 1, 0
  2033. %endif
  2034. ; xop already has fast horizontal sums
  2035. %if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
  2036. pmaddwd xm10, [pw_1]
  2037. HADDUWD xm0, xm1
  2038. phaddd xm0, xm10 ; sa8d1 sa8d2 satd1 satd2
  2039. pshufd xm1, xm0, q2301 ; sa8d2 sa8d1 satd2 satd1
  2040. paddd xm0, xm1 ; sa8d sa8d satd satd
  2041. movd r0d, xm0
  2042. pextrd eax, xm0, 2
  2043. %else
  2044. %if HIGH_BIT_DEPTH
  2045. HADDD xm0, xm1
  2046. HADDD xm10, xm2
  2047. %else
  2048. HADDUW xm0, xm1
  2049. HADDW xm10, xm2
  2050. %endif
  2051. movd r0d, xm0
  2052. movd eax, xm10
  2053. %endif
  2054. add r0d, 1
  2055. shl rax, 32
  2056. shr r0d, 1
  2057. or rax, r0
  2058. RET
  2059. %endmacro ; SA8D_SATD
;=============================================================================
; INTRA SATD
;=============================================================================
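; Informal note: the *_x3 functions below return the satd cost of the three
; whole-block intra modes (V, H, DC) without materialising any predicted
; pixels. Each of those predictions is constant along rows and/or columns, so
; its contribution to the 4x4 Hadamard of fenc-pred is confined to the first
; row or column of coefficients; the code therefore transforms fenc once
; (hadamard_load) and corrects just those coefficients with 1D Hadamards of
; the top/left edge pixels (SCALAR_HADAMARD).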
  2063. %macro HSUMSUB2 8
  2064. pshufd %4, %2, %7
  2065. pshufd %5, %3, %7
  2066. %1 %2, %8
  2067. %1 %6, %8
  2068. paddw %2, %4
  2069. paddw %3, %5
  2070. %endmacro
  2071. ; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
  2072. ; and are only retained for old cpus.
  2073. %macro INTRA_SA8D_SSE2 0
  2074. %if ARCH_X86_64
  2075. ;-----------------------------------------------------------------------------
  2076. ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
  2077. ;-----------------------------------------------------------------------------
  2078. cglobal intra_sa8d_x3_8x8, 3,3,13
  2079. ; 8x8 hadamard
  2080. pxor m8, m8
  2081. movq m0, [r0+0*FENC_STRIDE]
  2082. movq m1, [r0+1*FENC_STRIDE]
  2083. movq m2, [r0+2*FENC_STRIDE]
  2084. movq m3, [r0+3*FENC_STRIDE]
  2085. movq m4, [r0+4*FENC_STRIDE]
  2086. movq m5, [r0+5*FENC_STRIDE]
  2087. movq m6, [r0+6*FENC_STRIDE]
  2088. movq m7, [r0+7*FENC_STRIDE]
  2089. punpcklbw m0, m8
  2090. punpcklbw m1, m8
  2091. punpcklbw m2, m8
  2092. punpcklbw m3, m8
  2093. punpcklbw m4, m8
  2094. punpcklbw m5, m8
  2095. punpcklbw m6, m8
  2096. punpcklbw m7, m8
  2097. HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
  2098. ABSW2 m8, m9, m2, m3, m2, m3
  2099. ABSW2 m10, m11, m4, m5, m4, m5
  2100. paddw m8, m10
  2101. paddw m9, m11
  2102. ABSW2 m10, m11, m6, m7, m6, m7
  2103. ABSW m12, m1, m1
  2104. paddw m10, m11
  2105. paddw m8, m9
  2106. paddw m12, m10
  2107. paddw m12, m8
  2108. ; 1D hadamard of edges
  2109. movq m8, [r1+7]
  2110. movq m9, [r1+16]
  2111. pxor m10, m10
  2112. punpcklbw m8, m10
  2113. punpcklbw m9, m10
  2114. HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q1032, [pw_ppppmmmm]
  2115. HSUMSUB2 pmullw, m8, m9, m10, m11, m11, q2301, [pw_ppmmppmm]
  2116. pshuflw m10, m8, q2301
  2117. pshuflw m11, m9, q2301
  2118. pshufhw m10, m10, q2301
  2119. pshufhw m11, m11, q2301
  2120. pmullw m8, [pw_pmpmpmpm]
  2121. pmullw m11, [pw_pmpmpmpm]
  2122. paddw m8, m10
  2123. paddw m9, m11
  2124. ; differences
  2125. paddw m10, m8, m9
  2126. paddw m10, [pw_8]
  2127. pand m10, [sw_f0]
  2128. psllw m8, 3 ; left edge
  2129. psllw m10, 2 ; dc
  2130. psubw m8, m0
  2131. psubw m10, m0
  2132. punpcklwd m0, m1
  2133. punpcklwd m2, m3
  2134. punpcklwd m4, m5
  2135. punpcklwd m6, m7
  2136. ABSW m10, m10, m1
  2137. paddw m10, m12
  2138. punpckldq m0, m2
  2139. punpckldq m4, m6
  2140. punpcklqdq m0, m4 ; transpose
  2141. psllw m9, 3 ; top edge
  2142. psrldq m2, m10, 2 ; 8x7 sum
  2143. psubw m0, m9 ; 8x1 sum
  2144. ABSW2 m8, m0, m8, m0, m1, m3 ; 1x8 sum
  2145. paddw m8, m12
  2146. paddusw m2, m0
  2147. ; 3x HADDW
  2148. mova m7, [pd_f0]
  2149. pandn m0, m7, m10
  2150. psrld m10, 16
  2151. pandn m1, m7, m8
  2152. psrld m8, 16
  2153. pandn m7, m2
  2154. psrld m2, 16
  2155. paddd m0, m10
  2156. paddd m1, m8
  2157. paddd m2, m7
  2158. pshufd m3, m0, q2301
  2159. punpckhdq m4, m2, m1
  2160. punpckldq m2, m1
  2161. paddd m3, m0
  2162. paddd m2, m4
  2163. punpckhqdq m0, m2, m3
  2164. punpcklqdq m2, m3
  2165. paddd m0, [pd_2]
  2166. paddd m0, m2
  2167. psrld m0, 2
  2168. mova [r2], m0
  2169. RET
  2170. %endif ; ARCH_X86_64
  2171. %endmacro ; INTRA_SA8D_SSE2
  2172. ; in: r0 = fenc
  2173. ; out: m0..m3 = hadamard coefs
  2174. INIT_MMX
  2175. cglobal hadamard_load
  2176. ; not really a global, but otherwise cycles get attributed to the wrong function in profiling
  2177. %if HIGH_BIT_DEPTH
  2178. mova m0, [r0+0*FENC_STRIDEB]
  2179. mova m1, [r0+1*FENC_STRIDEB]
  2180. mova m2, [r0+2*FENC_STRIDEB]
  2181. mova m3, [r0+3*FENC_STRIDEB]
  2182. %else
  2183. pxor m7, m7
  2184. movd m0, [r0+0*FENC_STRIDE]
  2185. movd m1, [r0+1*FENC_STRIDE]
  2186. movd m2, [r0+2*FENC_STRIDE]
  2187. movd m3, [r0+3*FENC_STRIDE]
  2188. punpcklbw m0, m7
  2189. punpcklbw m1, m7
  2190. punpcklbw m2, m7
  2191. punpcklbw m3, m7
  2192. %endif
  2193. HADAMARD4_2D 0, 1, 2, 3, 4
  2194. SAVE_MM_PERMUTATION
  2195. ret
  2196. %macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
  2197. %ifidn %1, top
  2198. %if HIGH_BIT_DEPTH
  2199. mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
  2200. %else
  2201. movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
  2202. pxor %5, %5
  2203. punpcklbw %3, %5
  2204. %endif
  2205. %else ; left
  2206. %ifnidn %2, 0
  2207. shl %2d, 5 ; log(FDEC_STRIDEB)
  2208. %endif
  2209. movd %3, [r1+%2*SIZEOF_PIXEL-4+1*FDEC_STRIDEB]
  2210. pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
  2211. pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
  2212. pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
  2213. %if HIGH_BIT_DEPTH == 0
  2214. psrlw %3, 8
  2215. %endif
  2216. %ifnidn %2, 0
  2217. shr %2d, 5
  2218. %endif
  2219. %endif ; direction
  2220. %if cpuflag(ssse3)
  2221. %define %%sign psignw
  2222. %else
  2223. %define %%sign pmullw
  2224. %endif
  2225. pshufw %4, %3, q1032
  2226. %%sign %4, [pw_ppmmppmm]
  2227. paddw %3, %4
  2228. pshufw %4, %3, q2301
  2229. %%sign %4, [pw_pmpmpmpm]
  2230. paddw %3, %4
  2231. psllw %3, 2
  2232. mova [%1_1d+2*%2], %3
  2233. %endmacro
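; SCALAR_HADAMARD computes the 1D 4-point Hadamard of 4 border pixels (a row
; of the top edge or a column of the left edge) and stores it at %1_1d. The
; final psllw by 2 scales it by 4, presumably so it lines up with a row/column
; of the 2D 4x4 transform from hadamard_load, where a prediction that is
; constant along the other axis gets summed over 4 lines.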
  2234. %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
  2235. pxor %7, %7
  2236. pshufw %4, %1, q1032
  2237. pshufw %5, %2, q1032
  2238. pshufw %6, %3, q1032
  2239. paddw %1, %4
  2240. paddw %2, %5
  2241. paddw %3, %6
  2242. punpcklwd %1, %7
  2243. punpcklwd %2, %7
  2244. punpcklwd %3, %7
  2245. pshufw %4, %1, q1032
  2246. pshufw %5, %2, q1032
  2247. pshufw %6, %3, q1032
  2248. %8 %1, %4
  2249. %8 %2, %5
  2250. %8 %3, %6
  2251. %endmacro
  2252. ; in: m1..m3
  2253. ; out: m7
  2254. ; clobber: m4..m6
  2255. %macro SUM3x4 0
  2256. ABSW2 m4, m5, m1, m2, m1, m2
  2257. ABSW m7, m3, m3
  2258. paddw m4, m5
  2259. paddw m7, m4
  2260. %endmacro
  2261. ; in: m0..m3 (4x4)
  2262. ; out: m0 v, m4 h, m5 dc
  2263. ; clobber: m1..m3
  2264. %macro SUM4x3 3 ; dc, left, top
  2265. movq m4, %2
  2266. %ifnum sizeof%1
  2267. movq m5, %1
  2268. %else
  2269. movd m5, %1
  2270. %endif
  2271. psubw m4, m0
  2272. psubw m5, m0
  2273. punpcklwd m0, m1
  2274. punpcklwd m2, m3
  2275. punpckldq m0, m2 ; transpose
  2276. psubw m0, %3
  2277. ABSW2 m4, m5, m4, m5, m2, m3 ; 1x4 sum
  2278. ABSW m0, m0, m1 ; 4x1 sum
  2279. %endmacro
  2280. %macro INTRA_X3_MMX 0
  2281. ;-----------------------------------------------------------------------------
  2282. ; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
  2283. ;-----------------------------------------------------------------------------
  2284. cglobal intra_satd_x3_4x4, 3,3
  2285. %if UNIX64
  2286. ; stack is 16 byte aligned because abi says so
  2287. %define top_1d rsp-8 ; size 8
  2288. %define left_1d rsp-16 ; size 8
  2289. %else
  2290. ; WIN64: stack is 16 byte aligned because abi says so
  2291. ; X86_32: stack is 16 byte aligned at least in gcc, and we've pushed 3 regs + return address, so it's still aligned
  2292. SUB rsp, 16
  2293. %define top_1d rsp+8
  2294. %define left_1d rsp
  2295. %endif
  2296. call hadamard_load
  2297. SCALAR_HADAMARD left, 0, m4, m5
  2298. SCALAR_HADAMARD top, 0, m6, m5, m7
  2299. paddw m6, m4
  2300. pavgw m6, [pw_16]
  2301. pand m6, [sw_f0] ; dc
  2302. SUM3x4
  2303. SUM4x3 m6, [left_1d], [top_1d]
  2304. paddw m4, m7
  2305. paddw m5, m7
  2306. movq m1, m5
  2307. psrlq m1, 16 ; 4x3 sum
  2308. paddw m0, m1
  2309. SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
  2310. movd [r2+0], m0 ; i4x4_v satd
  2311. movd [r2+4], m4 ; i4x4_h satd
  2312. movd [r2+8], m5 ; i4x4_dc satd
  2313. %if UNIX64 == 0
  2314. ADD rsp, 16
  2315. %endif
  2316. RET
  2317. ;-----------------------------------------------------------------------------
  2318. ; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
  2319. ;-----------------------------------------------------------------------------
  2320. cglobal intra_satd_x3_16x16, 0,5
  2321. %assign stack_pad 120 + ((stack_offset+120+gprsize)&15)
  2322. ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
  2323. SUB rsp, stack_pad
  2324. %define sums rsp+64 ; size 56
  2325. %define top_1d rsp+32 ; size 32
  2326. %define left_1d rsp ; size 32
  2327. movifnidn r1, r1mp
  2328. pxor m7, m7
  2329. mova [sums+ 0], m7
  2330. mova [sums+ 8], m7
  2331. mova [sums+16], m7
  2332. %if HIGH_BIT_DEPTH
  2333. mova [sums+24], m7
  2334. mova [sums+32], m7
  2335. mova [sums+40], m7
  2336. mova [sums+48], m7
  2337. %endif
  2338. ; 1D hadamards
  2339. mov r3d, 12
  2340. movd m6, [pw_32]
  2341. .loop_edge:
  2342. SCALAR_HADAMARD left, r3, m0, m1
  2343. SCALAR_HADAMARD top, r3, m1, m2, m3
  2344. pavgw m0, m1
  2345. paddw m6, m0
  2346. sub r3d, 4
  2347. jge .loop_edge
  2348. psrlw m6, 2
  2349. pand m6, [sw_f0] ; dc
  2350. ; 2D hadamards
  2351. movifnidn r0, r0mp
  2352. mov r3, -4
  2353. .loop_y:
  2354. mov r4, -4
  2355. .loop_x:
  2356. call hadamard_load
  2357. SUM3x4
  2358. SUM4x3 m6, [left_1d+8*(r3+4)], [top_1d+8*(r4+4)]
  2359. pavgw m4, m7
  2360. pavgw m5, m7
  2361. paddw m0, [sums+ 0] ; i16x16_v satd
  2362. paddw m4, [sums+ 8] ; i16x16_h satd
  2363. paddw m5, [sums+16] ; i16x16_dc satd
  2364. mova [sums+ 0], m0
  2365. mova [sums+ 8], m4
  2366. mova [sums+16], m5
  2367. add r0, 4*SIZEOF_PIXEL
  2368. inc r4
  2369. jl .loop_x
  2370. %if HIGH_BIT_DEPTH
  2371. psrld m7, m4, 16
  2372. pslld m4, 16
  2373. psrld m4, 16
  2374. paddd m4, m7
  2375. psrld m7, m0, 16
  2376. pslld m0, 16
  2377. psrld m0, 16
  2378. paddd m0, m7
  2379. paddd m4, [sums+32]
  2380. paddd m0, [sums+24]
  2381. mova [sums+32], m4
  2382. mova [sums+24], m0
  2383. pxor m7, m7
  2384. punpckhwd m3, m5, m7
  2385. punpcklwd m5, m7
  2386. paddd m3, [sums+48]
  2387. paddd m5, [sums+40]
  2388. mova [sums+48], m3
  2389. mova [sums+40], m5
  2390. mova [sums+ 0], m7
  2391. mova [sums+ 8], m7
  2392. mova [sums+16], m7
  2393. %endif
  2394. add r0, 4*FENC_STRIDEB-16*SIZEOF_PIXEL
  2395. inc r3
  2396. jl .loop_y
  2397. ; horizontal sum
  2398. movifnidn r2, r2mp
  2399. %if HIGH_BIT_DEPTH
  2400. mova m1, m5
  2401. paddd m5, m3
  2402. HADDD m5, m7 ; DC satd
  2403. HADDD m4, m7 ; H satd
  2404. HADDD m0, m7 ; the part of V satd that doesn't overlap with DC
  2405. psrld m0, 1
  2406. psrlq m1, 32 ; DC[1]
  2407. paddd m0, m3 ; DC[2]
  2408. psrlq m3, 32 ; DC[3]
  2409. paddd m0, m1
  2410. paddd m0, m3
  2411. %else
  2412. mova m7, m5
  2413. SUM_MM_X3 m0, m4, m5, m3, m1, m2, m6, paddd
  2414. psrld m0, 1
  2415. pslld m7, 16
  2416. psrld m7, 16
  2417. paddd m0, m5
  2418. psubd m0, m7
  2419. %endif
  2420. movd [r2+8], m5 ; i16x16_dc satd
  2421. movd [r2+4], m4 ; i16x16_h satd
  2422. movd [r2+0], m0 ; i16x16_v satd
  2423. ADD rsp, stack_pad
  2424. RET
  2425. %if ARCH_X86_64
  2426. %define t0 r6
  2427. %else
  2428. %define t0 r2
  2429. %endif
  2430. ;-----------------------------------------------------------------------------
  2431. ; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
  2432. ;-----------------------------------------------------------------------------
  2433. cglobal intra_satd_x3_8x8c, 0,6
  2434. ; not really needed on x86_64, just shuts up valgrind about storing data below the stack across a function call
  2435. SUB rsp, 72
  2436. %define sums rsp+48 ; size 24
  2437. %define dc_1d rsp+32 ; size 16
  2438. %define top_1d rsp+16 ; size 16
  2439. %define left_1d rsp ; size 16
  2440. movifnidn r1, r1mp
  2441. pxor m7, m7
  2442. mova [sums+ 0], m7
  2443. mova [sums+ 8], m7
  2444. mova [sums+16], m7
  2445. ; 1D hadamards
  2446. mov r3d, 4
  2447. .loop_edge:
  2448. SCALAR_HADAMARD left, r3, m0, m1
  2449. SCALAR_HADAMARD top, r3, m0, m1, m2
  2450. sub r3d, 4
  2451. jge .loop_edge
  2452. ; dc
  2453. movzx t0d, word [left_1d+0]
  2454. movzx r3d, word [top_1d+0]
  2455. movzx r4d, word [left_1d+8]
  2456. movzx r5d, word [top_1d+8]
  2457. lea t0d, [t0 + r3 + 16]
  2458. lea r3d, [r4 + r5 + 16]
  2459. shr t0d, 1
  2460. shr r3d, 1
  2461. add r4d, 8
  2462. add r5d, 8
  2463. and t0d, -16 ; tl
  2464. and r3d, -16 ; br
  2465. and r4d, -16 ; bl
  2466. and r5d, -16 ; tr
  2467. mov [dc_1d+ 0], t0d ; tl
  2468. mov [dc_1d+ 4], r5d ; tr
  2469. mov [dc_1d+ 8], r4d ; bl
  2470. mov [dc_1d+12], r3d ; br
  2471. lea r5, [dc_1d]
  2472. ; 2D hadamards
  2473. movifnidn r0, r0mp
  2474. movifnidn r2, r2mp
  2475. mov r3, -2
  2476. .loop_y:
  2477. mov r4, -2
  2478. .loop_x:
  2479. call hadamard_load
  2480. SUM3x4
  2481. SUM4x3 [r5+4*(r4+2)], [left_1d+8*(r3+2)], [top_1d+8*(r4+2)]
  2482. pavgw m4, m7
  2483. pavgw m5, m7
paddw m0, [sums+16] ; i8x8c_v satd
paddw m4, [sums+8] ; i8x8c_h satd
paddw m5, [sums+0] ; i8x8c_dc satd
  2487. movq [sums+16], m0
  2488. movq [sums+8], m4
  2489. movq [sums+0], m5
  2490. add r0, 4*SIZEOF_PIXEL
  2491. inc r4
  2492. jl .loop_x
  2493. add r0, 4*FENC_STRIDEB-8*SIZEOF_PIXEL
  2494. add r5, 8
  2495. inc r3
  2496. jl .loop_y
  2497. ; horizontal sum
  2498. movq m0, [sums+0]
  2499. movq m1, [sums+8]
  2500. movq m2, [sums+16]
  2501. movq m7, m0
  2502. %if HIGH_BIT_DEPTH
  2503. psrlq m7, 16
  2504. HADDW m7, m3
  2505. SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
  2506. psrld m2, 1
  2507. paddd m2, m7
  2508. %else
  2509. psrlq m7, 15
  2510. paddw m2, m7
  2511. SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
  2512. psrld m2, 1
  2513. %endif
  2514. movd [r2+0], m0 ; i8x8c_dc satd
  2515. movd [r2+4], m1 ; i8x8c_h satd
  2516. movd [r2+8], m2 ; i8x8c_v satd
  2517. ADD rsp, 72
  2518. RET
  2519. %endmacro ; INTRA_X3_MMX
  2520. %macro PRED4x4_LOWPASS 5
  2521. %ifnum sizeof%5
  2522. pavgb %5, %2, %3
  2523. pxor %3, %2
  2524. pand %3, [pb_1]
  2525. psubusb %5, %3
  2526. pavgb %1, %4, %5
  2527. %else
  2528. mova %5, %2
  2529. pavgb %2, %3
  2530. pxor %3, %5
  2531. pand %3, [pb_1]
  2532. psubusb %2, %3
  2533. pavgb %1, %4, %2
  2534. %endif
  2535. %endmacro
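; PRED4x4_LOWPASS computes the usual H.264 3-tap lowpass filter
;     %1 = ( %2 + 2*%4 + %3 + 2 ) >> 2
; per byte without widening: pavgb gives (%2+%3+1)>>1, the pxor/pand/psubusb
; trio removes the rounding bias when %2+%3 is odd, and a second pavgb against
; %4 finishes the average. Worked example: %2=10, %3=20, %4=30 gives
; (10+60+20+2)>>2 = 23.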
  2536. %macro INTRA_X9_PRED 2
  2537. %if cpuflag(sse4)
  2538. movu m1, [r1-1*FDEC_STRIDE-8]
  2539. pinsrb m1, [r1+3*FDEC_STRIDE-1], 0
  2540. pinsrb m1, [r1+2*FDEC_STRIDE-1], 1
  2541. pinsrb m1, [r1+1*FDEC_STRIDE-1], 2
  2542. pinsrb m1, [r1+0*FDEC_STRIDE-1], 3
  2543. %else
  2544. movd mm0, [r1+3*FDEC_STRIDE-4]
  2545. punpcklbw mm0, [r1+2*FDEC_STRIDE-4]
  2546. movd mm1, [r1+1*FDEC_STRIDE-4]
  2547. punpcklbw mm1, [r1+0*FDEC_STRIDE-4]
  2548. punpckhwd mm0, mm1
  2549. psrlq mm0, 32
  2550. movq2dq m0, mm0
  2551. movu m1, [r1-1*FDEC_STRIDE-8]
  2552. movss m1, m0 ; l3 l2 l1 l0 __ __ __ lt t0 t1 t2 t3 t4 t5 t6 t7
  2553. %endif ; cpuflag
  2554. pshufb m1, [intrax9_edge] ; l3 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __
  2555. psrldq m0, m1, 1 ; l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __
  2556. psrldq m2, m1, 2 ; l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t7 __ __ __
  2557. pavgb m5, m0, m1 ; Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 __ __ __ __ __
  2558. mova %2, m1
  2559. PRED4x4_LOWPASS m0, m1, m2, m0, m4 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 __ __ __
  2560. ; ddl ddr
  2561. ; Ft1 Ft2 Ft3 Ft4 Flt Ft0 Ft1 Ft2
  2562. ; Ft2 Ft3 Ft4 Ft5 Fl0 Flt Ft0 Ft1
  2563. ; Ft3 Ft4 Ft5 Ft6 Fl1 Fl0 Flt Ft0
  2564. ; Ft4 Ft5 Ft6 Ft7 Fl2 Fl1 Fl0 Flt
  2565. pshufb m2, m0, [%1_ddlr1] ; a: ddl row0, ddl row1, ddr row0, ddr row1 / b: ddl row0, ddr row0, ddl row1, ddr row1
  2566. pshufb m3, m0, [%1_ddlr2] ; rows 2,3
  2567. ; hd hu
  2568. ; Glt Flt Ft0 Ft1 Gl0 Fl1 Gl1 Fl2
  2569. ; Gl0 Fl0 Glt Flt Gl1 Fl2 Gl2 Fl3
  2570. ; Gl1 Fl1 Gl0 Fl0 Gl2 Fl3 Gl3 Gl3
  2571. ; Gl2 Fl2 Gl1 Fl1 Gl3 Gl3 Gl3 Gl3
  2572. pslldq m0, 5 ; ___ ___ ___ ___ ___ Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
  2573. palignr m7, m5, m0, 5 ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gl3 Gl2 Gl1 Gl0 Glt
  2574. pshufb m6, m7, [%1_hdu1]
  2575. pshufb m7, m7, [%1_hdu2]
  2576. ; vr vl
  2577. ; Gt0 Gt1 Gt2 Gt3 Gt1 Gt2 Gt3 Gt4
  2578. ; Flt Ft0 Ft1 Ft2 Ft1 Ft2 Ft3 Ft4
  2579. ; Fl0 Gt0 Gt1 Gt2 Gt2 Gt3 Gt4 Gt5
  2580. ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
  2581. psrldq m5, 5 ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 ...
  2582. palignr m5, m0, 6 ; ___ Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
  2583. pshufb m4, m5, [%1_vrl1]
  2584. pshufb m5, m5, [%1_vrl2]
  2585. %endmacro ; INTRA_X9_PRED
  2586. %macro INTRA_X9_VHDC 5 ; edge, fenc01, fenc23, tmp, tmp
  2587. pshufb m2, m%1, [intrax9b_vh1]
  2588. pshufb m3, m%1, [intrax9b_vh2]
  2589. mova [pred_buf+0x60], m2
  2590. mova [pred_buf+0x70], m3
  2591. pshufb m%1, [intrax9b_edge2] ; t0 t1 t2 t3 t0 t1 t2 t3 l0 l1 l2 l3 l0 l1 l2 l3
  2592. pmaddubsw m%1, [hmul_4p]
  2593. pshufhw m0, m%1, q2301
  2594. pshuflw m0, m0, q2301
  2595. psignw m%1, [pw_pmpmpmpm]
  2596. paddw m0, m%1
  2597. psllw m0, 2 ; hadamard(top), hadamard(left)
  2598. MOVHL m3, m0
  2599. pshufb m1, m0, [intrax9b_v1]
  2600. pshufb m2, m0, [intrax9b_v2]
  2601. paddw m0, m3
  2602. psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
  2603. pavgw m0, [pw_16]
  2604. pand m0, [sw_f0] ; dc
  2605. ; This (as well as one of the steps in intra_satd_x9_4x4.satd_8x4) could be
  2606. ; changed from a wd transpose to a qdq, with appropriate rearrangement of inputs.
  2607. ; Which would be faster on conroe, but slower on penryn and sandybridge, and too invasive to ifdef.
  2608. HADAMARD 0, sumsub, %2, %3, %4, %5
  2609. HADAMARD 1, sumsub, %2, %3, %4, %5
  2610. movd r3d, m0
  2611. shr r3d, 4
  2612. imul r3d, 0x01010101
  2613. mov [pred_buf+0x80], r3d
  2614. mov [pred_buf+0x88], r3d
  2615. mov [pred_buf+0x90], r3d
  2616. mov [pred_buf+0x98], r3d
  2617. psubw m3, m%2
  2618. psubw m0, m%2
  2619. psubw m1, m%2
  2620. psubw m2, m%3
  2621. pabsw m%3, m%3
  2622. pabsw m3, m3
  2623. pabsw m0, m0
  2624. pabsw m1, m1
  2625. pabsw m2, m2
  2626. pavgw m3, m%3
  2627. pavgw m0, m%3
  2628. pavgw m1, m2
  2629. %if cpuflag(sse4)
  2630. phaddw m3, m0
  2631. %else
  2632. SBUTTERFLY qdq, 3, 0, 2
  2633. paddw m3, m0
  2634. %endif
  2635. MOVHL m2, m1
  2636. paddw m1, m2
  2637. %if cpuflag(xop)
  2638. vphaddwq m3, m3
  2639. vphaddwq m1, m1
  2640. packssdw m1, m3
  2641. %else
  2642. phaddw m1, m3
  2643. pmaddwd m1, [pw_1] ; v, _, h, dc
  2644. %endif
  2645. %endmacro ; INTRA_X9_VHDC
  2646. %macro INTRA_X9_END 2
  2647. %if cpuflag(sse4)
  2648. phminposuw m0, m0 ; h,dc,ddl,ddr,vr,hd,vl,hu
  2649. movd eax, m0
  2650. add eax, 1<<16
  2651. cmp ax, r3w
  2652. cmovge eax, r3d
  2653. %else
  2654. %if %1
  2655. ; 4x4 sad is up to 12 bits; +bitcosts -> 13 bits; pack with 3 bit index
  2656. psllw m0, 3
  2657. paddw m0, [pw_s01234567] ; h,dc,ddl,ddr,vr,hd,vl,hu
  2658. %else
  2659. ; 4x4 satd is up to 13 bits; +bitcosts and saturate -> 13 bits; pack with 3 bit index
  2660. psllw m0, 2
  2661. paddusw m0, m0
  2662. paddw m0, [pw_s01234657] ; h,dc,ddl,ddr,vr,vl,hd,hu
  2663. %endif
  2664. movhlps m1, m0
  2665. pminsw m0, m1
  2666. pshuflw m1, m0, q0032
  2667. pminsw m0, m1
  2668. pshuflw m1, m0, q0001
  2669. pminsw m0, m1
  2670. movd eax, m0
  2671. movsx r2d, ax
  2672. and eax, 7
  2673. sar r2d, 3
  2674. shl eax, 16
  2675. ; 1<<16: increment index to match intra4x4_pred_e. couldn't do this before because it had to fit in 3 bits
  2676. ; 1<<12: undo sign manipulation
  2677. lea eax, [rax+r2+(1<<16)+(1<<12)]
  2678. cmp ax, r3w
  2679. cmovge eax, r3d
  2680. %endif ; cpuflag
  2681. ; output the predicted samples
  2682. mov r3d, eax
  2683. shr r3d, 16
  2684. %ifdef PIC
  2685. lea r2, [%2_lut]
  2686. movzx r2d, byte [r2+r3]
  2687. %else
  2688. movzx r2d, byte [%2_lut+r3]
  2689. %endif
  2690. %if %1 ; sad
  2691. movq mm0, [pred_buf+r2]
  2692. movq mm1, [pred_buf+r2+16]
  2693. movd [r1+0*FDEC_STRIDE], mm0
  2694. movd [r1+2*FDEC_STRIDE], mm1
  2695. psrlq mm0, 32
  2696. psrlq mm1, 32
  2697. movd [r1+1*FDEC_STRIDE], mm0
  2698. movd [r1+3*FDEC_STRIDE], mm1
  2699. %else ; satd
  2700. %assign i 0
  2701. %rep 4
  2702. mov r3d, [pred_buf+r2+8*i]
  2703. mov [r1+i*FDEC_STRIDE], r3d
  2704. %assign i i+1
  2705. %endrep
  2706. %endif
  2707. %endmacro ; INTRA_X9_END
  2708. %macro INTRA_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
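; Informal summary of the x9 functions: all nine 4x4 intra predictions are
; built into pred_buf, scored against fenc (SAD here, SATD in the function
; below), the per-mode bitcosts from r2 are added, and INTRA_X9_END picks the
; minimum, copies the winning prediction into fdec and returns the chosen mode
; in the high half of eax with its cost in the low half.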
  2712. %if notcpuflag(xop)
  2713. cglobal intra_sad_x9_4x4, 3,4,9
  2714. %assign pad 0xc0-gprsize-(stack_offset&15)
  2715. %define pred_buf rsp
  2716. sub rsp, pad
  2717. %if ARCH_X86_64
  2718. INTRA_X9_PRED intrax9a, m8
  2719. %else
  2720. INTRA_X9_PRED intrax9a, [rsp+0xa0]
  2721. %endif
  2722. mova [rsp+0x00], m2
  2723. mova [rsp+0x10], m3
  2724. mova [rsp+0x20], m4
  2725. mova [rsp+0x30], m5
  2726. mova [rsp+0x40], m6
  2727. mova [rsp+0x50], m7
  2728. %if cpuflag(sse4)
  2729. movd m0, [r0+0*FENC_STRIDE]
  2730. pinsrd m0, [r0+1*FENC_STRIDE], 1
  2731. movd m1, [r0+2*FENC_STRIDE]
  2732. pinsrd m1, [r0+3*FENC_STRIDE], 1
  2733. %else
  2734. movd mm0, [r0+0*FENC_STRIDE]
  2735. punpckldq mm0, [r0+1*FENC_STRIDE]
  2736. movd mm1, [r0+2*FENC_STRIDE]
  2737. punpckldq mm1, [r0+3*FENC_STRIDE]
  2738. movq2dq m0, mm0
  2739. movq2dq m1, mm1
  2740. %endif
  2741. punpcklqdq m0, m0
  2742. punpcklqdq m1, m1
  2743. psadbw m2, m0
  2744. psadbw m3, m1
  2745. psadbw m4, m0
  2746. psadbw m5, m1
  2747. psadbw m6, m0
  2748. psadbw m7, m1
  2749. paddd m2, m3
  2750. paddd m4, m5
  2751. paddd m6, m7
  2752. %if ARCH_X86_64
  2753. SWAP 7, 8
  2754. pxor m8, m8
  2755. %define %%zero m8
  2756. %else
  2757. mova m7, [rsp+0xa0]
  2758. %define %%zero [pb_0]
  2759. %endif
  2760. pshufb m3, m7, [intrax9a_vh1]
  2761. pshufb m5, m7, [intrax9a_vh2]
  2762. pshufb m7, [intrax9a_dc]
  2763. psadbw m7, %%zero
  2764. psrlw m7, 2
  2765. mova [rsp+0x60], m3
  2766. mova [rsp+0x70], m5
  2767. psadbw m3, m0
  2768. pavgw m7, %%zero
  2769. pshufb m7, %%zero
  2770. psadbw m5, m1
  2771. movq [rsp+0x80], m7
  2772. movq [rsp+0x90], m7
  2773. psadbw m0, m7
  2774. paddd m3, m5
  2775. psadbw m1, m7
  2776. paddd m0, m1
  2777. movzx r3d, word [r2]
  2778. movd r0d, m3 ; v
  2779. add r3d, r0d
  2780. punpckhqdq m3, m0 ; h, dc
  2781. shufps m3, m2, q2020
  2782. psllq m6, 32
  2783. por m4, m6
  2784. movu m0, [r2+2]
  2785. packssdw m3, m4
  2786. paddw m0, m3
  2787. INTRA_X9_END 1, intrax9a
  2788. add rsp, pad
  2789. RET
  2790. %endif ; cpuflag
  2791. %if ARCH_X86_64
  2792. ;-----------------------------------------------------------------------------
  2793. ; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
  2794. ;-----------------------------------------------------------------------------
  2795. cglobal intra_satd_x9_4x4, 3,4,16
  2796. %assign pad 0xb0-gprsize-(stack_offset&15)
  2797. %define pred_buf rsp
  2798. sub rsp, pad
  2799. INTRA_X9_PRED intrax9b, m15
  2800. mova [rsp+0x00], m2
  2801. mova [rsp+0x10], m3
  2802. mova [rsp+0x20], m4
  2803. mova [rsp+0x30], m5
  2804. mova [rsp+0x40], m6
  2805. mova [rsp+0x50], m7
  2806. movd m8, [r0+0*FENC_STRIDE]
  2807. movd m9, [r0+1*FENC_STRIDE]
  2808. movd m10, [r0+2*FENC_STRIDE]
  2809. movd m11, [r0+3*FENC_STRIDE]
  2810. mova m12, [hmul_8p]
  2811. pshufd m8, m8, 0
  2812. pshufd m9, m9, 0
  2813. pshufd m10, m10, 0
  2814. pshufd m11, m11, 0
  2815. pmaddubsw m8, m12
  2816. pmaddubsw m9, m12
  2817. pmaddubsw m10, m12
  2818. pmaddubsw m11, m12
  2819. movddup m0, m2
  2820. pshufd m1, m2, q3232
  2821. movddup m2, m3
  2822. punpckhqdq m3, m3
  2823. call .satd_8x4 ; ddr, ddl
  2824. movddup m2, m5
  2825. pshufd m3, m5, q3232
  2826. mova m5, m0
  2827. movddup m0, m4
  2828. pshufd m1, m4, q3232
  2829. call .satd_8x4 ; vr, vl
  2830. movddup m2, m7
  2831. pshufd m3, m7, q3232
  2832. mova m4, m0
  2833. movddup m0, m6
  2834. pshufd m1, m6, q3232
  2835. call .satd_8x4 ; hd, hu
  2836. %if cpuflag(sse4)
  2837. punpckldq m4, m0
  2838. %else
  2839. punpcklqdq m4, m0 ; conroe dislikes punpckldq, and ssse3 INTRA_X9_END can handle arbitrary orders whereas phminposuw can't
  2840. %endif
  2841. mova m1, [pw_ppmmppmm]
  2842. psignw m8, m1
  2843. psignw m10, m1
  2844. paddw m8, m9
  2845. paddw m10, m11
  2846. INTRA_X9_VHDC 15, 8, 10, 6, 7
  2847. ; find minimum
  2848. movu m0, [r2+2]
  2849. movd r3d, m1
  2850. palignr m5, m1, 8
  2851. %if notcpuflag(sse4)
  2852. pshufhw m0, m0, q3120 ; compensate for different order in unpack
  2853. %endif
  2854. packssdw m5, m4
  2855. paddw m0, m5
  2856. movzx r0d, word [r2]
  2857. add r3d, r0d
  2858. INTRA_X9_END 0, intrax9b
  2859. add rsp, pad
  2860. RET
  2861. RESET_MM_PERMUTATION
  2862. ALIGN 16
  2863. .satd_8x4:
  2864. pmaddubsw m0, m12
  2865. pmaddubsw m1, m12
  2866. pmaddubsw m2, m12
  2867. pmaddubsw m3, m12
  2868. psubw m0, m8
  2869. psubw m1, m9
  2870. psubw m2, m10
  2871. psubw m3, m11
  2872. SATD_8x4_SSE 0, 0, 1, 2, 3, 13, 14, 0, swap
  2873. pmaddwd m0, [pw_1]
  2874. MOVHL m1, m0
  2875. paddd xmm0, m0, m1 ; consistent location of return value. only the avx version of hadamard permutes m0, so 3arg is free
  2876. ret
  2877. %else ; !ARCH_X86_64
  2878. cglobal intra_satd_x9_4x4, 3,4,8
  2879. %assign pad 0x120-gprsize-(stack_offset&15)
  2880. %define fenc_buf rsp
  2881. %define pred_buf rsp+0x40
  2882. %define spill rsp+0xe0
  2883. sub rsp, pad
  2884. INTRA_X9_PRED intrax9b, [spill+0x20]
  2885. mova [pred_buf+0x00], m2
  2886. mova [pred_buf+0x10], m3
  2887. mova [pred_buf+0x20], m4
  2888. mova [pred_buf+0x30], m5
  2889. mova [pred_buf+0x40], m6
  2890. mova [pred_buf+0x50], m7
  2891. movd m4, [r0+0*FENC_STRIDE]
  2892. movd m5, [r0+1*FENC_STRIDE]
  2893. movd m6, [r0+2*FENC_STRIDE]
  2894. movd m0, [r0+3*FENC_STRIDE]
  2895. mova m7, [hmul_8p]
  2896. pshufd m4, m4, 0
  2897. pshufd m5, m5, 0
  2898. pshufd m6, m6, 0
  2899. pshufd m0, m0, 0
  2900. pmaddubsw m4, m7
  2901. pmaddubsw m5, m7
  2902. pmaddubsw m6, m7
  2903. pmaddubsw m0, m7
  2904. mova [fenc_buf+0x00], m4
  2905. mova [fenc_buf+0x10], m5
  2906. mova [fenc_buf+0x20], m6
  2907. mova [fenc_buf+0x30], m0
  2908. movddup m0, m2
  2909. pshufd m1, m2, q3232
  2910. movddup m2, m3
  2911. punpckhqdq m3, m3
  2912. pmaddubsw m0, m7
  2913. pmaddubsw m1, m7
  2914. pmaddubsw m2, m7
  2915. pmaddubsw m3, m7
  2916. psubw m0, m4
  2917. psubw m1, m5
  2918. psubw m2, m6
  2919. call .satd_8x4b ; ddr, ddl
  2920. mova m3, [pred_buf+0x30]
  2921. mova m1, [pred_buf+0x20]
  2922. movddup m2, m3
  2923. punpckhqdq m3, m3
  2924. movq [spill+0x08], m0
  2925. movddup m0, m1
  2926. punpckhqdq m1, m1
  2927. call .satd_8x4 ; vr, vl
  2928. mova m3, [pred_buf+0x50]
  2929. mova m1, [pred_buf+0x40]
  2930. movddup m2, m3
  2931. punpckhqdq m3, m3
  2932. movq [spill+0x10], m0
  2933. movddup m0, m1
  2934. punpckhqdq m1, m1
  2935. call .satd_8x4 ; hd, hu
  2936. movq [spill+0x18], m0
  2937. mova m1, [spill+0x20]
  2938. mova m4, [fenc_buf+0x00]
  2939. mova m5, [fenc_buf+0x20]
  2940. mova m2, [pw_ppmmppmm]
  2941. psignw m4, m2
  2942. psignw m5, m2
  2943. paddw m4, [fenc_buf+0x10]
  2944. paddw m5, [fenc_buf+0x30]
  2945. INTRA_X9_VHDC 1, 4, 5, 6, 7
  2946. ; find minimum
  2947. movu m0, [r2+2]
  2948. movd r3d, m1
  2949. punpckhqdq m1, [spill+0x00]
  2950. packssdw m1, [spill+0x10]
  2951. %if cpuflag(sse4)
  2952. pshufhw m1, m1, q3120
  2953. %else
  2954. pshufhw m0, m0, q3120
  2955. %endif
  2956. paddw m0, m1
  2957. movzx r0d, word [r2]
  2958. add r3d, r0d
  2959. INTRA_X9_END 0, intrax9b
  2960. add rsp, pad
  2961. RET
  2962. RESET_MM_PERMUTATION
  2963. ALIGN 16
  2964. .satd_8x4:
  2965. pmaddubsw m0, m7
  2966. pmaddubsw m1, m7
  2967. pmaddubsw m2, m7
  2968. pmaddubsw m3, m7
  2969. %xdefine fenc_buf fenc_buf+gprsize
  2970. psubw m0, [fenc_buf+0x00]
  2971. psubw m1, [fenc_buf+0x10]
  2972. psubw m2, [fenc_buf+0x20]
  2973. .satd_8x4b:
  2974. psubw m3, [fenc_buf+0x30]
  2975. SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 0, swap
  2976. pmaddwd m0, [pw_1]
  2977. MOVHL m1, m0
  2978. paddd xmm0, m0, m1
  2979. ret
  2980. %endif ; ARCH
  2981. %endmacro ; INTRA_X9
  2982. %macro INTRA8_X9 0
;-----------------------------------------------------------------------------
; int intra_sad_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
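; Same idea as the 4x4 x9 code above, but for 8x8: each candidate prediction
; is generated from the edge array (r2), kept in pred(mode,row-group) on the
; stack so the winner can be copied out later, and its SAD against fenc is
; stored to the satds array (r4) as it is computed.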
  2986. cglobal intra_sad_x9_8x8, 5,6,9
  2987. %define fenc02 m4
  2988. %define fenc13 m5
  2989. %define fenc46 m6
  2990. %define fenc57 m7
  2991. %if ARCH_X86_64
  2992. %define tmp m8
  2993. %assign padbase 0x0
  2994. %else
  2995. %define tmp [rsp]
  2996. %assign padbase 0x10
  2997. %endif
  2998. %assign pad 0x240+0x10+padbase-gprsize-(stack_offset&15)
  2999. %define pred(i,j) [rsp+i*0x40+j*0x10+padbase]
  3000. SUB rsp, pad
  3001. movq fenc02, [r0+FENC_STRIDE* 0]
  3002. movq fenc13, [r0+FENC_STRIDE* 1]
  3003. movq fenc46, [r0+FENC_STRIDE* 4]
  3004. movq fenc57, [r0+FENC_STRIDE* 5]
  3005. movhps fenc02, [r0+FENC_STRIDE* 2]
  3006. movhps fenc13, [r0+FENC_STRIDE* 3]
  3007. movhps fenc46, [r0+FENC_STRIDE* 6]
  3008. movhps fenc57, [r0+FENC_STRIDE* 7]
  3009. ; save instruction size: avoid 4-byte memory offsets
  3010. lea r0, [intra8x9_h1+128]
  3011. %define off(m) (r0+m-(intra8x9_h1+128))
  3012. ; v
  3013. movddup m0, [r2+16]
  3014. mova pred(0,0), m0
  3015. psadbw m1, m0, fenc02
  3016. mova pred(0,1), m0
  3017. psadbw m2, m0, fenc13
  3018. mova pred(0,2), m0
  3019. psadbw m3, m0, fenc46
  3020. mova pred(0,3), m0
  3021. psadbw m0, m0, fenc57
  3022. paddw m1, m2
  3023. paddw m0, m3
  3024. paddw m0, m1
  3025. MOVHL m1, m0
  3026. paddw m0, m1
  3027. movd [r4+0], m0
  3028. ; h
  3029. movq m0, [r2+7]
  3030. pshufb m1, m0, [off(intra8x9_h1)]
  3031. pshufb m2, m0, [off(intra8x9_h2)]
  3032. mova pred(1,0), m1
  3033. psadbw m1, fenc02
  3034. mova pred(1,1), m2
  3035. psadbw m2, fenc13
  3036. paddw m1, m2
  3037. pshufb m3, m0, [off(intra8x9_h3)]
  3038. pshufb m2, m0, [off(intra8x9_h4)]
  3039. mova pred(1,2), m3
  3040. psadbw m3, fenc46
  3041. mova pred(1,3), m2
  3042. psadbw m2, fenc57
  3043. paddw m1, m3
  3044. paddw m1, m2
  3045. MOVHL m2, m1
  3046. paddw m1, m2
  3047. movd [r4+2], m1
  3048. lea r5, [rsp+padbase+0x100]
  3049. %define pred(i,j) [r5+i*0x40+j*0x10-0x100]
  3050. ; dc
  3051. movhps m0, [r2+16]
  3052. pxor m2, m2
  3053. psadbw m0, m2
  3054. MOVHL m1, m0
  3055. paddw m0, m1
  3056. psrlw m0, 3
  3057. pavgw m0, m2
  3058. pshufb m0, m2
  3059. mova pred(2,0), m0
  3060. psadbw m1, m0, fenc02
  3061. mova pred(2,1), m0
  3062. psadbw m2, m0, fenc13
  3063. mova pred(2,2), m0
  3064. psadbw m3, m0, fenc46
  3065. mova pred(2,3), m0
  3066. psadbw m0, m0, fenc57
  3067. paddw m1, m2
  3068. paddw m0, m3
  3069. paddw m0, m1
  3070. MOVHL m1, m0
  3071. paddw m0, m1
  3072. movd [r4+4], m0
  3073. ; ddl
  3074. ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
  3075. ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
  3076. ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
  3077. ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
  3078. ; Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC
  3079. ; Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD
  3080. ; Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE
  3081. ; Ft8 Ft9 FtA FtB FtC FtD FtE FtF
  3082. mova m0, [r2+16]
  3083. movu m2, [r2+17]
  3084. pslldq m1, m0, 1
  3085. pavgb m3, m0, m2 ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB ___ ___ ___ ___ ___
  3086. PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; ___ Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB FtC FtD FtE FtF
  3087. pshufb m1, m0, [off(intra8x9_ddl1)]
  3088. pshufb m2, m0, [off(intra8x9_ddl2)]
  3089. mova pred(3,0), m1
  3090. psadbw m1, fenc02
  3091. mova pred(3,1), m2
  3092. psadbw m2, fenc13
  3093. paddw m1, m2
  3094. pshufb m2, m0, [off(intra8x9_ddl3)]
  3095. mova pred(3,2), m2
  3096. psadbw m2, fenc46
  3097. paddw m1, m2
  3098. pshufb m2, m0, [off(intra8x9_ddl4)]
  3099. mova pred(3,3), m2
  3100. psadbw m2, fenc57
  3101. paddw m1, m2
  3102. MOVHL m2, m1
  3103. paddw m1, m2
  3104. movd [r4+6], m1
  3105. ; vl
  3106. ; Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8
  3107. ; Ft1 Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8
  3108. ; Gt2 Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9
  3109. ; Ft2 Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9
  3110. ; Gt3 Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA
  3111. ; Ft3 Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA
  3112. ; Gt4 Gt5 Gt6 Gt7 Gt8 Gt9 GtA GtB
  3113. ; Ft4 Ft5 Ft6 Ft7 Ft8 Ft9 FtA FtB
  3114. pshufb m1, m3, [off(intra8x9_vl1)]
  3115. pshufb m2, m0, [off(intra8x9_vl2)]
  3116. pshufb m3, m3, [off(intra8x9_vl3)]
  3117. pshufb m0, m0, [off(intra8x9_vl4)]
  3118. mova pred(7,0), m1
  3119. psadbw m1, fenc02
  3120. mova pred(7,1), m2
  3121. psadbw m2, fenc13
  3122. mova pred(7,2), m3
  3123. psadbw m3, fenc46
  3124. mova pred(7,3), m0
  3125. psadbw m0, fenc57
  3126. paddw m1, m2
  3127. paddw m0, m3
  3128. paddw m0, m1
  3129. MOVHL m1, m0
  3130. paddw m0, m1
  3131. %if cpuflag(sse4)
  3132. pextrw [r4+14], m0, 0
  3133. %else
  3134. movd r5d, m0
  3135. mov [r4+14], r5w
  3136. lea r5, [rsp+padbase+0x100]
  3137. %endif
  3138. ; ddr
  3139. ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
  3140. ; Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
  3141. ; Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4
  3142. ; Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3
  3143. ; Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2
  3144. ; Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1
  3145. ; Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0
  3146. ; Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt
  3147. movu m2, [r2+8]
  3148. movu m0, [r2+7]
  3149. movu m1, [r2+6]
  3150. pavgb m3, m2, m0 ; Gl6 Gl5 Gl4 Gl3 Gl2 Gl1 Gl0 Glt Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
  3151. PRED4x4_LOWPASS m0, m1, m2, m0, tmp ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
  3152. pshufb m1, m0, [off(intra8x9_ddr1)]
  3153. pshufb m2, m0, [off(intra8x9_ddr2)]
  3154. mova pred(4,0), m1
  3155. psadbw m1, fenc02
  3156. mova pred(4,1), m2
  3157. psadbw m2, fenc13
  3158. paddw m1, m2
  3159. pshufb m2, m0, [off(intra8x9_ddr3)]
  3160. mova pred(4,2), m2
  3161. psadbw m2, fenc46
  3162. paddw m1, m2
  3163. pshufb m2, m0, [off(intra8x9_ddr4)]
  3164. mova pred(4,3), m2
  3165. psadbw m2, fenc57
  3166. paddw m1, m2
  3167. MOVHL m2, m1
  3168. paddw m1, m2
  3169. movd [r4+8], m1
  3170. add r0, 256
  3171. add r5, 0xC0
  3172. %define off(m) (r0+m-(intra8x9_h1+256+128))
  3173. %define pred(i,j) [r5+i*0x40+j*0x10-0x1C0]
  3174. ; vr
  3175. ; Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
  3176. ; Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 Ft6
  3177. ; Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6
  3178. ; Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
  3179. ; Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5
  3180. ; Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3 Ft4
  3181. ; Fl4 Fl2 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4
  3182. ; Fl5 Fl3 Fl1 Flt Ft0 Ft1 Ft2 Ft3
  3183. movsd m2, m3, m0 ; Fl7 Fl6 Fl5 Fl4 Fl3 Fl2 Fl1 Fl0 Gt0 Gt1 Gt2 Gt3 Gt4 Gt5 Gt6 Gt7
  3184. pshufb m1, m2, [off(intra8x9_vr1)]
  3185. pshufb m2, m2, [off(intra8x9_vr3)]
  3186. mova pred(5,0), m1
  3187. psadbw m1, fenc02
  3188. mova pred(5,2), m2
  3189. psadbw m2, fenc46
  3190. paddw m1, m2
  3191. pshufb m2, m0, [off(intra8x9_vr2)]
  3192. mova pred(5,1), m2
  3193. psadbw m2, fenc13
  3194. paddw m1, m2
  3195. pshufb m2, m0, [off(intra8x9_vr4)]
  3196. mova pred(5,3), m2
  3197. psadbw m2, fenc57
  3198. paddw m1, m2
  3199. MOVHL m2, m1
  3200. paddw m1, m2
  3201. movd [r4+10], m1
  3202. ; hd
  3203. ; Glt Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5
  3204. ; Gl0 Fl0 Glt Flt Ft0 Ft1 Ft2 Ft3
  3205. ; Gl1 Fl1 Gl0 Fl0 Glt Flt Ft0 Ft1
  3206. ; Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 Glt Flt
  3207. ; Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0
  3208. ; Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1
  3209. ; Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2
  3210. ; Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3
  3211. pshufd m2, m3, q0001
  3212. %if cpuflag(sse4)
  3213. pblendw m2, m0, q3330 ; Gl2 Gl1 Gl0 Glt ___ Fl2 Fl1 Fl0 Flt Ft0 Ft1 Ft2 Ft3 Ft4 Ft5 ___
  3214. %else
  3215. movss m1, m0, m2
  3216. SWAP 1, 2
  3217. %endif
  3218. punpcklbw m0, m3 ; Fl7 Gl6 Fl6 Gl5 Fl5 Gl4 Fl4 Gl3 Fl3 Gl2 Fl2 Gl1 Fl1 Gl0 Fl0 ___
  3219. pshufb m1, m2, [off(intra8x9_hd1)]
  3220. pshufb m2, m2, [off(intra8x9_hd2)]
  3221. mova pred(6,0), m1
  3222. psadbw m1, fenc02
  3223. mova pred(6,1), m2
  3224. psadbw m2, fenc13
  3225. paddw m1, m2
  3226. pshufb m2, m0, [off(intra8x9_hd3)]
  3227. pshufb m3, m0, [off(intra8x9_hd4)]
  3228. mova pred(6,2), m2
  3229. psadbw m2, fenc46
  3230. mova pred(6,3), m3
  3231. psadbw m3, fenc57
  3232. paddw m1, m2
  3233. paddw m1, m3
  3234. MOVHL m2, m1
  3235. paddw m1, m2
  3236. ; don't just store to [r4+12]. this is too close to the load of dqword [r4] and would cause a forwarding stall
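; instead the hd cost is shifted into the top word (pslldq 12) and merged into
; the dqword that gets written back to [r4] below (por m3, [r4] / mova [r4], m0)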
  3237. pslldq m1, 12
  3238. SWAP 3, 1
  3239. ; hu
  3240. ; Gl0 Fl1 Gl1 Fl2 Gl2 Fl3 Gl3 Fl4
  3241. ; Gl1 Fl2 Gl2 Fl3 Gl3 Fl4 Gl4 Fl5
3242. ; Gl2 Fl3 Gl3 Fl4 Gl4 Fl5 Gl5 Fl6
3243. ; Gl3 Fl4 Gl4 Fl5 Gl5 Fl6 Gl6 Fl7
  3244. ; Gl4 Fl5 Gl5 Fl6 Gl6 Fl7 Gl7 Gl7
  3245. ; Gl5 Fl6 Gl6 Fl7 Gl7 Gl7 Gl7 Gl7
  3246. ; Gl6 Fl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
  3247. ; Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7 Gl7
  3248. %if cpuflag(sse4)
  3249. pinsrb m0, [r2+7], 15 ; Gl7
  3250. %else
  3251. movd m1, [r2+7]
  3252. pslldq m0, 1
  3253. palignr m1, m0, 1
  3254. SWAP 0, 1
  3255. %endif
  3256. pshufb m1, m0, [off(intra8x9_hu1)]
  3257. pshufb m2, m0, [off(intra8x9_hu2)]
  3258. mova pred(8,0), m1
  3259. psadbw m1, fenc02
  3260. mova pred(8,1), m2
  3261. psadbw m2, fenc13
  3262. paddw m1, m2
  3263. pshufb m2, m0, [off(intra8x9_hu3)]
  3264. pshufb m0, m0, [off(intra8x9_hu4)]
  3265. mova pred(8,2), m2
  3266. psadbw m2, fenc46
  3267. mova pred(8,3), m0
  3268. psadbw m0, fenc57
  3269. paddw m1, m2
  3270. paddw m1, m0
  3271. MOVHL m2, m1
  3272. paddw m1, m2
  3273. movd r2d, m1
  3274. movu m0, [r3]
  3275. por m3, [r4]
  3276. paddw m0, m3
  3277. mova [r4], m0
  3278. movzx r5d, word [r3+16]
  3279. add r2d, r5d
  3280. mov [r4+16], r2w
  3281. %if cpuflag(sse4)
  3282. phminposuw m0, m0 ; v,h,dc,ddl,ddr,vr,hd,vl
  3283. movd eax, m0
  3284. %else
  3285. ; 8x8 sad is up to 14 bits; +bitcosts and saturate -> 14 bits; pack with 2 bit index
  3286. paddusw m0, m0
  3287. paddusw m0, m0
  3288. paddw m0, [off(pw_s00112233)]
  3289. MOVHL m1, m0
  3290. pminsw m0, m1
  3291. pshuflw m1, m0, q0032
  3292. pminsw m0, m1
  3293. movd eax, m0
  3294. ; repack with 3 bit index
  3295. xor eax, 0x80008000
  3296. movzx r3d, ax
  3297. shr eax, 15
  3298. add r3d, r3d
  3299. or eax, 1
  3300. cmp eax, r3d
  3301. cmovg eax, r3d
  3302. ; reverse to phminposuw order
  3303. mov r3d, eax
  3304. and eax, 7
  3305. shr r3d, 3
  3306. shl eax, 16
  3307. or eax, r3d
  3308. %endif
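; note: both paths above leave eax = (mode<<16) | cost, the same layout
; phminposuw produces, so the hu candidate in r2d = (8<<16) | cost_hu can be
; folded in with a single cmp/cmovg, and the shr/shl below turns the winning
; mode into the 0x40-byte offset of its cached prediction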
  3309. add r2d, 8<<16
  3310. cmp ax, r2w
  3311. cmovg eax, r2d
  3312. mov r2d, eax
  3313. shr r2d, 16
  3314. shl r2d, 6
  3315. add r1, 4*FDEC_STRIDE
  3316. mova m0, [rsp+padbase+r2+0x00]
  3317. mova m1, [rsp+padbase+r2+0x10]
  3318. mova m2, [rsp+padbase+r2+0x20]
  3319. mova m3, [rsp+padbase+r2+0x30]
  3320. movq [r1+FDEC_STRIDE*-4], m0
  3321. movhps [r1+FDEC_STRIDE*-2], m0
  3322. movq [r1+FDEC_STRIDE*-3], m1
  3323. movhps [r1+FDEC_STRIDE*-1], m1
  3324. movq [r1+FDEC_STRIDE* 0], m2
  3325. movhps [r1+FDEC_STRIDE* 2], m2
  3326. movq [r1+FDEC_STRIDE* 1], m3
  3327. movhps [r1+FDEC_STRIDE* 3], m3
  3328. ADD rsp, pad
  3329. RET
  3330. %if ARCH_X86_64
  3331. ;-----------------------------------------------------------------------------
  3332. ; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
  3333. ;-----------------------------------------------------------------------------
  3334. cglobal intra_sa8d_x9_8x8, 5,6,16
  3335. %assign pad 0x2c0+0x10-gprsize-(stack_offset&15)
  3336. %define fenc_buf rsp
  3337. %define pred_buf rsp+0x80
  3338. SUB rsp, pad
  3339. mova m15, [hmul_8p]
  3340. pxor m8, m8
  3341. %assign %%i 0
  3342. %rep 8
  3343. movddup m %+ %%i, [r0+%%i*FENC_STRIDE]
  3344. pmaddubsw m9, m %+ %%i, m15
  3345. punpcklbw m %+ %%i, m8
  3346. mova [fenc_buf+%%i*0x10], m9
  3347. %assign %%i %%i+1
  3348. %endrep
  3349. ; save instruction size: avoid 4-byte memory offsets
  3350. lea r0, [intra8x9_h1+0x80]
  3351. %define off(m) (r0+m-(intra8x9_h1+0x80))
  3352. lea r5, [pred_buf+0x80]
  3353. ; v, h, dc
  3354. HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 8
  3355. pabsw m11, m1
  3356. %assign %%i 2
  3357. %rep 6
  3358. pabsw m8, m %+ %%i
  3359. paddw m11, m8
  3360. %assign %%i %%i+1
  3361. %endrep
  3362. ; 1D hadamard of edges
  3363. movq m8, [r2+7]
  3364. movddup m9, [r2+16]
  3365. mova [r5-0x80], m9
  3366. mova [r5-0x70], m9
  3367. mova [r5-0x60], m9
  3368. mova [r5-0x50], m9
  3369. punpcklwd m8, m8
  3370. pshufb m9, [intrax3_shuf]
  3371. pmaddubsw m8, [pb_pppm]
  3372. pmaddubsw m9, [pb_pppm]
  3373. HSUMSUB2 psignw, m8, m9, m12, m13, m9, q1032, [pw_ppppmmmm]
  3374. HSUMSUB2 psignw, m8, m9, m12, m13, m9, q2301, [pw_ppmmppmm]
  3375. ; dc
  3376. paddw m10, m8, m9
  3377. paddw m10, [pw_8]
  3378. pand m10, [sw_f0]
  3379. psrlw m12, m10, 4
  3380. psllw m10, 2
  3381. pxor m13, m13
  3382. pshufb m12, m13
  3383. mova [r5+0x00], m12
  3384. mova [r5+0x10], m12
  3385. mova [r5+0x20], m12
  3386. mova [r5+0x30], m12
  3387. ; differences
  3388. psllw m8, 3 ; left edge
  3389. psubw m8, m0
  3390. psubw m10, m0
  3391. pabsw m8, m8 ; 1x8 sum
  3392. pabsw m10, m10
  3393. paddw m8, m11
  3394. paddw m11, m10
  3395. punpcklwd m0, m1
  3396. punpcklwd m2, m3
  3397. punpcklwd m4, m5
  3398. punpcklwd m6, m7
  3399. punpckldq m0, m2
  3400. punpckldq m4, m6
  3401. punpcklqdq m0, m4 ; transpose
  3402. psllw m9, 3 ; top edge
  3403. psrldq m10, m11, 2 ; 8x7 sum
  3404. psubw m0, m9 ; 8x1 sum
  3405. pabsw m0, m0
  3406. paddw m10, m0
  3407. phaddd m10, m8 ; logically phaddw, but this is faster and it won't overflow
  3408. psrlw m11, 1
  3409. psrlw m10, 1
  3410. ; store h
  3411. movq m3, [r2+7]
  3412. pshufb m0, m3, [off(intra8x9_h1)]
  3413. pshufb m1, m3, [off(intra8x9_h2)]
  3414. pshufb m2, m3, [off(intra8x9_h3)]
  3415. pshufb m3, m3, [off(intra8x9_h4)]
  3416. mova [r5-0x40], m0
  3417. mova [r5-0x30], m1
  3418. mova [r5-0x20], m2
  3419. mova [r5-0x10], m3
  3420. ; ddl
  3421. mova m8, [r2+16]
  3422. movu m2, [r2+17]
  3423. pslldq m1, m8, 1
  3424. pavgb m9, m8, m2
  3425. PRED4x4_LOWPASS m8, m1, m2, m8, m3
  3426. pshufb m0, m8, [off(intra8x9_ddl1)]
  3427. pshufb m1, m8, [off(intra8x9_ddl2)]
  3428. pshufb m2, m8, [off(intra8x9_ddl3)]
  3429. pshufb m3, m8, [off(intra8x9_ddl4)]
  3430. add r5, 0x40
  3431. call .sa8d
  3432. phaddd m11, m0
  3433. ; vl
  3434. pshufb m0, m9, [off(intra8x9_vl1)]
  3435. pshufb m1, m8, [off(intra8x9_vl2)]
  3436. pshufb m2, m9, [off(intra8x9_vl3)]
  3437. pshufb m3, m8, [off(intra8x9_vl4)]
  3438. add r5, 0x100
  3439. call .sa8d
  3440. phaddd m10, m11
  3441. mova m12, m0
  3442. ; ddr
  3443. movu m2, [r2+8]
  3444. movu m8, [r2+7]
  3445. movu m1, [r2+6]
  3446. pavgb m9, m2, m8
  3447. PRED4x4_LOWPASS m8, m1, m2, m8, m3
  3448. pshufb m0, m8, [off(intra8x9_ddr1)]
  3449. pshufb m1, m8, [off(intra8x9_ddr2)]
  3450. pshufb m2, m8, [off(intra8x9_ddr3)]
  3451. pshufb m3, m8, [off(intra8x9_ddr4)]
  3452. sub r5, 0xc0
  3453. call .sa8d
  3454. mova m11, m0
  3455. add r0, 0x100
  3456. %define off(m) (r0+m-(intra8x9_h1+0x180))
  3457. ; vr
  3458. movsd m2, m9, m8
  3459. pshufb m0, m2, [off(intra8x9_vr1)]
  3460. pshufb m1, m8, [off(intra8x9_vr2)]
  3461. pshufb m2, m2, [off(intra8x9_vr3)]
  3462. pshufb m3, m8, [off(intra8x9_vr4)]
  3463. add r5, 0x40
  3464. call .sa8d
  3465. phaddd m11, m0
  3466. ; hd
  3467. %if cpuflag(sse4)
  3468. pshufd m1, m9, q0001
  3469. pblendw m1, m8, q3330
  3470. %else
  3471. pshufd m2, m9, q0001
  3472. movss m1, m8, m2
  3473. %endif
  3474. punpcklbw m8, m9
  3475. pshufb m0, m1, [off(intra8x9_hd1)]
  3476. pshufb m1, m1, [off(intra8x9_hd2)]
  3477. pshufb m2, m8, [off(intra8x9_hd3)]
  3478. pshufb m3, m8, [off(intra8x9_hd4)]
  3479. add r5, 0x40
  3480. call .sa8d
  3481. phaddd m0, m12
  3482. phaddd m11, m0
  3483. ; hu
  3484. %if cpuflag(sse4)
  3485. pinsrb m8, [r2+7], 15
  3486. %else
  3487. movd m9, [r2+7]
  3488. pslldq m8, 1
  3489. palignr m9, m8, 1
  3490. SWAP 8, 9
  3491. %endif
  3492. pshufb m0, m8, [off(intra8x9_hu1)]
  3493. pshufb m1, m8, [off(intra8x9_hu2)]
  3494. pshufb m2, m8, [off(intra8x9_hu3)]
  3495. pshufb m3, m8, [off(intra8x9_hu4)]
  3496. add r5, 0x80
  3497. call .sa8d
  3498. pmaddwd m0, [pw_1]
  3499. phaddw m10, m11
  3500. MOVHL m1, m0
  3501. paddw m0, m1
  3502. pshuflw m1, m0, q0032
  3503. pavgw m0, m1
  3504. pxor m2, m2
  3505. pavgw m10, m2
  3506. movd r2d, m0
  3507. movu m0, [r3]
  3508. paddw m0, m10
  3509. mova [r4], m0
  3510. movzx r5d, word [r3+16]
  3511. add r2d, r5d
  3512. mov [r4+16], r2w
  3513. %if cpuflag(sse4)
  3514. phminposuw m0, m0
  3515. movd eax, m0
  3516. %else
  3517. ; 8x8 sa8d is up to 15 bits; +bitcosts and saturate -> 15 bits; pack with 1 bit index
  3518. paddusw m0, m0
  3519. paddw m0, [off(pw_s00001111)]
  3520. MOVHL m1, m0
  3521. pminsw m0, m1
  3522. pshuflw m1, m0, q0032
  3523. mova m2, m0
  3524. pminsw m0, m1
  3525. pcmpgtw m2, m1 ; 2nd index bit
  3526. movd r3d, m0
  3527. movd r4d, m2
  3528. ; repack with 3 bit index
  3529. xor r3d, 0x80008000
  3530. and r4d, 0x00020002
  3531. movzx eax, r3w
  3532. movzx r5d, r4w
  3533. shr r3d, 16
  3534. shr r4d, 16
  3535. lea eax, [rax*4+r5]
  3536. lea r3d, [ r3*4+r4+1]
  3537. cmp eax, r3d
  3538. cmovg eax, r3d
  3539. ; reverse to phminposuw order
  3540. mov r3d, eax
  3541. and eax, 7
  3542. shr r3d, 3
  3543. shl eax, 16
  3544. or eax, r3d
  3545. %endif
  3546. add r2d, 8<<16
  3547. cmp ax, r2w
  3548. cmovg eax, r2d
  3549. mov r2d, eax
  3550. shr r2d, 16
  3551. shl r2d, 6
  3552. add r1, 4*FDEC_STRIDE
  3553. mova m0, [pred_buf+r2+0x00]
  3554. mova m1, [pred_buf+r2+0x10]
  3555. mova m2, [pred_buf+r2+0x20]
  3556. mova m3, [pred_buf+r2+0x30]
  3557. movq [r1+FDEC_STRIDE*-4], m0
  3558. movhps [r1+FDEC_STRIDE*-2], m0
  3559. movq [r1+FDEC_STRIDE*-3], m1
  3560. movhps [r1+FDEC_STRIDE*-1], m1
  3561. movq [r1+FDEC_STRIDE* 0], m2
  3562. movhps [r1+FDEC_STRIDE* 2], m2
  3563. movq [r1+FDEC_STRIDE* 1], m3
  3564. movhps [r1+FDEC_STRIDE* 3], m3
  3565. ADD rsp, pad
  3566. RET
  3567. ALIGN 16
  3568. .sa8d:
  3569. %xdefine mret m0
  3570. %xdefine fenc_buf fenc_buf+gprsize
  3571. mova [r5+0x00], m0
  3572. mova [r5+0x10], m1
  3573. mova [r5+0x20], m2
  3574. mova [r5+0x30], m3
  3575. movddup m4, m0
  3576. movddup m5, m1
  3577. movddup m6, m2
  3578. movddup m7, m3
  3579. punpckhqdq m0, m0
  3580. punpckhqdq m1, m1
  3581. punpckhqdq m2, m2
  3582. punpckhqdq m3, m3
  3583. PERMUTE 0,4, 1,5, 2,0, 3,1, 4,6, 5,7, 6,2, 7,3
  3584. pmaddubsw m0, m15
  3585. pmaddubsw m1, m15
  3586. psubw m0, [fenc_buf+0x00]
  3587. psubw m1, [fenc_buf+0x10]
  3588. pmaddubsw m2, m15
  3589. pmaddubsw m3, m15
  3590. psubw m2, [fenc_buf+0x20]
  3591. psubw m3, [fenc_buf+0x30]
  3592. pmaddubsw m4, m15
  3593. pmaddubsw m5, m15
  3594. psubw m4, [fenc_buf+0x40]
  3595. psubw m5, [fenc_buf+0x50]
  3596. pmaddubsw m6, m15
  3597. pmaddubsw m7, m15
  3598. psubw m6, [fenc_buf+0x60]
  3599. psubw m7, [fenc_buf+0x70]
  3600. HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 13, 14
  3601. paddw m0, m1
  3602. paddw m0, m2
  3603. paddw mret, m0, m3
  3604. ret
  3605. %endif ; ARCH_X86_64
  3606. %endmacro ; INTRA8_X9
  3607. ; in: r0=pix, r1=stride, r2=stride*3, r3=tmp, m6=mask_ac4, m7=0
  3608. ; out: [tmp]=hadamard4, m0=satd
  3609. INIT_MMX mmx2
  3610. cglobal hadamard_ac_4x4
  3611. %if HIGH_BIT_DEPTH
  3612. mova m0, [r0]
  3613. mova m1, [r0+r1]
  3614. mova m2, [r0+r1*2]
  3615. mova m3, [r0+r2]
  3616. %else ; !HIGH_BIT_DEPTH
  3617. movh m0, [r0]
  3618. movh m1, [r0+r1]
  3619. movh m2, [r0+r1*2]
  3620. movh m3, [r0+r2]
  3621. punpcklbw m0, m7
  3622. punpcklbw m1, m7
  3623. punpcklbw m2, m7
  3624. punpcklbw m3, m7
  3625. %endif ; HIGH_BIT_DEPTH
  3626. HADAMARD4_2D 0, 1, 2, 3, 4
  3627. mova [r3], m0
  3628. mova [r3+8], m1
  3629. mova [r3+16], m2
  3630. mova [r3+24], m3
  3631. ABSW m0, m0, m4
  3632. ABSW m1, m1, m4
  3633. pand m0, m6
  3634. ABSW m2, m2, m4
  3635. ABSW m3, m3, m4
  3636. paddw m0, m1
  3637. paddw m2, m3
  3638. paddw m0, m2
  3639. SAVE_MM_PERMUTATION
  3640. ret
  3641. cglobal hadamard_ac_2x2max
  3642. mova m0, [r3+0x00]
  3643. mova m1, [r3+0x20]
  3644. mova m2, [r3+0x40]
  3645. mova m3, [r3+0x60]
  3646. sub r3, 8
  3647. SUMSUB_BADC w, 0, 1, 2, 3, 4
  3648. ABSW2 m0, m2, m0, m2, m4, m5
  3649. ABSW2 m1, m3, m1, m3, m4, m5
  3650. HADAMARD 0, max, 0, 2, 4, 5
  3651. HADAMARD 0, max, 1, 3, 4, 5
  3652. %if HIGH_BIT_DEPTH
  3653. pmaddwd m0, m7
  3654. pmaddwd m1, m7
  3655. paddd m6, m0
  3656. paddd m6, m1
  3657. %else ; !HIGH_BIT_DEPTH
  3658. paddw m7, m0
  3659. paddw m7, m1
  3660. %endif ; HIGH_BIT_DEPTH
  3661. SAVE_MM_PERMUTATION
  3662. ret
  3663. %macro AC_PREP 2
  3664. %if HIGH_BIT_DEPTH
  3665. pmaddwd %1, %2
  3666. %endif
  3667. %endmacro
  3668. %macro AC_PADD 3
  3669. %if HIGH_BIT_DEPTH
  3670. AC_PREP %2, %3
  3671. paddd %1, %2
  3672. %else
  3673. paddw %1, %2
  3674. %endif ; HIGH_BIT_DEPTH
  3675. %endmacro
  3676. cglobal hadamard_ac_8x8
  3677. mova m6, [mask_ac4]
  3678. %if HIGH_BIT_DEPTH
  3679. mova m7, [pw_1]
  3680. %else
  3681. pxor m7, m7
  3682. %endif ; HIGH_BIT_DEPTH
  3683. call hadamard_ac_4x4_mmx2
  3684. add r0, 4*SIZEOF_PIXEL
  3685. add r3, 32
  3686. mova m5, m0
  3687. AC_PREP m5, m7
  3688. call hadamard_ac_4x4_mmx2
  3689. lea r0, [r0+4*r1]
  3690. add r3, 64
  3691. AC_PADD m5, m0, m7
  3692. call hadamard_ac_4x4_mmx2
  3693. sub r0, 4*SIZEOF_PIXEL
  3694. sub r3, 32
  3695. AC_PADD m5, m0, m7
  3696. call hadamard_ac_4x4_mmx2
  3697. AC_PADD m5, m0, m7
  3698. sub r3, 40
  3699. mova [rsp+gprsize+8], m5 ; save satd
  3700. %if HIGH_BIT_DEPTH
  3701. pxor m6, m6
  3702. %endif
  3703. %rep 3
  3704. call hadamard_ac_2x2max_mmx2
  3705. %endrep
  3706. mova m0, [r3+0x00]
  3707. mova m1, [r3+0x20]
  3708. mova m2, [r3+0x40]
  3709. mova m3, [r3+0x60]
  3710. SUMSUB_BADC w, 0, 1, 2, 3, 4
  3711. HADAMARD 0, sumsub, 0, 2, 4, 5
  3712. ABSW2 m1, m3, m1, m3, m4, m5
  3713. ABSW2 m0, m2, m0, m2, m4, m5
  3714. HADAMARD 0, max, 1, 3, 4, 5
  3715. %if HIGH_BIT_DEPTH
  3716. pand m0, [mask_ac4]
  3717. pmaddwd m1, m7
  3718. pmaddwd m0, m7
  3719. pmaddwd m2, m7
  3720. paddd m6, m1
  3721. paddd m0, m2
  3722. paddd m6, m6
  3723. paddd m0, m6
  3724. SWAP 0, 6
  3725. %else ; !HIGH_BIT_DEPTH
  3726. pand m6, m0
  3727. paddw m7, m1
  3728. paddw m6, m2
  3729. paddw m7, m7
  3730. paddw m6, m7
  3731. %endif ; HIGH_BIT_DEPTH
  3732. mova [rsp+gprsize], m6 ; save sa8d
  3733. SWAP 0, 6
  3734. SAVE_MM_PERMUTATION
  3735. ret
  3736. %macro HADAMARD_AC_WXH_SUM_MMX 2
  3737. mova m1, [rsp+1*mmsize]
  3738. %if HIGH_BIT_DEPTH
  3739. %if %1*%2 >= 128
  3740. paddd m0, [rsp+2*mmsize]
  3741. paddd m1, [rsp+3*mmsize]
  3742. %endif
  3743. %if %1*%2 == 256
  3744. mova m2, [rsp+4*mmsize]
  3745. paddd m1, [rsp+5*mmsize]
  3746. paddd m2, [rsp+6*mmsize]
  3747. mova m3, m0
  3748. paddd m1, [rsp+7*mmsize]
  3749. paddd m0, m2
  3750. %endif
  3751. psrld m0, 1
  3752. HADDD m0, m2
  3753. psrld m1, 1
  3754. HADDD m1, m3
  3755. %else ; !HIGH_BIT_DEPTH
  3756. %if %1*%2 >= 128
  3757. paddusw m0, [rsp+2*mmsize]
  3758. paddusw m1, [rsp+3*mmsize]
  3759. %endif
  3760. %if %1*%2 == 256
  3761. mova m2, [rsp+4*mmsize]
  3762. paddusw m1, [rsp+5*mmsize]
  3763. paddusw m2, [rsp+6*mmsize]
  3764. mova m3, m0
  3765. paddusw m1, [rsp+7*mmsize]
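; average the two saturated sums without overflowing 16 bits: pavgw gives
; (a+b+1)>>1, and subtracting (a^b)&1 corrects the rounding to (a+b)>>1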
  3766. pxor m3, m2
  3767. pand m3, [pw_1]
  3768. pavgw m0, m2
  3769. psubusw m0, m3
  3770. HADDUW m0, m2
  3771. %else
  3772. psrlw m0, 1
  3773. HADDW m0, m2
  3774. %endif
  3775. psrlw m1, 1
  3776. HADDW m1, m3
  3777. %endif ; HIGH_BIT_DEPTH
  3778. %endmacro
  3779. %macro HADAMARD_AC_WXH_MMX 2
  3780. cglobal pixel_hadamard_ac_%1x%2, 2,4
  3781. %assign pad 16-gprsize-(stack_offset&15)
  3782. %define ysub r1
  3783. FIX_STRIDES r1
  3784. sub rsp, 16+128+pad
  3785. lea r2, [r1*3]
  3786. lea r3, [rsp+16]
  3787. call hadamard_ac_8x8_mmx2
  3788. %if %2==16
  3789. %define ysub r2
  3790. lea r0, [r0+r1*4]
  3791. sub rsp, 16
  3792. call hadamard_ac_8x8_mmx2
  3793. %endif
  3794. %if %1==16
  3795. neg ysub
  3796. sub rsp, 16
  3797. lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
  3798. neg ysub
  3799. call hadamard_ac_8x8_mmx2
  3800. %if %2==16
  3801. lea r0, [r0+r1*4]
  3802. sub rsp, 16
  3803. call hadamard_ac_8x8_mmx2
  3804. %endif
  3805. %endif
  3806. HADAMARD_AC_WXH_SUM_MMX %1, %2
  3807. movd edx, m0
  3808. movd eax, m1
  3809. shr edx, 1
  3810. %if ARCH_X86_64
  3811. shl rdx, 32
  3812. add rax, rdx
  3813. %endif
  3814. add rsp, 128+%1*%2/4+pad
  3815. RET
  3816. %endmacro ; HADAMARD_AC_WXH_MMX
  3817. HADAMARD_AC_WXH_MMX 16, 16
  3818. HADAMARD_AC_WXH_MMX 8, 16
  3819. HADAMARD_AC_WXH_MMX 16, 8
  3820. HADAMARD_AC_WXH_MMX 8, 8
  3821. %macro LOAD_INC_8x4W_SSE2 5
  3822. %if HIGH_BIT_DEPTH
  3823. movu m%1, [r0]
  3824. movu m%2, [r0+r1]
  3825. movu m%3, [r0+r1*2]
  3826. movu m%4, [r0+r2]
  3827. %ifidn %1, 0
  3828. lea r0, [r0+r1*4]
  3829. %endif
  3830. %else ; !HIGH_BIT_DEPTH
  3831. movh m%1, [r0]
  3832. movh m%2, [r0+r1]
  3833. movh m%3, [r0+r1*2]
  3834. movh m%4, [r0+r2]
  3835. %ifidn %1, 0
  3836. lea r0, [r0+r1*4]
  3837. %endif
  3838. punpcklbw m%1, m%5
  3839. punpcklbw m%2, m%5
  3840. punpcklbw m%3, m%5
  3841. punpcklbw m%4, m%5
  3842. %endif ; HIGH_BIT_DEPTH
  3843. %endmacro
  3844. %macro LOAD_INC_8x4W_SSSE3 5
  3845. LOAD_DUP_4x8P %3, %4, %1, %2, [r0+r1*2], [r0+r2], [r0], [r0+r1]
  3846. %ifidn %1, 0
  3847. lea r0, [r0+r1*4]
  3848. %endif
  3849. HSUMSUB %1, %2, %3, %4, %5
  3850. %endmacro
  3851. %macro HADAMARD_AC_SSE2 0
  3852. ; in: r0=pix, r1=stride, r2=stride*3
  3853. ; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
  3854. cglobal hadamard_ac_8x8
  3855. %if ARCH_X86_64
  3856. %define spill0 m8
  3857. %define spill1 m9
  3858. %define spill2 m10
  3859. %else
  3860. %define spill0 [rsp+gprsize]
  3861. %define spill1 [rsp+gprsize+mmsize]
  3862. %define spill2 [rsp+gprsize+mmsize*2]
  3863. %endif
  3864. %if HIGH_BIT_DEPTH
  3865. %define vertical 1
  3866. %elif cpuflag(ssse3) && notcpuflag(atom)
  3867. %define vertical 0
  3868. ;LOAD_INC loads sumsubs
  3869. mova m7, [hmul_8p]
  3870. %else
  3871. %define vertical 1
  3872. ;LOAD_INC only unpacks to words
  3873. pxor m7, m7
  3874. %endif
  3875. LOAD_INC_8x4W 0, 1, 2, 3, 7
  3876. %if vertical
  3877. HADAMARD4_2D_SSE 0, 1, 2, 3, 4
  3878. %else
  3879. HADAMARD4_V 0, 1, 2, 3, 4
  3880. %endif
  3881. mova spill0, m1
  3882. SWAP 1, 7
  3883. LOAD_INC_8x4W 4, 5, 6, 7, 1
  3884. %if vertical
  3885. HADAMARD4_2D_SSE 4, 5, 6, 7, 1
  3886. %else
  3887. HADAMARD4_V 4, 5, 6, 7, 1
  3888. ; FIXME SWAP
  3889. mova m1, spill0
  3890. mova spill0, m6
  3891. mova spill1, m7
  3892. HADAMARD 1, sumsub, 0, 1, 6, 7
  3893. HADAMARD 1, sumsub, 2, 3, 6, 7
  3894. mova m6, spill0
  3895. mova m7, spill1
  3896. mova spill0, m1
  3897. mova spill1, m0
  3898. HADAMARD 1, sumsub, 4, 5, 1, 0
  3899. HADAMARD 1, sumsub, 6, 7, 1, 0
  3900. mova m0, spill1
  3901. %endif
  3902. mova spill1, m2
  3903. mova spill2, m3
  3904. ABSW m1, m0, m0
  3905. ABSW m2, m4, m4
  3906. ABSW m3, m5, m5
  3907. paddw m1, m2
  3908. SUMSUB_BA w, 0, 4
  3909. %if vertical
  3910. pand m1, [mask_ac4]
  3911. %else
  3912. pand m1, [mask_ac4b]
  3913. %endif
  3914. AC_PREP m1, [pw_1]
  3915. ABSW m2, spill0
  3916. AC_PADD m1, m3, [pw_1]
  3917. ABSW m3, spill1
  3918. AC_PADD m1, m2, [pw_1]
  3919. ABSW m2, spill2
  3920. AC_PADD m1, m3, [pw_1]
  3921. ABSW m3, m6, m6
  3922. AC_PADD m1, m2, [pw_1]
  3923. ABSW m2, m7, m7
  3924. AC_PADD m1, m3, [pw_1]
  3925. AC_PADD m1, m2, [pw_1]
  3926. paddw m3, m7, spill2
  3927. psubw m7, spill2
  3928. mova [rsp+gprsize+mmsize*2], m1 ; save satd
  3929. paddw m2, m6, spill1
  3930. psubw m6, spill1
  3931. paddw m1, m5, spill0
  3932. psubw m5, spill0
  3933. %assign %%x 2
  3934. %if vertical
  3935. %assign %%x 4
  3936. %endif
  3937. mova spill1, m4
  3938. HADAMARD %%x, amax, 3, 7, 4
  3939. HADAMARD %%x, amax, 2, 6, 7, 4
  3940. mova m4, spill1
  3941. HADAMARD %%x, amax, 1, 5, 6, 7
  3942. HADAMARD %%x, sumsub, 0, 4, 5, 6
  3943. AC_PREP m2, [pw_1]
  3944. AC_PADD m2, m3, [pw_1]
  3945. AC_PADD m2, m1, [pw_1]
  3946. %if HIGH_BIT_DEPTH
  3947. paddd m2, m2
  3948. %else
  3949. paddw m2, m2
  3950. %endif ; HIGH_BIT_DEPTH
  3951. ABSW m4, m4, m7
  3952. pand m0, [mask_ac8]
  3953. ABSW m0, m0, m7
  3954. AC_PADD m2, m4, [pw_1]
  3955. AC_PADD m2, m0, [pw_1]
  3956. mova [rsp+gprsize+mmsize], m2 ; save sa8d
  3957. SWAP 0, 2
  3958. SAVE_MM_PERMUTATION
  3959. ret
  3960. HADAMARD_AC_WXH_SSE2 16, 16
  3961. HADAMARD_AC_WXH_SSE2 16, 8
  3962. %if mmsize <= 16
  3963. HADAMARD_AC_WXH_SSE2 8, 16
  3964. HADAMARD_AC_WXH_SSE2 8, 8
  3965. %endif
  3966. %endmacro ; HADAMARD_AC_SSE2
  3967. %macro HADAMARD_AC_WXH_SUM_SSE2 2
  3968. mova m1, [rsp+2*mmsize]
  3969. %if HIGH_BIT_DEPTH
  3970. %if %1*%2 >= 128
  3971. paddd m0, [rsp+3*mmsize]
  3972. paddd m1, [rsp+4*mmsize]
  3973. %endif
  3974. %if %1*%2 == 256
  3975. paddd m0, [rsp+5*mmsize]
  3976. paddd m1, [rsp+6*mmsize]
  3977. paddd m0, [rsp+7*mmsize]
  3978. paddd m1, [rsp+8*mmsize]
  3979. psrld m0, 1
  3980. %endif
  3981. HADDD xm0, xm2
  3982. HADDD xm1, xm3
  3983. %else ; !HIGH_BIT_DEPTH
  3984. %if %1*%2*16/mmsize >= 128
  3985. paddusw m0, [rsp+3*mmsize]
  3986. paddusw m1, [rsp+4*mmsize]
  3987. %endif
  3988. %if %1*%2*16/mmsize == 256
  3989. paddusw m0, [rsp+5*mmsize]
  3990. paddusw m1, [rsp+6*mmsize]
  3991. paddusw m0, [rsp+7*mmsize]
  3992. paddusw m1, [rsp+8*mmsize]
  3993. psrlw m0, 1
  3994. %endif
  3995. %if mmsize==32
  3996. vextracti128 xm2, m0, 1
  3997. vextracti128 xm3, m1, 1
  3998. paddusw xm0, xm2
  3999. paddusw xm1, xm3
  4000. %endif
  4001. HADDUW xm0, xm2
  4002. HADDW xm1, xm3
  4003. %endif ; HIGH_BIT_DEPTH
  4004. %endmacro
  4005. ; struct { int satd, int sa8d; } pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
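; on ARCH_X86_64 the pair comes back packed in rax, satd in the low dword and
; sa8d in the high dword (the shl rdx,32 / add rax,rdx at the end of the macro).
; caller-side sketch, assuming the return value is treated as a plain uint64_t:
;     uint64_t v = pixel_hadamard_ac_16x16( pix, stride );
;     int satd = (uint32_t)v, sa8d = (uint32_t)(v >> 32);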
  4006. %macro HADAMARD_AC_WXH_SSE2 2
  4007. cglobal pixel_hadamard_ac_%1x%2, 2,4,11
  4008. %define ysub r1
  4009. FIX_STRIDES r1
  4010. mov r3, rsp
  4011. and rsp, ~(mmsize-1)
  4012. sub rsp, mmsize*3
  4013. lea r2, [r1*3]
  4014. call hadamard_ac_8x8
  4015. %if %2==16
  4016. %define ysub r2
  4017. lea r0, [r0+r1*4]
  4018. sub rsp, mmsize*2
  4019. call hadamard_ac_8x8
  4020. %endif
  4021. %if %1==16 && mmsize <= 16
  4022. neg ysub
  4023. sub rsp, mmsize*2
  4024. lea r0, [r0+ysub*4+8*SIZEOF_PIXEL]
  4025. neg ysub
  4026. call hadamard_ac_8x8
  4027. %if %2==16
  4028. lea r0, [r0+r1*4]
  4029. sub rsp, mmsize*2
  4030. call hadamard_ac_8x8
  4031. %endif
  4032. %endif
  4033. HADAMARD_AC_WXH_SUM_SSE2 %1, %2
  4034. movd edx, xm0
  4035. movd eax, xm1
  4036. shr edx, 2 - (%1*%2*16/mmsize >> 8)
  4037. shr eax, 1
  4038. %if ARCH_X86_64
  4039. shl rdx, 32
  4040. add rax, rdx
  4041. %endif
  4042. mov rsp, r3
  4043. RET
  4044. %endmacro ; HADAMARD_AC_WXH_SSE2
  4045. ; instantiate satds
  4046. %if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
  4047. cextern pixel_sa8d_8x8_internal_mmx2
  4048. INIT_MMX mmx2
  4049. SA8D
  4050. %endif
  4051. %define TRANS TRANS_SSE2
  4052. %define DIFFOP DIFF_UNPACK_SSE2
  4053. %define LOAD_INC_8x4W LOAD_INC_8x4W_SSE2
  4054. %define LOAD_SUMSUB_8x4P LOAD_DIFF_8x4P
  4055. %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSE2
  4056. %define movdqa movaps ; doesn't hurt pre-nehalem, might as well save size
  4057. %define movdqu movups
  4058. %define punpcklqdq movlhps
  4059. INIT_XMM sse2
  4060. SA8D
  4061. SATDS_SSE2
  4062. %if ARCH_X86_64
  4063. SA8D_SATD
  4064. %endif
  4065. %if HIGH_BIT_DEPTH == 0
  4066. INTRA_SA8D_SSE2
  4067. %endif
  4068. INIT_MMX mmx2
  4069. INTRA_X3_MMX
  4070. INIT_XMM sse2
  4071. HADAMARD_AC_SSE2
  4072. %if HIGH_BIT_DEPTH == 0
  4073. INIT_XMM ssse3,atom
  4074. SATDS_SSE2
  4075. SA8D
  4076. HADAMARD_AC_SSE2
  4077. %if ARCH_X86_64
  4078. SA8D_SATD
  4079. %endif
  4080. %endif
  4081. %define DIFFOP DIFF_SUMSUB_SSSE3
  4082. %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
  4083. %if HIGH_BIT_DEPTH == 0
  4084. %define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
  4085. %define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
  4086. %define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
  4087. %endif
  4088. INIT_XMM ssse3
  4089. SATDS_SSE2
  4090. SA8D
  4091. HADAMARD_AC_SSE2
  4092. %if ARCH_X86_64
  4093. SA8D_SATD
  4094. %endif
  4095. %if HIGH_BIT_DEPTH == 0
  4096. INTRA_X9
  4097. INTRA8_X9
  4098. %endif
  4099. %undef movdqa ; nehalem doesn't like movaps
  4100. %undef movdqu ; movups
  4101. %undef punpcklqdq ; or movlhps
  4102. %if HIGH_BIT_DEPTH == 0
  4103. INIT_MMX ssse3
  4104. INTRA_X3_MMX
  4105. %endif
  4106. %define TRANS TRANS_SSE4
  4107. %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
  4108. INIT_XMM sse4
  4109. SATDS_SSE2
  4110. SA8D
  4111. HADAMARD_AC_SSE2
  4112. %if ARCH_X86_64
  4113. SA8D_SATD
  4114. %endif
  4115. %if HIGH_BIT_DEPTH == 0
  4116. INTRA_X9
  4117. INTRA8_X9
  4118. %endif
  4119. ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
  4120. ; it's effectively free.
  4121. %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
  4122. INIT_XMM avx
  4123. SATDS_SSE2
  4124. SA8D
  4125. %if ARCH_X86_64
  4126. SA8D_SATD
  4127. %endif
  4128. %if HIGH_BIT_DEPTH == 0
  4129. INTRA_X9
  4130. INTRA8_X9
  4131. %endif
  4132. HADAMARD_AC_SSE2
  4133. %define TRANS TRANS_XOP
  4134. INIT_XMM xop
  4135. SATDS_SSE2
  4136. SA8D
  4137. %if ARCH_X86_64
  4138. SA8D_SATD
  4139. %endif
  4140. %if HIGH_BIT_DEPTH == 0
  4141. INTRA_X9
  4142. ; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
  4143. %endif
  4144. HADAMARD_AC_SSE2
  4145. %if HIGH_BIT_DEPTH == 0
  4146. %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2
  4147. %define LOAD_DUP_4x8P LOAD_DUP_4x16P_AVX2
  4148. %define TRANS TRANS_SSE4
  4149. INIT_YMM avx2
  4150. HADAMARD_AC_SSE2
  4151. %if ARCH_X86_64
  4152. SA8D_SATD
  4153. %endif
4154. %macro LOAD_SUMSUB_8x8P_AVX2 7 ; 4*dst, 2*tmp, mul
  4155. movq xm%1, [r0]
  4156. movq xm%3, [r2]
  4157. movq xm%2, [r0+r1]
  4158. movq xm%4, [r2+r3]
  4159. vinserti128 m%1, m%1, [r0+4*r1], 1
  4160. vinserti128 m%3, m%3, [r2+4*r3], 1
  4161. vinserti128 m%2, m%2, [r0+r4], 1
  4162. vinserti128 m%4, m%4, [r2+r5], 1
  4163. punpcklqdq m%1, m%1
  4164. punpcklqdq m%3, m%3
  4165. punpcklqdq m%2, m%2
  4166. punpcklqdq m%4, m%4
  4167. DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %7
  4168. lea r0, [r0+2*r1]
  4169. lea r2, [r2+2*r3]
  4170. movq xm%3, [r0]
  4171. movq xm%5, [r2]
  4172. movq xm%4, [r0+r1]
  4173. movq xm%6, [r2+r3]
  4174. vinserti128 m%3, m%3, [r0+4*r1], 1
  4175. vinserti128 m%5, m%5, [r2+4*r3], 1
  4176. vinserti128 m%4, m%4, [r0+r4], 1
  4177. vinserti128 m%6, m%6, [r2+r5], 1
  4178. punpcklqdq m%3, m%3
  4179. punpcklqdq m%5, m%5
  4180. punpcklqdq m%4, m%4
  4181. punpcklqdq m%6, m%6
  4182. DIFF_SUMSUB_SSSE3 %3, %5, %4, %6, %7
  4183. %endmacro
  4184. %macro SATD_START_AVX2 2-3 0
  4185. FIX_STRIDES r1, r3
  4186. %if %3
  4187. mova %2, [hmul_8p]
  4188. lea r4, [5*r1]
  4189. lea r5, [5*r3]
  4190. %else
  4191. mova %2, [hmul_16p]
  4192. lea r4, [3*r1]
  4193. lea r5, [3*r3]
  4194. %endif
  4195. pxor %1, %1
  4196. %endmacro
  4197. %define TRANS TRANS_SSE4
  4198. INIT_YMM avx2
  4199. cglobal pixel_satd_16x8_internal
  4200. LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
  4201. SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
  4202. LOAD_SUMSUB_16x4P_AVX2 0, 1, 2, 3, 4, 5, 7, r0, r2, 0
  4203. SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
  4204. ret
  4205. cglobal pixel_satd_16x16, 4,6,8
  4206. SATD_START_AVX2 m6, m7
  4207. call pixel_satd_16x8_internal
  4208. lea r0, [r0+4*r1]
  4209. lea r2, [r2+4*r3]
  4210. pixel_satd_16x8_internal:
  4211. call pixel_satd_16x8_internal
  4212. vextracti128 xm0, m6, 1
  4213. paddw xm0, xm6
  4214. SATD_END_SSE2 xm0
  4215. RET
  4216. cglobal pixel_satd_16x8, 4,6,8
  4217. SATD_START_AVX2 m6, m7
  4218. jmp pixel_satd_16x8_internal
  4219. cglobal pixel_satd_8x8_internal
  4220. LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
  4221. SATD_8x4_SSE 0, 0, 1, 2, 3, 4, 5, 6
  4222. ret
  4223. cglobal pixel_satd_8x16, 4,6,8
  4224. SATD_START_AVX2 m6, m7, 1
  4225. call pixel_satd_8x8_internal
  4226. lea r0, [r0+2*r1]
  4227. lea r2, [r2+2*r3]
  4228. lea r0, [r0+4*r1]
  4229. lea r2, [r2+4*r3]
  4230. call pixel_satd_8x8_internal
  4231. vextracti128 xm0, m6, 1
  4232. paddw xm0, xm6
  4233. SATD_END_SSE2 xm0
  4234. RET
  4235. cglobal pixel_satd_8x8, 4,6,8
  4236. SATD_START_AVX2 m6, m7, 1
  4237. call pixel_satd_8x8_internal
  4238. vextracti128 xm0, m6, 1
  4239. paddw xm0, xm6
  4240. SATD_END_SSE2 xm0
  4241. RET
  4242. cglobal pixel_sa8d_8x8_internal
  4243. LOAD_SUMSUB_8x8P_AVX2 0, 1, 2, 3, 4, 5, 7
  4244. HADAMARD4_V 0, 1, 2, 3, 4
  4245. HADAMARD 8, sumsub, 0, 1, 4, 5
  4246. HADAMARD 8, sumsub, 2, 3, 4, 5
  4247. HADAMARD 2, sumsub, 0, 1, 4, 5
  4248. HADAMARD 2, sumsub, 2, 3, 4, 5
  4249. HADAMARD 1, amax, 0, 1, 4, 5
  4250. HADAMARD 1, amax, 2, 3, 4, 5
  4251. paddw m6, m0
  4252. paddw m6, m2
  4253. ret
  4254. cglobal pixel_sa8d_8x8, 4,6,8
  4255. SATD_START_AVX2 m6, m7, 1
  4256. call pixel_sa8d_8x8_internal
  4257. vextracti128 xm1, m6, 1
  4258. paddw xm6, xm1
  4259. HADDW xm6, xm1
  4260. movd eax, xm6
  4261. add eax, 1
  4262. shr eax, 1
  4263. RET
  4264. cglobal intra_sad_x9_8x8, 5,7,8
  4265. %define pred(i,j) [rsp+i*0x40+j*0x20]
  4266. mov r6, rsp
  4267. and rsp, ~31
  4268. sub rsp, 0x240
  4269. movu m5, [r0+0*FENC_STRIDE]
  4270. movu m6, [r0+4*FENC_STRIDE]
  4271. punpcklqdq m5, [r0+2*FENC_STRIDE]
  4272. punpcklqdq m6, [r0+6*FENC_STRIDE]
  4273. ; save instruction size: avoid 4-byte memory offsets
  4274. lea r0, [intra8x9_h1+128]
  4275. %define off(m) (r0+m-(intra8x9_h1+128))
  4276. vpbroadcastq m0, [r2+16]
  4277. psadbw m4, m0, m5
  4278. psadbw m2, m0, m6
  4279. mova pred(0,0), m0
  4280. mova pred(0,1), m0
  4281. paddw m4, m2
  4282. vpbroadcastq m1, [r2+7]
  4283. pshufb m3, m1, [off(intra8x9_h1)]
  4284. pshufb m2, m1, [off(intra8x9_h3)]
  4285. mova pred(1,0), m3
  4286. mova pred(1,1), m2
  4287. psadbw m3, m5
  4288. psadbw m2, m6
  4289. paddw m3, m2
  4290. lea r5, [rsp+0x100]
  4291. %define pred(i,j) [r5+i*0x40+j*0x20-0x100]
  4292. ; combine the first two
  4293. pslldq m3, 2
  4294. por m4, m3
  4295. pxor m2, m2
  4296. psadbw m0, m2
  4297. psadbw m1, m2
  4298. paddw m0, m1
  4299. psrlw m0, 3
  4300. pavgw m0, m2
  4301. pshufb m0, m2
  4302. mova pred(2,0), m0
  4303. mova pred(2,1), m0
  4304. psadbw m3, m0, m5
  4305. psadbw m2, m0, m6
  4306. paddw m3, m2
  4307. pslldq m3, 4
  4308. por m4, m3
  4309. vbroadcasti128 m0, [r2+16]
  4310. vbroadcasti128 m2, [r2+17]
  4311. pslldq m1, m0, 1
  4312. pavgb m3, m0, m2
  4313. PRED4x4_LOWPASS m0, m1, m2, m0, m7
  4314. pshufb m1, m0, [off(intra8x9_ddl1)]
  4315. pshufb m2, m0, [off(intra8x9_ddl3)]
  4316. mova pred(3,0), m1
  4317. mova pred(3,1), m2
  4318. psadbw m1, m5
  4319. psadbw m2, m6
  4320. paddw m1, m2
  4321. pslldq m1, 6
  4322. por m4, m1
  4323. vextracti128 xm1, m4, 1
  4324. paddw xm4, xm1
  4325. mova [r4], xm4
  4326. ; for later
  4327. vinserti128 m7, m3, xm0, 1
  4328. vbroadcasti128 m2, [r2+8]
  4329. vbroadcasti128 m0, [r2+7]
  4330. vbroadcasti128 m1, [r2+6]
  4331. pavgb m3, m2, m0
  4332. PRED4x4_LOWPASS m0, m1, m2, m0, m4
  4333. pshufb m1, m0, [off(intra8x9_ddr1)]
  4334. pshufb m2, m0, [off(intra8x9_ddr3)]
  4335. mova pred(4,0), m1
  4336. mova pred(4,1), m2
  4337. psadbw m4, m1, m5
  4338. psadbw m2, m6
  4339. paddw m4, m2
  4340. add r0, 256
  4341. add r5, 0xC0
  4342. %define off(m) (r0+m-(intra8x9_h1+256+128))
  4343. %define pred(i,j) [r5+i*0x40+j*0x20-0x1C0]
  4344. vpblendd m2, m3, m0, 11110011b
  4345. pshufb m1, m2, [off(intra8x9_vr1)]
  4346. pshufb m2, m2, [off(intra8x9_vr3)]
  4347. mova pred(5,0), m1
  4348. mova pred(5,1), m2
  4349. psadbw m1, m5
  4350. psadbw m2, m6
  4351. paddw m1, m2
  4352. pslldq m1, 2
  4353. por m4, m1
  4354. psrldq m2, m3, 4
  4355. pblendw m2, m0, q3330
  4356. punpcklbw m0, m3
  4357. pshufb m1, m2, [off(intra8x9_hd1)]
  4358. pshufb m2, m0, [off(intra8x9_hd3)]
  4359. mova pred(6,0), m1
  4360. mova pred(6,1), m2
  4361. psadbw m1, m5
  4362. psadbw m2, m6
  4363. paddw m1, m2
  4364. pslldq m1, 4
  4365. por m4, m1
  4366. pshufb m1, m7, [off(intra8x9_vl1)]
  4367. pshufb m2, m7, [off(intra8x9_vl3)]
  4368. mova pred(7,0), m1
  4369. mova pred(7,1), m2
  4370. psadbw m1, m5
  4371. psadbw m2, m6
  4372. paddw m1, m2
  4373. pslldq m1, 6
  4374. por m4, m1
  4375. vextracti128 xm1, m4, 1
  4376. paddw xm4, xm1
  4377. mova xm3, [r4]
  4378. SBUTTERFLY qdq, 3, 4, 7
  4379. paddw xm3, xm4
  4380. pslldq m1, m0, 1
  4381. vpbroadcastd m0, [r2+7]
  4382. palignr m0, m1, 1
  4383. pshufb m1, m0, [off(intra8x9_hu1)]
  4384. pshufb m2, m0, [off(intra8x9_hu3)]
  4385. mova pred(8,0), m1
  4386. mova pred(8,1), m2
  4387. psadbw m1, m5
  4388. psadbw m2, m6
  4389. paddw m1, m2
  4390. vextracti128 xm2, m1, 1
  4391. paddw xm1, xm2
  4392. MOVHL xm2, xm1
  4393. paddw xm1, xm2
  4394. movd r2d, xm1
  4395. paddw xm3, [r3]
  4396. mova [r4], xm3
  4397. add r2w, word [r3+16]
  4398. mov [r4+16], r2w
  4399. phminposuw xm3, xm3
  4400. movd r3d, xm3
  4401. add r2d, 8<<16
  4402. cmp r3w, r2w
  4403. cmovg r3d, r2d
  4404. mov r2d, r3d
  4405. shr r3, 16
  4406. shl r3, 6
  4407. add r1, 4*FDEC_STRIDE
  4408. mova xm0, [rsp+r3+0x00]
  4409. mova xm1, [rsp+r3+0x10]
  4410. mova xm2, [rsp+r3+0x20]
  4411. mova xm3, [rsp+r3+0x30]
  4412. movq [r1+FDEC_STRIDE*-4], xm0
  4413. movhps [r1+FDEC_STRIDE*-2], xm0
  4414. movq [r1+FDEC_STRIDE*-3], xm1
  4415. movhps [r1+FDEC_STRIDE*-1], xm1
  4416. movq [r1+FDEC_STRIDE* 0], xm2
  4417. movhps [r1+FDEC_STRIDE* 2], xm2
  4418. movq [r1+FDEC_STRIDE* 1], xm3
  4419. movhps [r1+FDEC_STRIDE* 3], xm3
  4420. mov rsp, r6
  4421. mov eax, r2d
  4422. RET
  4423. %macro SATD_AVX512_LOAD4 2 ; size, opmask
  4424. vpbroadcast%1 m0, [r0]
  4425. vpbroadcast%1 m0 {%2}, [r0+2*r1]
  4426. vpbroadcast%1 m2, [r2]
  4427. vpbroadcast%1 m2 {%2}, [r2+2*r3]
  4428. add r0, r1
  4429. add r2, r3
  4430. vpbroadcast%1 m1, [r0]
  4431. vpbroadcast%1 m1 {%2}, [r0+2*r1]
  4432. vpbroadcast%1 m3, [r2]
  4433. vpbroadcast%1 m3 {%2}, [r2+2*r3]
  4434. %endmacro
  4435. %macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
  4436. vpbroadcast%1 %{2}0, [r0]
  4437. vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
  4438. vpbroadcast%1 %{2}2, [r2]
  4439. vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
  4440. vpbroadcast%1 m0 {%4}, [r0+4*r1]
  4441. vpbroadcast%1 m2 {%4}, [r2+4*r3]
  4442. vpbroadcast%1 m0 {%5}, [r0+2*r4]
  4443. vpbroadcast%1 m2 {%5}, [r2+2*r5]
  4444. vpbroadcast%1 %{2}1, [r0+r1]
  4445. vpbroadcast%1 %{2}1 {%3}, [r0+r4]
  4446. vpbroadcast%1 %{2}3, [r2+r3]
  4447. vpbroadcast%1 %{2}3 {%3}, [r2+r5]
  4448. lea r0, [r0+4*r1]
  4449. lea r2, [r2+4*r3]
  4450. vpbroadcast%1 m1 {%4}, [r0+r1]
  4451. vpbroadcast%1 m3 {%4}, [r2+r3]
  4452. vpbroadcast%1 m1 {%5}, [r0+r4]
  4453. vpbroadcast%1 m3 {%5}, [r2+r5]
  4454. %endmacro
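; the {k}-masked broadcasts above merge several rows into different 128-bit
; lanes of one ymm/zmm register (the "0 0 2 2" style comments show which rows
; land in which lane), so a single packed transform below covers what the xmm
; code needs multiple 8x4 passes for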
  4455. %macro SATD_AVX512_PACKED 0
  4456. DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
  4457. SUMSUB_BA w, 0, 1, 2
  4458. SBUTTERFLY qdq, 0, 1, 2
  4459. SUMSUB_BA w, 0, 1, 2
  4460. HMAXABSW2 0, 1, 2, 3
  4461. %endmacro
  4462. %macro SATD_AVX512_END 0-1 0 ; sa8d
  4463. vpaddw m0 {k1}{z}, m1 ; zero-extend to dwords
  4464. %if ARCH_X86_64
  4465. %if mmsize == 64
  4466. vextracti32x8 ym1, m0, 1
  4467. paddd ym0, ym1
  4468. %endif
  4469. %if mmsize >= 32
  4470. vextracti128 xm1, ym0, 1
  4471. paddd xmm0, xm0, xm1
  4472. %endif
  4473. punpckhqdq xmm1, xmm0, xmm0
  4474. paddd xmm0, xmm1
  4475. movq rax, xmm0
  4476. rorx rdx, rax, 32
  4477. %if %1
  4478. lea eax, [rax+rdx+1]
  4479. shr eax, 1
  4480. %else
  4481. add eax, edx
  4482. %endif
  4483. %else
  4484. HADDD m0, m1
  4485. movd eax, xm0
  4486. %if %1
  4487. inc eax
  4488. shr eax, 1
  4489. %endif
  4490. %endif
  4491. RET
  4492. %endmacro
  4493. %macro HMAXABSW2 4 ; a, b, tmp1, tmp2
  4494. pabsw m%1, m%1
  4495. pabsw m%2, m%2
  4496. psrldq m%3, m%1, 2
  4497. psrld m%4, m%2, 16
  4498. pmaxsw m%1, m%3
  4499. pmaxsw m%2, m%4
  4500. %endmacro
  4501. INIT_ZMM avx512
  4502. cglobal pixel_satd_16x8_internal
  4503. vbroadcasti64x4 m6, [hmul_16p]
  4504. kxnorb k2, k2, k2
  4505. mov r4d, 0x55555555
  4506. knotw k2, k2
  4507. kmovd k1, r4d
  4508. lea r4, [3*r1]
  4509. lea r5, [3*r3]
  4510. satd_16x8_avx512:
  4511. vbroadcasti128 ym0, [r0]
  4512. vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4
  4513. vbroadcasti128 ym4, [r2]
  4514. vbroadcasti32x4 m4 {k2}, [r2+4*r3]
  4515. vbroadcasti128 ym2, [r0+2*r1]
  4516. vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6
  4517. vbroadcasti128 ym5, [r2+2*r3]
  4518. vbroadcasti32x4 m5 {k2}, [r2+2*r5]
  4519. DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
  4520. vbroadcasti128 ym1, [r0+r1]
  4521. vbroadcasti128 ym4, [r2+r3]
  4522. vbroadcasti128 ym3, [r0+r4]
  4523. vbroadcasti128 ym5, [r2+r5]
  4524. lea r0, [r0+4*r1]
  4525. lea r2, [r2+4*r3]
  4526. vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5
  4527. vbroadcasti32x4 m4 {k2}, [r2+r3]
  4528. vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7
  4529. vbroadcasti32x4 m5 {k2}, [r2+r5]
  4530. DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
  4531. HADAMARD4_V 0, 1, 2, 3, 4
  4532. HMAXABSW2 0, 2, 4, 5
  4533. HMAXABSW2 1, 3, 4, 5
  4534. paddw m4, m0, m2 ; m1
  4535. paddw m2, m1, m3 ; m0
  4536. ret
  4537. cglobal pixel_satd_8x8_internal
  4538. vbroadcasti64x4 m4, [hmul_16p]
  4539. mov r4d, 0x55555555
  4540. kmovd k1, r4d ; 01010101
  4541. kshiftlb k2, k1, 5 ; 10100000
  4542. kshiftlb k3, k1, 4 ; 01010000
  4543. lea r4, [3*r1]
  4544. lea r5, [3*r3]
  4545. satd_8x8_avx512:
  4546. SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
  4547. SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5
  4548. ret
  4549. cglobal pixel_satd_16x8, 4,6
  4550. call pixel_satd_16x8_internal_avx512
  4551. jmp satd_zmm_avx512_end
  4552. cglobal pixel_satd_16x16, 4,6
  4553. call pixel_satd_16x8_internal_avx512
  4554. lea r0, [r0+4*r1]
  4555. lea r2, [r2+4*r3]
  4556. paddw m7, m0, m1
  4557. call satd_16x8_avx512
  4558. paddw m1, m7
  4559. jmp satd_zmm_avx512_end
  4560. cglobal pixel_satd_8x8, 4,6
  4561. call pixel_satd_8x8_internal_avx512
  4562. satd_zmm_avx512_end:
  4563. SATD_AVX512_END
  4564. cglobal pixel_satd_8x16, 4,6
  4565. call pixel_satd_8x8_internal_avx512
  4566. lea r0, [r0+4*r1]
  4567. lea r2, [r2+4*r3]
  4568. paddw m5, m0, m1
  4569. call satd_8x8_avx512
  4570. paddw m1, m5
  4571. jmp satd_zmm_avx512_end
  4572. INIT_YMM avx512
  4573. cglobal pixel_satd_4x8_internal
  4574. vbroadcasti128 m4, [hmul_4p]
  4575. mov r4d, 0x55550c
  4576. kmovd k2, r4d ; 00001100
  4577. kshiftlb k3, k2, 2 ; 00110000
  4578. kshiftlb k4, k2, 4 ; 11000000
  4579. kshiftrd k1, k2, 8 ; 01010101
  4580. lea r4, [3*r1]
  4581. lea r5, [3*r3]
  4582. satd_4x8_avx512:
  4583. SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
  4584. satd_ymm_avx512: ; 1 1 3 3 5 5 7 7
  4585. SATD_AVX512_PACKED
  4586. ret
  4587. cglobal pixel_satd_8x4, 4,5
  4588. mova m4, [hmul_16p]
  4589. mov r4d, 0x5555
  4590. kmovw k1, r4d
  4591. SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
  4592. call satd_ymm_avx512 ; 3 1 3 1
  4593. jmp satd_ymm_avx512_end2
  4594. cglobal pixel_satd_4x8, 4,6
  4595. call pixel_satd_4x8_internal_avx512
  4596. satd_ymm_avx512_end:
  4597. %if ARCH_X86_64 == 0
  4598. pop r5d
  4599. %assign regs_used 5
  4600. %endif
  4601. satd_ymm_avx512_end2:
  4602. SATD_AVX512_END
  4603. cglobal pixel_satd_4x16, 4,6
  4604. call pixel_satd_4x8_internal_avx512
  4605. lea r0, [r0+4*r1]
  4606. lea r2, [r2+4*r3]
  4607. paddw m5, m0, m1
  4608. call satd_4x8_avx512
  4609. paddw m1, m5
  4610. jmp satd_ymm_avx512_end
  4611. INIT_XMM avx512
  4612. cglobal pixel_satd_4x4, 4,5
  4613. mova m4, [hmul_4p]
  4614. mov r4d, 0x550c
  4615. kmovw k2, r4d
  4616. kshiftrw k1, k2, 8
  4617. SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
  4618. SATD_AVX512_PACKED ; 1 1 3 3
  4619. SWAP 0, 1
  4620. SATD_AVX512_END
  4621. INIT_ZMM avx512
  4622. cglobal pixel_sa8d_8x8, 4,6
  4623. vbroadcasti64x4 m4, [hmul_16p]
  4624. mov r4d, 0x55555555
  4625. kmovd k1, r4d ; 01010101
  4626. kshiftlb k2, k1, 5 ; 10100000
  4627. kshiftlb k3, k1, 4 ; 01010000
  4628. lea r4, [3*r1]
  4629. lea r5, [3*r3]
  4630. SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
  4631. DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5
  4632. SUMSUB_BA w, 0, 1, 2
  4633. SBUTTERFLY qdq, 0, 1, 2
  4634. SUMSUB_BA w, 0, 1, 2
  4635. shufps m2, m0, m1, q2020
  4636. shufps m1, m0, m1, q3131
  4637. SUMSUB_BA w, 2, 1, 0
  4638. vshufi32x4 m0, m2, m1, q1010
  4639. vshufi32x4 m1, m2, m1, q3232
  4640. SUMSUB_BA w, 0, 1, 2
  4641. HMAXABSW2 0, 1, 2, 3
  4642. SATD_AVX512_END 1
  4643. %endif ; HIGH_BIT_DEPTH
  4644. ;=============================================================================
  4645. ; SSIM
  4646. ;=============================================================================
  4647. ;-----------------------------------------------------------------------------
  4648. ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
  4649. ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
  4650. ;-----------------------------------------------------------------------------
  4651. %macro SSIM_ITER 1
  4652. %if HIGH_BIT_DEPTH
  4653. movu m4, [r0+(%1&1)*r1]
  4654. movu m5, [r2+(%1&1)*r3]
  4655. %elif cpuflag(avx)
  4656. pmovzxbw m4, [r0+(%1&1)*r1]
  4657. pmovzxbw m5, [r2+(%1&1)*r3]
  4658. %else
  4659. movq m4, [r0+(%1&1)*r1]
  4660. movq m5, [r2+(%1&1)*r3]
  4661. punpcklbw m4, m7
  4662. punpcklbw m5, m7
  4663. %endif
  4664. %if %1==1
  4665. lea r0, [r0+r1*2]
  4666. lea r2, [r2+r3*2]
  4667. %endif
  4668. %if %1 == 0 && cpuflag(avx)
  4669. SWAP 0, 4
  4670. SWAP 1, 5
  4671. pmaddwd m4, m0, m0
  4672. pmaddwd m5, m1, m1
  4673. pmaddwd m6, m0, m1
  4674. %else
  4675. %if %1 == 0
  4676. mova m0, m4
  4677. mova m1, m5
  4678. %else
  4679. paddw m0, m4
  4680. paddw m1, m5
  4681. %endif
  4682. pmaddwd m6, m4, m5
  4683. pmaddwd m4, m4
  4684. pmaddwd m5, m5
  4685. %endif
  4686. ACCUM paddd, 2, 4, %1
  4687. ACCUM paddd, 3, 6, %1
  4688. paddd m2, m5
  4689. %endmacro
  4690. %macro SSIM 0
  4691. %if HIGH_BIT_DEPTH
  4692. cglobal pixel_ssim_4x4x2_core, 4,4,7
  4693. FIX_STRIDES r1, r3
  4694. %else
  4695. cglobal pixel_ssim_4x4x2_core, 4,4,7+notcpuflag(avx)
  4696. %if notcpuflag(avx)
  4697. pxor m7, m7
  4698. %endif
  4699. %endif
  4700. SSIM_ITER 0
  4701. SSIM_ITER 1
  4702. SSIM_ITER 2
  4703. SSIM_ITER 3
  4704. %if UNIX64
  4705. DECLARE_REG_TMP 4
  4706. %else
  4707. DECLARE_REG_TMP 0
  4708. mov t0, r4mp
  4709. %endif
  4710. %if cpuflag(ssse3)
  4711. phaddw m0, m1
  4712. pmaddwd m0, [pw_1]
  4713. phaddd m2, m3
  4714. %else
  4715. mova m4, [pw_1]
  4716. pmaddwd m0, m4
  4717. pmaddwd m1, m4
  4718. packssdw m0, m1
  4719. shufps m1, m2, m3, q2020
  4720. shufps m2, m3, q3131
  4721. pmaddwd m0, m4
  4722. paddd m2, m1
  4723. %endif
  4724. shufps m1, m0, m2, q2020
  4725. shufps m0, m2, q3131
  4726. mova [t0], m1
  4727. mova [t0+16], m0
  4728. RET
  4729. ;-----------------------------------------------------------------------------
  4730. ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width )
  4731. ;-----------------------------------------------------------------------------
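; each sums[i] entry holds { s1 = sum(pix1), s2 = sum(pix2),
; ss = sum(pix1*pix1 + pix2*pix2), s12 = sum(pix1*pix2) } for one 4x4 block;
; the ssim term computed below is, in rough C (reconstructed from the code,
; not part of the original source):
;     vars  = ss*64 - s1*s1 - s2*s2;
;     covar = s12*64 - s1*s2;
;     ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;           / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2));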
  4732. cglobal pixel_ssim_end4, 2,3
  4733. mov r2d, r2m
  4734. mova m0, [r0+ 0]
  4735. mova m1, [r0+16]
  4736. mova m2, [r0+32]
  4737. mova m3, [r0+48]
  4738. mova m4, [r0+64]
  4739. paddd m0, [r1+ 0]
  4740. paddd m1, [r1+16]
  4741. paddd m2, [r1+32]
  4742. paddd m3, [r1+48]
  4743. paddd m4, [r1+64]
  4744. paddd m0, m1
  4745. paddd m1, m2
  4746. paddd m2, m3
  4747. paddd m3, m4
  4748. TRANSPOSE4x4D 0, 1, 2, 3, 4
  4749. ; s1=m0, s2=m1, ss=m2, s12=m3
  4750. %if BIT_DEPTH == 10
  4751. cvtdq2ps m0, m0
  4752. cvtdq2ps m1, m1
  4753. cvtdq2ps m2, m2
  4754. cvtdq2ps m3, m3
  4755. mulps m4, m0, m1 ; s1*s2
  4756. mulps m0, m0 ; s1*s1
  4757. mulps m1, m1 ; s2*s2
  4758. mulps m2, [pf_64] ; ss*64
  4759. mulps m3, [pf_128] ; s12*128
  4760. addps m4, m4 ; s1*s2*2
  4761. addps m0, m1 ; s1*s1 + s2*s2
  4762. subps m2, m0 ; vars
  4763. subps m3, m4 ; covar*2
  4764. movaps m1, [ssim_c1]
  4765. addps m4, m1 ; s1*s2*2 + ssim_c1
  4766. addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
  4767. movaps m1, [ssim_c2]
  4768. addps m2, m1 ; vars + ssim_c2
  4769. addps m3, m1 ; covar*2 + ssim_c2
  4770. %else
  4771. pmaddwd m4, m1, m0 ; s1*s2
  4772. pslld m1, 16
  4773. por m0, m1
  4774. pmaddwd m0, m0 ; s1*s1 + s2*s2
  4775. pslld m4, 1
  4776. pslld m3, 7
  4777. pslld m2, 6
  4778. psubd m3, m4 ; covar*2
  4779. psubd m2, m0 ; vars
  4780. mova m1, [ssim_c1]
  4781. paddd m0, m1
  4782. paddd m4, m1
  4783. mova m1, [ssim_c2]
  4784. paddd m3, m1
  4785. paddd m2, m1
  4786. cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
  4787. cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
  4788. cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
  4789. cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
  4790. %endif
  4791. mulps m4, m3
  4792. mulps m0, m2
  4793. divps m4, m0 ; ssim
  4794. cmp r2d, 4
  4795. je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
  4796. neg r2
  4797. %ifdef PIC
  4798. lea r3, [mask_ff + 16]
  4799. %xdefine %%mask r3
  4800. %else
  4801. %xdefine %%mask mask_ff + 16
  4802. %endif
  4803. %if cpuflag(avx)
  4804. andps m4, [%%mask + r2*4]
  4805. %else
  4806. movups m0, [%%mask + r2*4]
  4807. andps m4, m0
  4808. %endif
  4809. .skip:
  4810. movhlps m0, m4
  4811. addps m0, m4
  4812. %if cpuflag(ssse3)
  4813. movshdup m4, m0
  4814. %else
  4815. pshuflw m4, m0, q0032
  4816. %endif
  4817. addss m0, m4
  4818. %if ARCH_X86_64 == 0
  4819. movss r0m, m0
  4820. fld dword r0m
  4821. %endif
  4822. RET
  4823. %endmacro ; SSIM
  4824. INIT_XMM sse2
  4825. SSIM
  4826. INIT_XMM avx
  4827. SSIM
  4828. ;-----------------------------------------------------------------------------
  4829. ; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
  4830. ;-----------------------------------------------------------------------------
  4831. %macro ASD8 0
  4832. cglobal pixel_asd8, 5,5
  4833. pxor m0, m0
  4834. pxor m1, m1
  4835. .loop:
  4836. %if HIGH_BIT_DEPTH
  4837. paddw m0, [r0]
  4838. paddw m1, [r2]
  4839. paddw m0, [r0+2*r1]
  4840. paddw m1, [r2+2*r3]
  4841. lea r0, [r0+4*r1]
  4842. paddw m0, [r0]
  4843. paddw m1, [r2+4*r3]
  4844. lea r2, [r2+4*r3]
  4845. paddw m0, [r0+2*r1]
  4846. paddw m1, [r2+2*r3]
  4847. lea r0, [r0+4*r1]
  4848. lea r2, [r2+4*r3]
  4849. %else
  4850. movq m2, [r0]
  4851. movq m3, [r2]
  4852. movhps m2, [r0+r1]
  4853. movhps m3, [r2+r3]
  4854. lea r0, [r0+2*r1]
  4855. psadbw m2, m1
  4856. psadbw m3, m1
  4857. movq m4, [r0]
  4858. movq m5, [r2+2*r3]
  4859. lea r2, [r2+2*r3]
  4860. movhps m4, [r0+r1]
  4861. movhps m5, [r2+r3]
  4862. lea r0, [r0+2*r1]
  4863. paddw m0, m2
  4864. psubw m0, m3
  4865. psadbw m4, m1
  4866. psadbw m5, m1
  4867. lea r2, [r2+2*r3]
  4868. paddw m0, m4
  4869. psubw m0, m5
  4870. %endif
  4871. sub r4d, 4
  4872. jg .loop
  4873. %if HIGH_BIT_DEPTH
  4874. psubw m0, m1
  4875. HADDW m0, m1
  4876. ABSD m1, m0
  4877. %else
  4878. MOVHL m1, m0
  4879. paddw m0, m1
  4880. ABSW m1, m0
  4881. %endif
  4882. movd eax, m1
  4883. RET
  4884. %endmacro
  4885. INIT_XMM sse2
  4886. ASD8
  4887. INIT_XMM ssse3
  4888. ASD8
  4889. %if HIGH_BIT_DEPTH
  4890. INIT_XMM xop
  4891. ASD8
  4892. %endif
  4893. ;=============================================================================
  4894. ; Successive Elimination ADS
  4895. ;=============================================================================
  4896. %macro ADS_START 0
  4897. %if UNIX64
  4898. movsxd r5, r5d
  4899. %else
  4900. mov r5d, r5m
  4901. %endif
  4902. mov r0d, r5d
  4903. lea r6, [r4+r5+(mmsize-1)]
  4904. and r6, ~(mmsize-1)
  4905. shl r2d, 1
  4906. %endmacro
  4907. %macro ADS_END 1 ; unroll_size
  4908. add r1, 8*%1
  4909. add r3, 8*%1
  4910. add r6, 4*%1
  4911. sub r0d, 4*%1
  4912. jg .loop
  4913. WIN64_RESTORE_XMM
  4914. %if mmsize==32
  4915. vzeroupper
  4916. %endif
  4917. lea r6, [r4+r5+(mmsize-1)]
  4918. and r6, ~(mmsize-1)
  4919. %if cpuflag(ssse3)
  4920. jmp ads_mvs_ssse3
  4921. %else
  4922. jmp ads_mvs_mmx
  4923. %endif
  4924. %endmacro
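; ADS_END tail-jumps into ads_mvs_*, so the pixel_ads* functions return the
; motion vector count produced from the mask buffer (see the C reference near
; pixel_ads_mvs further down)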
  4925. ;-----------------------------------------------------------------------------
  4926. ; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
  4927. ; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
  4928. ;-----------------------------------------------------------------------------
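; rough C equivalent, reconstructed for reference (not a comment from the
; original source; masks is the temporary byte buffer ADS_END hands to ads_mvs,
; and the narrower variants drop terms accordingly):
;     for( i = 0; i < width; i++ )
;         masks[i] = ( abs(enc_dc[0] - sums[i])       + abs(enc_dc[1] - sums[i+8])
;                    + abs(enc_dc[2] - sums[i+delta]) + abs(enc_dc[3] - sums[i+delta+8])
;                    + cost_mvx[i] ) < thresh;
;     return pixel_ads_mvs( mvs, masks, width );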
  4929. INIT_MMX mmx2
  4930. cglobal pixel_ads4, 5,7
  4931. mova m6, [r0]
  4932. mova m4, [r0+8]
  4933. pshufw m7, m6, 0
  4934. pshufw m6, m6, q2222
  4935. pshufw m5, m4, 0
  4936. pshufw m4, m4, q2222
  4937. ADS_START
  4938. .loop:
  4939. movu m0, [r1]
  4940. movu m1, [r1+16]
  4941. psubw m0, m7
  4942. psubw m1, m6
  4943. ABSW m0, m0, m2
  4944. ABSW m1, m1, m3
  4945. movu m2, [r1+r2]
  4946. movu m3, [r1+r2+16]
  4947. psubw m2, m5
  4948. psubw m3, m4
  4949. paddw m0, m1
  4950. ABSW m2, m2, m1
  4951. ABSW m3, m3, m1
  4952. paddw m0, m2
  4953. paddw m0, m3
  4954. pshufw m1, r6m, 0
  4955. paddusw m0, [r3]
  4956. psubusw m1, m0
  4957. packsswb m1, m1
  4958. movd [r6], m1
  4959. ADS_END 1
  4960. cglobal pixel_ads2, 5,7
  4961. mova m6, [r0]
  4962. pshufw m5, r6m, 0
  4963. pshufw m7, m6, 0
  4964. pshufw m6, m6, q2222
  4965. ADS_START
  4966. .loop:
  4967. movu m0, [r1]
  4968. movu m1, [r1+r2]
  4969. psubw m0, m7
  4970. psubw m1, m6
  4971. ABSW m0, m0, m2
  4972. ABSW m1, m1, m3
  4973. paddw m0, m1
  4974. paddusw m0, [r3]
  4975. mova m4, m5
  4976. psubusw m4, m0
  4977. packsswb m4, m4
  4978. movd [r6], m4
  4979. ADS_END 1
  4980. cglobal pixel_ads1, 5,7
  4981. pshufw m7, [r0], 0
  4982. pshufw m6, r6m, 0
  4983. ADS_START
  4984. .loop:
  4985. movu m0, [r1]
  4986. movu m1, [r1+8]
  4987. psubw m0, m7
  4988. psubw m1, m7
  4989. ABSW m0, m0, m2
  4990. ABSW m1, m1, m3
  4991. paddusw m0, [r3]
  4992. paddusw m1, [r3+8]
  4993. mova m4, m6
  4994. mova m5, m6
  4995. psubusw m4, m0
  4996. psubusw m5, m1
  4997. packsswb m4, m5
  4998. mova [r6], m4
  4999. ADS_END 2
  5000. %macro ADS_XMM 0
  5001. %if mmsize==32
  5002. cglobal pixel_ads4, 5,7,8
  5003. vpbroadcastw m7, [r0+ 0]
  5004. vpbroadcastw m6, [r0+ 4]
  5005. vpbroadcastw m5, [r0+ 8]
  5006. vpbroadcastw m4, [r0+12]
  5007. %else
  5008. cglobal pixel_ads4, 5,7,12
  5009. mova m4, [r0]
  5010. pshuflw m7, m4, q0000
  5011. pshuflw m6, m4, q2222
  5012. pshufhw m5, m4, q0000
  5013. pshufhw m4, m4, q2222
  5014. punpcklqdq m7, m7
  5015. punpcklqdq m6, m6
  5016. punpckhqdq m5, m5
  5017. punpckhqdq m4, m4
  5018. %endif
  5019. %if ARCH_X86_64 && mmsize == 16
    movd    m8, r6m
    SPLATW  m8, m8
    ADS_START
    movu   m10, [r1]
    movu   m11, [r1+r2]
.loop:
    psubw   m0, m10, m7
    movu   m10, [r1+16]
    psubw   m1, m10, m6
    ABSW    m0, m0, m2
    ABSW    m1, m1, m3
    psubw   m2, m11, m5
    movu   m11, [r1+r2+16]
    paddw   m0, m1
    psubw   m3, m11, m4
    movu    m9, [r3]
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    paddw   m0, m2
    paddw   m0, m3
    paddusw m0, m9
    psubusw m1, m8, m0
%else
    ADS_START
.loop:
    movu    m0, [r1]
    movu    m1, [r1+16]
    psubw   m0, m7
    psubw   m1, m6
    ABSW    m0, m0, m2
    ABSW    m1, m1, m3
    movu    m2, [r1+r2]
    movu    m3, [r1+r2+16]
    psubw   m2, m5
    psubw   m3, m4
    paddw   m0, m1
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    paddw   m0, m2
    paddw   m0, m3
    movu    m2, [r3]
%if mmsize==32
    vpbroadcastw m1, r6m
%else
    movd    m1, r6m
    pshuflw m1, m1, 0
    punpcklqdq m1, m1
%endif
    paddusw m0, m2
    psubusw m1, m0
%endif ; ARCH
    packsswb m1, m1
%if mmsize==32
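    ; ymm packsswb packs within each 128-bit lane, so reorder the qwords
    ; (q3120) before storing the low 16 mask bytes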
    vpermq  m1, m1, q3120
    mova    [r6], xm1
%else
    movh    [r6], m1
%endif
    ADS_END mmsize/8
cglobal pixel_ads2, 5,7,8
%if mmsize==32
    vpbroadcastw m7, [r0+0]
    vpbroadcastw m6, [r0+4]
    vpbroadcastw m5, r6m
%else
    movq    m6, [r0]
    movd    m5, r6m
    pshuflw m7, m6, 0
    pshuflw m6, m6, q2222
    pshuflw m5, m5, 0
    punpcklqdq m7, m7
    punpcklqdq m6, m6
    punpcklqdq m5, m5
%endif
    ADS_START
.loop:
    movu    m0, [r1]
    movu    m1, [r1+r2]
    psubw   m0, m7
    psubw   m1, m6
    movu    m4, [r3]
    ABSW    m0, m0, m2
    ABSW    m1, m1, m3
    paddw   m0, m1
    paddusw m0, m4
    psubusw m1, m5, m0
    packsswb m1, m1
%if mmsize==32
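    ; same ymm lane-order fixup as in pixel_ads4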
    vpermq  m1, m1, q3120
    mova    [r6], xm1
%else
    movh    [r6], m1
%endif
    ADS_END mmsize/8
cglobal pixel_ads1, 5,7,8
%if mmsize==32
    vpbroadcastw m7, [r0]
    vpbroadcastw m6, r6m
%else
    movd    m7, [r0]
    movd    m6, r6m
    pshuflw m7, m7, 0
    pshuflw m6, m6, 0
    punpcklqdq m7, m7
    punpcklqdq m6, m6
%endif
    ADS_START
.loop:
    movu    m0, [r1]
    movu    m1, [r1+mmsize]
    psubw   m0, m7
    psubw   m1, m7
    movu    m2, [r3]
    movu    m3, [r3+mmsize]
    ABSW    m0, m0, m4
    ABSW    m1, m1, m5
    paddusw m0, m2
    paddusw m1, m3
    psubusw m4, m6, m0
    psubusw m5, m6, m1
    packsswb m4, m5
%if mmsize==32
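    ; same ymm lane-order fixup as in pixel_ads4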
    vpermq  m4, m4, q3120
%endif
    mova    [r6], m4
    ADS_END mmsize/4
%endmacro

INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
ADS_XMM
INIT_XMM avx
ADS_XMM
INIT_YMM avx2
ADS_XMM
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
;     int nmv=0, i, j;
;     *(uint32_t*)(masks+width) = 0;
;     for( i=0; i<width; i+=8 )
;     {
;         uint64_t mask = *(uint64_t*)(masks+i);
;         if( !mask ) continue;
;         for( j=0; j<8; j++ )
;             if( mask & (255ULL<<j*8) )
;                 mvs[nmv++] = i+j;
;     }
;     return nmv;
; }
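; TEST n: branchless append of one candidate. The running index (r1w) is always
; stored to mvs[nmv]; nmv (r0d) advances only if mask byte n of r2d is nonzero.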
%macro TEST 1
    mov     [r4+r0*2], r1w
    test    r2d, 0xff<<(%1*8)
    setne   r3b
    add     r0d, r3d
    inc     r1d
%endmacro
INIT_MMX mmx
cglobal pixel_ads_mvs, 0,7,0
ads_mvs_mmx:
    ; mvs = r4
    ; masks = r6
    ; width = r5
    ; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
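    ; r0 = nmv (return value), r1 = running candidate index; 8 mask bytes are
    ; handled per iteration, and all-zero groups are skipped via .loopi0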
    xor     r0d, r0d
    xor     r1d, r1d
    mov     [r6+r5], r0d
    jmp .loopi
ALIGN 16
.loopi0:
    add     r1d, 8
    cmp     r1d, r5d
    jge .end
.loopi:
    mov     r2, [r6+r1]
%if ARCH_X86_64
    test    r2, r2
%else
    mov     r3, r2
    add     r3d, [r6+r1+4]
%endif
    jz .loopi0
    xor     r3d, r3d
    TEST 0
    TEST 1
    TEST 2
    TEST 3
%if ARCH_X86_64
    shr     r2, 32
%else
    mov     r2d, [r6+r1]
%endif
    TEST 0
    TEST 1
    TEST 2
    TEST 3
    cmp     r1d, r5d
    jl .loopi
.end:
    movifnidn eax, r0d
    RET
INIT_XMM ssse3
cglobal pixel_ads_mvs, 0,7,0
ads_mvs_ssse3:
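    ; Per group of 8 mask bytes: pcmpeqb/pmovmskb + xor produce one bit per
    ; nonzero mask byte, popcnt_table gives the number of hits, and a 16-byte
    ; ads_mvs_shuffle entry (indexed by mask*16, hence the add r2d, r2d before
    ; the *8 addressing) pshufb-compacts the running index vector m4 so the
    ; surviving indices land contiguously at mvs+nmv; m4 is then advanced by
    ; pw_8 for the next group.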
    mova    m3, [pw_8]
    mova    m4, [pw_76543210]
    pxor    m5, m5
    add     r5, r6
    xor     r0d, r0d ; nmv
    mov     [r5], r0d
%ifdef PIC
    lea     r1, [$$]
    %define GLOBAL +r1-$$
%else
    %define GLOBAL
%endif
.loop:
    movh    m0, [r6]
    pcmpeqb m0, m5
    pmovmskb r2d, m0
    xor     r2d, 0xffff ; skipping if r2d is zero is slower (branch mispredictions)
    movzx   r3d, byte [r2+popcnt_table GLOBAL] ; popcnt
    add     r2d, r2d
    ; shuffle counters based on mv mask
    pshufb  m2, m4, [r2*8+ads_mvs_shuffle GLOBAL]
    movu    [r4+r0*2], m2
    add     r0d, r3d
    paddw   m4, m3 ; {i*8+0, i*8+1, i*8+2, i*8+3, i*8+4, i*8+5, i*8+6, i*8+7}
    add     r6, 8
    cmp     r6, r5
    jl .loop
    movifnidn eax, r0d
    RET