  1. ;*****************************************************************************
  2. ;* sad-a.asm: x86 sad functions
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2003-2018 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Fiona Glaser <fiona@x264.com>
  8. ;* Laurent Aimar <fenrir@via.ecp.fr>
  9. ;* Alex Izvorski <aizvorksi@gmail.com>
  10. ;*
  11. ;* This program is free software; you can redistribute it and/or modify
  12. ;* it under the terms of the GNU General Public License as published by
  13. ;* the Free Software Foundation; either version 2 of the License, or
  14. ;* (at your option) any later version.
  15. ;*
  16. ;* This program is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. ;* GNU General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU General Public License
  22. ;* along with this program; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  24. ;*
  25. ;* This program is also available under a commercial proprietary license.
  26. ;* For more information, contact us at licensing@x264.com.
  27. ;*****************************************************************************
  28. %include "x86inc.asm"
  29. %include "x86util.asm"
  30. SECTION_RODATA 32
  31. pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
  32. hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
  33. SECTION .text
  34. cextern pb_3
  35. cextern pb_shuf8x8c
  36. cextern pw_8
  37. cextern sw_64
  38. ;=============================================================================
  39. ; SAD MMX
  40. ;=============================================================================
  41. %macro SAD_INC_2x16P 0
  42. movq mm1, [r0]
  43. movq mm2, [r0+8]
  44. movq mm3, [r0+r1]
  45. movq mm4, [r0+r1+8]
  46. psadbw mm1, [r2]
  47. psadbw mm2, [r2+8]
  48. psadbw mm3, [r2+r3]
  49. psadbw mm4, [r2+r3+8]
  50. lea r0, [r0+2*r1]
  51. paddw mm1, mm2
  52. paddw mm3, mm4
  53. lea r2, [r2+2*r3]
  54. paddw mm0, mm1
  55. paddw mm0, mm3
  56. %endmacro
  57. %macro SAD_INC_2x8P 0
  58. movq mm1, [r0]
  59. movq mm2, [r0+r1]
  60. psadbw mm1, [r2]
  61. psadbw mm2, [r2+r3]
  62. lea r0, [r0+2*r1]
  63. paddw mm0, mm1
  64. paddw mm0, mm2
  65. lea r2, [r2+2*r3]
  66. %endmacro
  67. %macro SAD_INC_2x4P 0
  68. movd mm1, [r0]
  69. movd mm2, [r2]
  70. punpckldq mm1, [r0+r1]
  71. punpckldq mm2, [r2+r3]
  72. psadbw mm1, mm2
  73. paddw mm0, mm1
  74. lea r0, [r0+2*r1]
  75. lea r2, [r2+2*r3]
  76. %endmacro
  77. ;-----------------------------------------------------------------------------
  78. ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
  79. ;-----------------------------------------------------------------------------
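;-----------------------------------------------------------------------------
; For reference, a rough scalar C sketch of what every pixel_sad_WxH variant
; in this file computes (illustrative only, not the project's actual C code;
; sad_WxH, W and H are hypothetical names standing in for each block size):
;
;   static int sad_WxH( uint8_t *pix1, intptr_t stride1,
;                       uint8_t *pix2, intptr_t stride2 )
;   {
;       int sum = 0;
;       for( int y = 0; y < H; y++, pix1 += stride1, pix2 += stride2 )
;           for( int x = 0; x < W; x++ )
;               sum += abs( pix1[x] - pix2[x] );
;       return sum;
;   }
;
; The MMX code below handles two rows per iteration; PSADBW already sums the
; absolute differences within each 8-byte group, so only the per-row partial
; sums need to be accumulated.
;-----------------------------------------------------------------------------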
  80. %macro SAD 2
  81. cglobal pixel_sad_%1x%2_mmx2, 4,4
  82. pxor mm0, mm0
  83. %rep %2/2
  84. SAD_INC_2x%1P
  85. %endrep
  86. movd eax, mm0
  87. RET
  88. %endmacro
  89. SAD 16, 16
  90. SAD 16, 8
  91. SAD 8, 16
  92. SAD 8, 8
  93. SAD 8, 4
  94. SAD 4, 16
  95. SAD 4, 8
  96. SAD 4, 4
  97. ;=============================================================================
  98. ; SAD XMM
  99. ;=============================================================================
  100. %macro SAD_END_SSE2 0
  101. MOVHL m1, m0
  102. paddw m0, m1
  103. movd eax, m0
  104. RET
  105. %endmacro
  106. ;-----------------------------------------------------------------------------
  107. ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
  108. ;-----------------------------------------------------------------------------
  109. %macro SAD_W16 1 ; h
  110. cglobal pixel_sad_16x%1, 4,4
  111. %ifidn cpuname, sse2
  112. .skip_prologue:
  113. %endif
  114. %assign %%i 0
  115. %if ARCH_X86_64
  116. lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
  117. lea r5, [3*r3]
  118. %rep %1/4
  119. movu m1, [r2]
  120. psadbw m1, [r0]
  121. movu m3, [r2+r3]
  122. psadbw m3, [r0+r1]
  123. movu m2, [r2+2*r3]
  124. psadbw m2, [r0+2*r1]
  125. movu m4, [r2+r5]
  126. psadbw m4, [r0+r6]
  127. %if %%i != %1/4-1
  128. lea r2, [r2+4*r3]
  129. lea r0, [r0+4*r1]
  130. %endif
  131. paddw m1, m3
  132. paddw m2, m4
  133. ACCUM paddw, 0, 1, %%i
  134. paddw m0, m2
  135. %assign %%i %%i+1
  136. %endrep
  137. %else ; The cost of having to save and restore registers on x86-32
  138. %rep %1/2 ; nullifies the benefit of having 3*stride in registers.
  139. movu m1, [r2]
  140. psadbw m1, [r0]
  141. movu m2, [r2+r3]
  142. psadbw m2, [r0+r1]
  143. %if %%i != %1/2-1
  144. lea r2, [r2+2*r3]
  145. lea r0, [r0+2*r1]
  146. %endif
  147. ACCUM paddw, 0, 1, %%i
  148. paddw m0, m2
  149. %assign %%i %%i+1
  150. %endrep
  151. %endif
  152. SAD_END_SSE2
  153. %endmacro
  154. INIT_XMM sse2
  155. SAD_W16 16
  156. SAD_W16 8
  157. INIT_XMM sse3
  158. SAD_W16 16
  159. SAD_W16 8
  160. INIT_XMM sse2, aligned
  161. SAD_W16 16
  162. SAD_W16 8
  163. %macro SAD_INC_4x8P_SSE 1
  164. movq m1, [r0]
  165. movq m2, [r0+r1]
  166. lea r0, [r0+2*r1]
  167. movq m3, [r2]
  168. movq m4, [r2+r3]
  169. lea r2, [r2+2*r3]
  170. movhps m1, [r0]
  171. movhps m2, [r0+r1]
  172. movhps m3, [r2]
  173. movhps m4, [r2+r3]
  174. lea r0, [r0+2*r1]
  175. psadbw m1, m3
  176. psadbw m2, m4
  177. lea r2, [r2+2*r3]
  178. ACCUM paddw, 0, 1, %1
  179. paddw m0, m2
  180. %endmacro
  181. INIT_XMM
  182. ;Even on Nehalem, no sizes other than 8x16 benefit from this method.
  183. cglobal pixel_sad_8x16_sse2, 4,4
  184. SAD_INC_4x8P_SSE 0
  185. SAD_INC_4x8P_SSE 1
  186. SAD_INC_4x8P_SSE 1
  187. SAD_INC_4x8P_SSE 1
  188. SAD_END_SSE2
  189. %macro SAD_W48_AVX512 3 ; w, h, d/q
  190. cglobal pixel_sad_%1x%2, 4,4
  191. kxnorb k1, k1, k1
  192. kaddb k1, k1, k1
  193. %assign %%i 0
  194. %if ARCH_X86_64 && %2 != 4
  195. lea r6, [3*r1]
  196. lea r5, [3*r3]
  197. %rep %2/4
  198. mov%3 m1, [r0]
  199. vpbroadcast%3 m1 {k1}, [r0+r1]
  200. mov%3 m3, [r2]
  201. vpbroadcast%3 m3 {k1}, [r2+r3]
  202. mov%3 m2, [r0+2*r1]
  203. vpbroadcast%3 m2 {k1}, [r0+r6]
  204. mov%3 m4, [r2+2*r3]
  205. vpbroadcast%3 m4 {k1}, [r2+r5]
  206. %if %%i != %2/4-1
  207. lea r0, [r0+4*r1]
  208. lea r2, [r2+4*r3]
  209. %endif
  210. psadbw m1, m3
  211. psadbw m2, m4
  212. ACCUM paddd, 0, 1, %%i
  213. paddd m0, m2
  214. %assign %%i %%i+1
  215. %endrep
  216. %else
  217. %rep %2/2
  218. mov%3 m1, [r0]
  219. vpbroadcast%3 m1 {k1}, [r0+r1]
  220. mov%3 m2, [r2]
  221. vpbroadcast%3 m2 {k1}, [r2+r3]
  222. %if %%i != %2/2-1
  223. lea r0, [r0+2*r1]
  224. lea r2, [r2+2*r3]
  225. %endif
  226. psadbw m1, m2
  227. ACCUM paddd, 0, 1, %%i
  228. %assign %%i %%i+1
  229. %endrep
  230. %endif
  231. %if %1 == 8
  232. punpckhqdq m1, m0, m0
  233. paddd m0, m1
  234. %endif
  235. movd eax, m0
  236. RET
  237. %endmacro
  238. INIT_XMM avx512
  239. SAD_W48_AVX512 4, 4, d
  240. SAD_W48_AVX512 4, 8, d
  241. SAD_W48_AVX512 4, 16, d
  242. SAD_W48_AVX512 8, 4, q
  243. SAD_W48_AVX512 8, 8, q
  244. SAD_W48_AVX512 8, 16, q
  245. %macro SAD_W16_AVX512_START 1 ; h
  246. cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which
  247. jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
  248. lea r1, [3*r3]
  249. %endmacro
  250. %macro SAD_W16_AVX512_END 0
  251. paddd m0, m1
  252. paddd m0, m2
  253. paddd m0, m3
  254. %if mmsize == 64
  255. vextracti32x8 ym1, m0, 1
  256. paddd ym0, ym1
  257. %endif
  258. vextracti128 xm1, ym0, 1
  259. paddd xmm0, xm0, xm1
  260. punpckhqdq xmm1, xmm0, xmm0
  261. paddd xmm0, xmm1
  262. movd eax, xmm0
  263. RET
  264. %endmacro
  265. INIT_YMM avx512
  266. cglobal pixel_sad_16x8, 4,4
  267. SAD_W16_AVX512_START 8
  268. movu xm0, [r2]
  269. vinserti128 m0, [r2+r3], 1
  270. psadbw m0, [r0+0*32]
  271. movu xm1, [r2+2*r3]
  272. vinserti128 m1, [r2+r1], 1
  273. lea r2, [r2+4*r3]
  274. psadbw m1, [r0+1*32]
  275. movu xm2, [r2]
  276. vinserti128 m2, [r2+r3], 1
  277. psadbw m2, [r0+2*32]
  278. movu xm3, [r2+2*r3]
  279. vinserti128 m3, [r2+r1], 1
  280. psadbw m3, [r0+3*32]
  281. SAD_W16_AVX512_END
  282. INIT_ZMM avx512
  283. cglobal pixel_sad_16x16, 4,4
  284. SAD_W16_AVX512_START 16
  285. movu xm0, [r2]
  286. vinserti128 ym0, [r2+r3], 1
  287. movu xm1, [r2+4*r3]
  288. vinserti32x4 m0, [r2+2*r3], 2
  289. vinserti32x4 m1, [r2+2*r1], 2
  290. vinserti32x4 m0, [r2+r1], 3
  291. lea r2, [r2+4*r3]
  292. vinserti32x4 m1, [r2+r3], 1
  293. psadbw m0, [r0+0*64]
  294. vinserti32x4 m1, [r2+r1], 3
  295. lea r2, [r2+4*r3]
  296. psadbw m1, [r0+1*64]
  297. movu xm2, [r2]
  298. vinserti128 ym2, [r2+r3], 1
  299. movu xm3, [r2+4*r3]
  300. vinserti32x4 m2, [r2+2*r3], 2
  301. vinserti32x4 m3, [r2+2*r1], 2
  302. vinserti32x4 m2, [r2+r1], 3
  303. lea r2, [r2+4*r3]
  304. vinserti32x4 m3, [r2+r3], 1
  305. psadbw m2, [r0+2*64]
  306. vinserti32x4 m3, [r2+r1], 3
  307. psadbw m3, [r0+3*64]
  308. SAD_W16_AVX512_END
  309. ;-----------------------------------------------------------------------------
  310. ; void pixel_vsad( pixel *src, intptr_t stride );
  311. ;-----------------------------------------------------------------------------
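; Hedged C sketch of the result, as inferred from the code below: a running
; SAD between each 16-pixel row and the row above it (the vsad_16 name and
; the loop structure are the only things assumed here):
;
;   static int vsad_16( uint8_t *src, intptr_t stride, int height )
;   {
;       int sum = 0;
;       for( int y = 1; y < height; y++ )
;           for( int x = 0; x < 16; x++ )
;               sum += abs( src[y*stride+x] - src[(y-1)*stride+x] );
;       return sum;
;   }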
  312. %if ARCH_X86_64 == 0
  313. INIT_MMX
  314. cglobal pixel_vsad_mmx2, 3,3
  315. mova m0, [r0]
  316. mova m1, [r0+8]
  317. mova m2, [r0+r1]
  318. mova m3, [r0+r1+8]
  319. lea r0, [r0+r1*2]
  320. psadbw m0, m2
  321. psadbw m1, m3
  322. paddw m0, m1
  323. sub r2d, 2
  324. je .end
  325. .loop:
  326. mova m4, [r0]
  327. mova m5, [r0+8]
  328. mova m6, [r0+r1]
  329. mova m7, [r0+r1+8]
  330. lea r0, [r0+r1*2]
  331. psadbw m2, m4
  332. psadbw m3, m5
  333. psadbw m4, m6
  334. psadbw m5, m7
  335. ;max sum: 31*16*255(pixel_max)=126480
  336. paddd m0, m2
  337. paddd m0, m3
  338. paddd m0, m4
  339. paddd m0, m5
  340. mova m2, m6
  341. mova m3, m7
  342. sub r2d, 2
  343. jg .loop
  344. .end:
  345. movd eax, m0
  346. RET
  347. %endif
  348. INIT_XMM
  349. cglobal pixel_vsad_sse2, 3,3
  350. mova m0, [r0]
  351. mova m1, [r0+r1]
  352. lea r0, [r0+r1*2]
  353. psadbw m0, m1
  354. sub r2d, 2
  355. je .end
  356. .loop:
  357. mova m2, [r0]
  358. mova m3, [r0+r1]
  359. lea r0, [r0+r1*2]
  360. psadbw m1, m2
  361. psadbw m2, m3
  362. paddw m0, m1
  363. paddw m0, m2
  364. mova m1, m3
  365. sub r2d, 2
  366. jg .loop
  367. .end:
  368. MOVHL m1, m0
  369. ;max sum: 31*16*255(pixel_max)=126480
  370. paddd m0, m1
  371. movd eax, m0
  372. RET
  373. ;-----------------------------------------------------------------------------
  374. ; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
  375. ;-----------------------------------------------------------------------------
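; Conceptual sketch of the x3 intra helpers (hedged; build_pred_* and sad_4x4
; are hypothetical helpers standing in for the H.264 4x4 intra predictors and
; a plain 4x4 SAD): three prediction modes are scored against the same source
; block in one call, with the costs stored in the order {V, H, DC}:
;
;   void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] )
;   {
;       uint8_t pred[4*FDEC_STRIDE];
;       build_pred_v ( pred, fdec ); res[0] = sad_4x4( fenc, pred ); // vertical
;       build_pred_h ( pred, fdec ); res[1] = sad_4x4( fenc, pred ); // horizontal
;       build_pred_dc( pred, fdec ); res[2] = sad_4x4( fenc, pred ); // DC
;   }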
  376. cglobal intra_sad_x3_4x4_mmx2, 3,3
  377. pxor mm7, mm7
  378. movd mm0, [r1-FDEC_STRIDE]
  379. movd mm1, [r0+FENC_STRIDE*0]
  380. movd mm2, [r0+FENC_STRIDE*2]
  381. punpckldq mm0, mm0
  382. punpckldq mm1, [r0+FENC_STRIDE*1]
  383. punpckldq mm2, [r0+FENC_STRIDE*3]
  384. movq mm6, mm0
  385. movq mm3, mm1
  386. psadbw mm3, mm0
  387. psadbw mm0, mm2
  388. paddw mm0, mm3
  389. movd [r2], mm0 ;V prediction cost
  390. movd mm3, [r1+FDEC_STRIDE*0-4]
  391. movd mm0, [r1+FDEC_STRIDE*1-4]
  392. movd mm4, [r1+FDEC_STRIDE*2-4]
  393. movd mm5, [r1+FDEC_STRIDE*3-4]
  394. punpcklbw mm3, mm0
  395. punpcklbw mm4, mm5
  396. movq mm5, mm3
  397. punpckhwd mm5, mm4
  398. punpckhdq mm5, mm6
  399. psadbw mm5, mm7
  400. punpckhbw mm3, mm3
  401. punpckhbw mm4, mm4
  402. punpckhwd mm3, mm3
  403. punpckhwd mm4, mm4
  404. psraw mm5, 2
  405. pavgw mm5, mm7
  406. punpcklbw mm5, mm5
  407. pshufw mm5, mm5, 0 ;DC prediction
  408. movq mm6, mm5
  409. psadbw mm5, mm1
  410. psadbw mm6, mm2
  411. psadbw mm1, mm3
  412. psadbw mm2, mm4
  413. paddw mm5, mm6
  414. paddw mm1, mm2
  415. movd [r2+8], mm5 ;DC prediction cost
  416. movd [r2+4], mm1 ;H prediction cost
  417. RET
  418. ;-----------------------------------------------------------------------------
  419. ; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
  420. ;-----------------------------------------------------------------------------
  421. ;m0 = DC
  422. ;m6 = V
  423. ;m7 = H
  424. ;m1 = DC score
  425. ;m2 = V score
  426. ;m3 = H score
  427. ;m5 = pixel row
  428. ;m4 = temp
  429. %macro INTRA_SAD_HVDC_ITER 2
  430. movq m5, [r0+FENC_STRIDE*%1]
  431. movq m4, m5
  432. psadbw m4, m0
  433. ACCUM paddw, 1, 4, %1
  434. movq m4, m5
  435. psadbw m4, m6
  436. ACCUM paddw, 2, 4, %1
  437. pshufw m4, m7, %2
  438. psadbw m5, m4
  439. ACCUM paddw, 3, 5, %1
  440. %endmacro
  441. INIT_MMX
  442. cglobal intra_sad_x3_8x8_mmx2, 3,3
  443. movq m7, [r1+7]
  444. pxor m0, m0
  445. movq m6, [r1+16] ;V prediction
  446. pxor m1, m1
  447. psadbw m0, m7
  448. psadbw m1, m6
  449. paddw m0, m1
  450. paddw m0, [pw_8]
  451. psrlw m0, 4
  452. punpcklbw m0, m0
  453. pshufw m0, m0, q0000 ;DC prediction
  454. punpckhbw m7, m7
  455. INTRA_SAD_HVDC_ITER 0, q3333
  456. INTRA_SAD_HVDC_ITER 1, q2222
  457. INTRA_SAD_HVDC_ITER 2, q1111
  458. INTRA_SAD_HVDC_ITER 3, q0000
  459. movq m7, [r1+7]
  460. punpcklbw m7, m7
  461. INTRA_SAD_HVDC_ITER 4, q3333
  462. INTRA_SAD_HVDC_ITER 5, q2222
  463. INTRA_SAD_HVDC_ITER 6, q1111
  464. INTRA_SAD_HVDC_ITER 7, q0000
  465. movd [r2+0], m2
  466. movd [r2+4], m3
  467. movd [r2+8], m1
  468. RET
  469. ;-----------------------------------------------------------------------------
  470. ; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
  471. ;-----------------------------------------------------------------------------
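; Hedged note on the DC mode scored below: 8x8 chroma DC prediction uses four
; 4x4 quadrants, each with its own DC value derived from the 4-pixel sums of
; the top row (s0, s1) and the left column (s2, s3), roughly:
;
;   dc[0] = ( s0 + s2 + 4 ) >> 3;  // top-left:     top + left neighbours
;   dc[1] = ( s1 + 2 ) >> 2;       // top-right:    top neighbours only
;   dc[2] = ( s3 + 2 ) >> 2;       // bottom-left:  left neighbours only
;   dc[3] = ( s1 + s3 + 4 ) >> 3;  // bottom-right: top-right + bottom-left
;
; The psrlw 2 / pavgw rounding on the packed (s0+s2, 2*s1, 2*s3, s1+s3) vector
; below produces the same values without unpacking the sums.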
  472. %macro INTRA_SAD_HV_ITER 1
  473. %if cpuflag(ssse3)
  474. movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
  475. movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
  476. pshufb m1, m7
  477. pshufb m3, m7
  478. %else
  479. movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
  480. movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
  481. punpckhbw m1, m1
  482. punpckhbw m3, m3
  483. pshufw m1, m1, q3333
  484. pshufw m3, m3, q3333
  485. %endif
  486. movq m4, [r0 + FENC_STRIDE*(%1+0)]
  487. movq m5, [r0 + FENC_STRIDE*(%1+1)]
  488. psadbw m1, m4
  489. psadbw m3, m5
  490. psadbw m4, m6
  491. psadbw m5, m6
  492. paddw m1, m3
  493. paddw m4, m5
  494. ACCUM paddw, 0, 1, %1
  495. ACCUM paddw, 2, 4, %1
  496. %endmacro
  497. %macro INTRA_SAD_8x8C 0
  498. cglobal intra_sad_x3_8x8c, 3,3
  499. movq m6, [r1 - FDEC_STRIDE]
  500. add r1, FDEC_STRIDE*4
  501. %if cpuflag(ssse3)
  502. movq m7, [pb_3]
  503. %endif
  504. INTRA_SAD_HV_ITER 0
  505. INTRA_SAD_HV_ITER 2
  506. INTRA_SAD_HV_ITER 4
  507. INTRA_SAD_HV_ITER 6
  508. movd [r2+4], m0
  509. movd [r2+8], m2
  510. pxor m7, m7
  511. movq m2, [r1 + FDEC_STRIDE*-4 - 8]
  512. movq m4, [r1 + FDEC_STRIDE*-2 - 8]
  513. movq m3, [r1 + FDEC_STRIDE* 0 - 8]
  514. movq m5, [r1 + FDEC_STRIDE* 2 - 8]
  515. punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
  516. punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
  517. punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
  518. punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
  519. punpckhbw m2, m4
  520. punpckhbw m3, m5
  521. psrlq m2, 32
  522. psrlq m3, 32
  523. psadbw m2, m7 ; s2
  524. psadbw m3, m7 ; s3
  525. movq m1, m6
  526. SWAP 0, 6
  527. punpckldq m0, m7
  528. punpckhdq m1, m7
  529. psadbw m0, m7 ; s0
  530. psadbw m1, m7 ; s1
  531. punpcklwd m0, m1
  532. punpcklwd m2, m3
  533. punpckldq m0, m2 ;s0 s1 s2 s3
  534. pshufw m3, m0, q3312 ;s2,s1,s3,s3
  535. pshufw m0, m0, q1310 ;s0,s1,s3,s1
  536. paddw m0, m3
  537. psrlw m0, 2
  538. pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
  539. %if cpuflag(ssse3)
  540. movq2dq xmm0, m0
  541. pshufb xmm0, [pb_shuf8x8c]
  542. movq xmm1, [r0+FENC_STRIDE*0]
  543. movq xmm2, [r0+FENC_STRIDE*1]
  544. movq xmm3, [r0+FENC_STRIDE*2]
  545. movq xmm4, [r0+FENC_STRIDE*3]
  546. movhps xmm1, [r0+FENC_STRIDE*4]
  547. movhps xmm2, [r0+FENC_STRIDE*5]
  548. movhps xmm3, [r0+FENC_STRIDE*6]
  549. movhps xmm4, [r0+FENC_STRIDE*7]
  550. psadbw xmm1, xmm0
  551. psadbw xmm2, xmm0
  552. psadbw xmm3, xmm0
  553. psadbw xmm4, xmm0
  554. paddw xmm1, xmm2
  555. paddw xmm1, xmm3
  556. paddw xmm1, xmm4
  557. MOVHL xmm0, xmm1
  558. paddw xmm1, xmm0
  559. movd [r2], xmm1
  560. %else
  561. packuswb m0, m0
  562. punpcklbw m0, m0
  563. movq m1, m0
  564. punpcklbw m0, m0 ; 4x dc0 4x dc1
  565. punpckhbw m1, m1 ; 4x dc2 4x dc3
  566. movq m2, [r0+FENC_STRIDE*0]
  567. movq m3, [r0+FENC_STRIDE*1]
  568. movq m4, [r0+FENC_STRIDE*2]
  569. movq m5, [r0+FENC_STRIDE*3]
  570. movq m6, [r0+FENC_STRIDE*4]
  571. movq m7, [r0+FENC_STRIDE*5]
  572. psadbw m2, m0
  573. psadbw m3, m0
  574. psadbw m4, m0
  575. psadbw m5, m0
  576. movq m0, [r0+FENC_STRIDE*6]
  577. psadbw m6, m1
  578. psadbw m7, m1
  579. psadbw m0, m1
  580. psadbw m1, [r0+FENC_STRIDE*7]
  581. paddw m2, m3
  582. paddw m4, m5
  583. paddw m6, m7
  584. paddw m0, m1
  585. paddw m2, m4
  586. paddw m6, m0
  587. paddw m2, m6
  588. movd [r2], m2
  589. %endif
  590. RET
  591. %endmacro
  592. INIT_MMX mmx2
  593. INTRA_SAD_8x8C
  594. INIT_MMX ssse3
  595. INTRA_SAD_8x8C
  596. INIT_YMM avx2
  597. cglobal intra_sad_x3_8x8c, 3,3,7
  598. vpbroadcastq m2, [r1 - FDEC_STRIDE] ; V pred
  599. add r1, FDEC_STRIDE*4-1
  600. pxor xm5, xm5
  601. punpckldq xm3, xm2, xm5 ; V0 _ V1 _
  602. movd xm0, [r1 + FDEC_STRIDE*-1 - 3]
  603. movd xm1, [r1 + FDEC_STRIDE* 3 - 3]
  604. pinsrb xm0, [r1 + FDEC_STRIDE*-4], 0
  605. pinsrb xm1, [r1 + FDEC_STRIDE* 0], 0
  606. pinsrb xm0, [r1 + FDEC_STRIDE*-3], 1
  607. pinsrb xm1, [r1 + FDEC_STRIDE* 1], 1
  608. pinsrb xm0, [r1 + FDEC_STRIDE*-2], 2
  609. pinsrb xm1, [r1 + FDEC_STRIDE* 2], 2
  610. punpcklqdq xm0, xm1 ; H0 _ H1 _
  611. vinserti128 m3, m3, xm0, 1 ; V0 V1 H0 H1
  612. pshufb xm0, [hpred_shuf] ; H00224466 H11335577
  613. psadbw m3, m5 ; s0 s1 s2 s3
  614. vpermq m4, m3, q3312 ; s2 s1 s3 s3
  615. vpermq m3, m3, q1310 ; s0 s1 s3 s1
  616. paddw m3, m4
  617. psrlw m3, 2
  618. pavgw m3, m5 ; s0+s2 s1 s3 s1+s3
  619. pshufb m3, [pb_shuf8x8c2] ; DC0 _ DC1 _
  620. vpblendd m3, m3, m2, 11001100b ; DC0 V DC1 V
  621. vinserti128 m1, m3, xm3, 1 ; DC0 V DC0 V
  622. vperm2i128 m6, m3, m3, q0101 ; DC1 V DC1 V
  623. vpermq m0, m0, q3120 ; H00224466 _ H11335577 _
  624. movddup m2, [r0+FENC_STRIDE*0]
  625. movddup m4, [r0+FENC_STRIDE*2]
  626. pshuflw m3, m0, q0000
  627. psadbw m3, m2
  628. psadbw m2, m1
  629. pshuflw m5, m0, q1111
  630. psadbw m5, m4
  631. psadbw m4, m1
  632. paddw m2, m4
  633. paddw m3, m5
  634. movddup m4, [r0+FENC_STRIDE*4]
  635. pshuflw m5, m0, q2222
  636. psadbw m5, m4
  637. psadbw m4, m6
  638. paddw m2, m4
  639. paddw m3, m5
  640. movddup m4, [r0+FENC_STRIDE*6]
  641. pshuflw m5, m0, q3333
  642. psadbw m5, m4
  643. psadbw m4, m6
  644. paddw m2, m4
  645. paddw m3, m5
  646. vextracti128 xm0, m2, 1
  647. vextracti128 xm1, m3, 1
  648. paddw xm2, xm0 ; DC V
  649. paddw xm3, xm1 ; H
  650. pextrd [r2+8], xm2, 2 ; V
  651. movd [r2+4], xm3 ; H
  652. movd [r2+0], xm2 ; DC
  653. RET
  654. ;-----------------------------------------------------------------------------
  655. ; void intra_sad_x3_16x16( uint8_t *fenc, uint8_t *fdec, int res[3] );
  656. ;-----------------------------------------------------------------------------
  657. ;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
  658. ;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
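; The DC predictor used below is computed as in this sketch (assumed from the
; code: the 16 top neighbours are summed with PSADBW, the 16 left neighbours
; with scalar loads, then the average is rounded and broadcast to every byte):
;
;   int dc = 16;                                              // rounding term
;   for( int x = 0; x < 16; x++ ) dc += fdec[x - FDEC_STRIDE];    // top row
;   for( int y = 0; y < 16; y++ ) dc += fdec[y*FDEC_STRIDE - 1];  // left column
;   dc >>= 5;
;   uint32_t dc_splat = dc * 0x01010101;      // 4 byte copies, then broadcast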
  659. %macro INTRA_SAD16 0
  660. cglobal intra_sad_x3_16x16, 3,5,8
  661. pxor mm0, mm0
  662. pxor mm1, mm1
  663. psadbw mm0, [r1-FDEC_STRIDE+0]
  664. psadbw mm1, [r1-FDEC_STRIDE+8]
  665. paddw mm0, mm1
  666. movd r3d, mm0
  667. %if cpuflag(ssse3)
  668. mova m1, [pb_3]
  669. %endif
  670. %assign x 0
  671. %rep 16
  672. movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
  673. %if (x&3)==3 && x!=15
  674. add r1, FDEC_STRIDE*4
  675. %endif
  676. add r3d, r4d
  677. %assign x x+1
  678. %endrep
  679. sub r1, FDEC_STRIDE*12
  680. add r3d, 16
  681. shr r3d, 5
  682. imul r3d, 0x01010101
  683. movd m7, r3d
  684. mova m5, [r1-FDEC_STRIDE]
  685. %if mmsize==16
  686. pshufd m7, m7, 0
  687. %else
  688. mova m1, [r1-FDEC_STRIDE+8]
  689. punpckldq m7, m7
  690. %endif
  691. pxor m4, m4
  692. pxor m3, m3
  693. pxor m2, m2
  694. mov r3d, 15*FENC_STRIDE
  695. .vloop:
  696. SPLATB_LOAD m6, r1+r3*2-1, m1
  697. mova m0, [r0+r3]
  698. psadbw m0, m7
  699. paddw m4, m0
  700. mova m0, [r0+r3]
  701. psadbw m0, m5
  702. paddw m2, m0
  703. %if mmsize==8
  704. mova m0, [r0+r3]
  705. psadbw m0, m6
  706. paddw m3, m0
  707. mova m0, [r0+r3+8]
  708. psadbw m0, m7
  709. paddw m4, m0
  710. mova m0, [r0+r3+8]
  711. psadbw m0, m1
  712. paddw m2, m0
  713. psadbw m6, [r0+r3+8]
  714. paddw m3, m6
  715. %else
  716. psadbw m6, [r0+r3]
  717. paddw m3, m6
  718. %endif
  719. add r3d, -FENC_STRIDE
  720. jge .vloop
  721. %if mmsize==16
  722. pslldq m3, 4
  723. por m3, m2
  724. MOVHL m1, m3
  725. paddw m3, m1
  726. movq [r2+0], m3
  727. MOVHL m1, m4
  728. paddw m4, m1
  729. %else
  730. movd [r2+0], m2
  731. movd [r2+4], m3
  732. %endif
  733. movd [r2+8], m4
  734. RET
  735. %endmacro
  736. INIT_MMX mmx2
  737. INTRA_SAD16
  738. INIT_XMM sse2
  739. INTRA_SAD16
  740. INIT_XMM ssse3
  741. INTRA_SAD16
  742. INIT_YMM avx2
  743. cglobal intra_sad_x3_16x16, 3,5,6
  744. pxor xm0, xm0
  745. psadbw xm0, [r1-FDEC_STRIDE]
  746. MOVHL xm1, xm0
  747. paddw xm0, xm1
  748. movd r3d, xm0
  749. %assign x 0
  750. %rep 16
  751. movzx r4d, byte [r1-1+FDEC_STRIDE*(x&3)]
  752. %if (x&3)==3 && x!=15
  753. add r1, FDEC_STRIDE*4
  754. %endif
  755. add r3d, r4d
  756. %assign x x+1
  757. %endrep
  758. sub r1, FDEC_STRIDE*12
  759. add r3d, 16
  760. shr r3d, 5
  761. movd xm5, r3d
  762. vpbroadcastb xm5, xm5
  763. vinserti128 m5, m5, [r1-FDEC_STRIDE], 1 ; m5 contains DC and V prediction
  764. pxor m4, m4 ; DC / V accumulator
  765. pxor xm3, xm3 ; H accumulator
  766. mov r3d, 15*FENC_STRIDE
  767. .vloop:
  768. vpbroadcastb xm2, [r1+r3*2-1]
  769. vbroadcasti128 m0, [r0+r3]
  770. psadbw m1, m0, m5
  771. psadbw xm0, xm2
  772. paddw m4, m1
  773. paddw xm3, xm0
  774. add r3d, -FENC_STRIDE
  775. jge .vloop
  776. punpckhqdq m5, m4, m4
  777. MOVHL xm2, xm3
  778. paddw m4, m5 ; DC / V
  779. paddw xm3, xm2 ; H
  780. vextracti128 xm2, m4, 1
  781. movd [r2+0], xm2
  782. movd [r2+4], xm3
  783. movd [r2+8], xm4
  784. RET
  785. ;=============================================================================
  786. ; SAD x3/x4 MMX
  787. ;=============================================================================
  788. %macro SAD_X3_START_1x8P 0
  789. movq mm3, [r0]
  790. movq mm0, [r1]
  791. movq mm1, [r2]
  792. movq mm2, [r3]
  793. psadbw mm0, mm3
  794. psadbw mm1, mm3
  795. psadbw mm2, mm3
  796. %endmacro
  797. %macro SAD_X3_1x8P 2
  798. movq mm3, [r0+%1]
  799. movq mm4, [r1+%2]
  800. movq mm5, [r2+%2]
  801. movq mm6, [r3+%2]
  802. psadbw mm4, mm3
  803. psadbw mm5, mm3
  804. psadbw mm6, mm3
  805. paddw mm0, mm4
  806. paddw mm1, mm5
  807. paddw mm2, mm6
  808. %endmacro
  809. %macro SAD_X3_START_2x4P 3
  810. movd mm3, [r0]
  811. movd %1, [r1]
  812. movd %2, [r2]
  813. movd %3, [r3]
  814. punpckldq mm3, [r0+FENC_STRIDE]
  815. punpckldq %1, [r1+r4]
  816. punpckldq %2, [r2+r4]
  817. punpckldq %3, [r3+r4]
  818. psadbw %1, mm3
  819. psadbw %2, mm3
  820. psadbw %3, mm3
  821. %endmacro
  822. %macro SAD_X3_2x16P 1
  823. %if %1
  824. SAD_X3_START_1x8P
  825. %else
  826. SAD_X3_1x8P 0, 0
  827. %endif
  828. SAD_X3_1x8P 8, 8
  829. SAD_X3_1x8P FENC_STRIDE, r4
  830. SAD_X3_1x8P FENC_STRIDE+8, r4+8
  831. add r0, 2*FENC_STRIDE
  832. lea r1, [r1+2*r4]
  833. lea r2, [r2+2*r4]
  834. lea r3, [r3+2*r4]
  835. %endmacro
  836. %macro SAD_X3_2x8P 1
  837. %if %1
  838. SAD_X3_START_1x8P
  839. %else
  840. SAD_X3_1x8P 0, 0
  841. %endif
  842. SAD_X3_1x8P FENC_STRIDE, r4
  843. add r0, 2*FENC_STRIDE
  844. lea r1, [r1+2*r4]
  845. lea r2, [r2+2*r4]
  846. lea r3, [r3+2*r4]
  847. %endmacro
  848. %macro SAD_X3_2x4P 1
  849. %if %1
  850. SAD_X3_START_2x4P mm0, mm1, mm2
  851. %else
  852. SAD_X3_START_2x4P mm4, mm5, mm6
  853. paddw mm0, mm4
  854. paddw mm1, mm5
  855. paddw mm2, mm6
  856. %endif
  857. add r0, 2*FENC_STRIDE
  858. lea r1, [r1+2*r4]
  859. lea r2, [r2+2*r4]
  860. lea r3, [r3+2*r4]
  861. %endmacro
  862. %macro SAD_X4_START_1x8P 0
  863. movq mm7, [r0]
  864. movq mm0, [r1]
  865. movq mm1, [r2]
  866. movq mm2, [r3]
  867. movq mm3, [r4]
  868. psadbw mm0, mm7
  869. psadbw mm1, mm7
  870. psadbw mm2, mm7
  871. psadbw mm3, mm7
  872. %endmacro
  873. %macro SAD_X4_1x8P 2
  874. movq mm7, [r0+%1]
  875. movq mm4, [r1+%2]
  876. movq mm5, [r2+%2]
  877. movq mm6, [r3+%2]
  878. psadbw mm4, mm7
  879. psadbw mm5, mm7
  880. psadbw mm6, mm7
  881. psadbw mm7, [r4+%2]
  882. paddw mm0, mm4
  883. paddw mm1, mm5
  884. paddw mm2, mm6
  885. paddw mm3, mm7
  886. %endmacro
  887. %macro SAD_X4_START_2x4P 0
  888. movd mm7, [r0]
  889. movd mm0, [r1]
  890. movd mm1, [r2]
  891. movd mm2, [r3]
  892. movd mm3, [r4]
  893. punpckldq mm7, [r0+FENC_STRIDE]
  894. punpckldq mm0, [r1+r5]
  895. punpckldq mm1, [r2+r5]
  896. punpckldq mm2, [r3+r5]
  897. punpckldq mm3, [r4+r5]
  898. psadbw mm0, mm7
  899. psadbw mm1, mm7
  900. psadbw mm2, mm7
  901. psadbw mm3, mm7
  902. %endmacro
  903. %macro SAD_X4_INC_2x4P 0
  904. movd mm7, [r0]
  905. movd mm4, [r1]
  906. movd mm5, [r2]
  907. punpckldq mm7, [r0+FENC_STRIDE]
  908. punpckldq mm4, [r1+r5]
  909. punpckldq mm5, [r2+r5]
  910. psadbw mm4, mm7
  911. psadbw mm5, mm7
  912. paddw mm0, mm4
  913. paddw mm1, mm5
  914. movd mm4, [r3]
  915. movd mm5, [r4]
  916. punpckldq mm4, [r3+r5]
  917. punpckldq mm5, [r4+r5]
  918. psadbw mm4, mm7
  919. psadbw mm5, mm7
  920. paddw mm2, mm4
  921. paddw mm3, mm5
  922. %endmacro
  923. %macro SAD_X4_2x16P 1
  924. %if %1
  925. SAD_X4_START_1x8P
  926. %else
  927. SAD_X4_1x8P 0, 0
  928. %endif
  929. SAD_X4_1x8P 8, 8
  930. SAD_X4_1x8P FENC_STRIDE, r5
  931. SAD_X4_1x8P FENC_STRIDE+8, r5+8
  932. add r0, 2*FENC_STRIDE
  933. lea r1, [r1+2*r5]
  934. lea r2, [r2+2*r5]
  935. lea r3, [r3+2*r5]
  936. lea r4, [r4+2*r5]
  937. %endmacro
  938. %macro SAD_X4_2x8P 1
  939. %if %1
  940. SAD_X4_START_1x8P
  941. %else
  942. SAD_X4_1x8P 0, 0
  943. %endif
  944. SAD_X4_1x8P FENC_STRIDE, r5
  945. add r0, 2*FENC_STRIDE
  946. lea r1, [r1+2*r5]
  947. lea r2, [r2+2*r5]
  948. lea r3, [r3+2*r5]
  949. lea r4, [r4+2*r5]
  950. %endmacro
  951. %macro SAD_X4_2x4P 1
  952. %if %1
  953. SAD_X4_START_2x4P
  954. %else
  955. SAD_X4_INC_2x4P
  956. %endif
  957. add r0, 2*FENC_STRIDE
  958. lea r1, [r1+2*r5]
  959. lea r2, [r2+2*r5]
  960. lea r3, [r3+2*r5]
  961. lea r4, [r4+2*r5]
  962. %endmacro
  963. %macro SAD_X3_END 0
  964. %if UNIX64
  965. movd [r5+0], mm0
  966. movd [r5+4], mm1
  967. movd [r5+8], mm2
  968. %else
  969. mov r0, r5mp
  970. movd [r0+0], mm0
  971. movd [r0+4], mm1
  972. movd [r0+8], mm2
  973. %endif
  974. RET
  975. %endmacro
  976. %macro SAD_X4_END 0
  977. mov r0, r6mp
  978. movd [r0+0], mm0
  979. movd [r0+4], mm1
  980. movd [r0+8], mm2
  981. movd [r0+12], mm3
  982. RET
  983. %endmacro
  984. ;-----------------------------------------------------------------------------
  985. ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
  986. ; uint8_t *pix2, intptr_t i_stride, int scores[3] )
  987. ;-----------------------------------------------------------------------------
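; Hedged sketch of the x3/x4 contract (the x4 variants add a pix3 pointer and
; a fourth score; sad_WxH is the hypothetical scalar helper sketched earlier
; in this file): one encode-order block is compared against several candidate
; positions that share a stride, which lets the fenc loads be amortized:
;
;   void sad_x3_WxH( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
;                    uint8_t *pix2, intptr_t i_stride, int scores[3] )
;   {
;       scores[0] = sad_WxH( fenc, FENC_STRIDE, pix0, i_stride );
;       scores[1] = sad_WxH( fenc, FENC_STRIDE, pix1, i_stride );
;       scores[2] = sad_WxH( fenc, FENC_STRIDE, pix2, i_stride );
;   }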
  988. %macro SAD_X 3
  989. cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
  990. SAD_X%1_2x%2P 1
  991. %rep %3/2-1
  992. SAD_X%1_2x%2P 0
  993. %endrep
  994. SAD_X%1_END
  995. %endmacro
  996. INIT_MMX
  997. SAD_X 3, 16, 16
  998. SAD_X 3, 16, 8
  999. SAD_X 3, 8, 16
  1000. SAD_X 3, 8, 8
  1001. SAD_X 3, 8, 4
  1002. SAD_X 3, 4, 8
  1003. SAD_X 3, 4, 4
  1004. SAD_X 4, 16, 16
  1005. SAD_X 4, 16, 8
  1006. SAD_X 4, 8, 16
  1007. SAD_X 4, 8, 8
  1008. SAD_X 4, 8, 4
  1009. SAD_X 4, 4, 8
  1010. SAD_X 4, 4, 4
  1011. ;=============================================================================
  1012. ; SAD x3/x4 XMM
  1013. ;=============================================================================
  1014. %macro SAD_X3_START_1x16P_SSE2 0
  1015. mova m2, [r0]
  1016. %if cpuflag(avx)
  1017. psadbw m0, m2, [r1]
  1018. psadbw m1, m2, [r2]
  1019. psadbw m2, [r3]
  1020. %else
  1021. movu m0, [r1]
  1022. movu m1, [r2]
  1023. movu m3, [r3]
  1024. psadbw m0, m2
  1025. psadbw m1, m2
  1026. psadbw m2, m3
  1027. %endif
  1028. %endmacro
  1029. %macro SAD_X3_1x16P_SSE2 2
  1030. mova m3, [r0+%1]
  1031. %if cpuflag(avx)
  1032. psadbw m4, m3, [r1+%2]
  1033. psadbw m5, m3, [r2+%2]
  1034. psadbw m3, [r3+%2]
  1035. %else
  1036. movu m4, [r1+%2]
  1037. movu m5, [r2+%2]
  1038. movu m6, [r3+%2]
  1039. psadbw m4, m3
  1040. psadbw m5, m3
  1041. psadbw m3, m6
  1042. %endif
  1043. paddw m0, m4
  1044. paddw m1, m5
  1045. paddw m2, m3
  1046. %endmacro
  1047. %if ARCH_X86_64
  1048. DECLARE_REG_TMP 6
  1049. %else
  1050. DECLARE_REG_TMP 5
  1051. %endif
  1052. %macro SAD_X3_4x16P_SSE2 2
  1053. %if %1==0
  1054. lea t0, [r4*3]
  1055. SAD_X3_START_1x16P_SSE2
  1056. %else
  1057. SAD_X3_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0
  1058. %endif
  1059. SAD_X3_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r4*1
  1060. SAD_X3_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2
  1061. SAD_X3_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), t0
  1062. %if %1 != %2-1
  1063. %if (%1&1) != 0
  1064. add r0, 8*FENC_STRIDE
  1065. %endif
  1066. lea r1, [r1+4*r4]
  1067. lea r2, [r2+4*r4]
  1068. lea r3, [r3+4*r4]
  1069. %endif
  1070. %endmacro
  1071. %macro SAD_X3_START_2x8P_SSE2 0
  1072. movq m3, [r0]
  1073. movq m0, [r1]
  1074. movq m1, [r2]
  1075. movq m2, [r3]
  1076. movhps m3, [r0+FENC_STRIDE]
  1077. movhps m0, [r1+r4]
  1078. movhps m1, [r2+r4]
  1079. movhps m2, [r3+r4]
  1080. psadbw m0, m3
  1081. psadbw m1, m3
  1082. psadbw m2, m3
  1083. %endmacro
  1084. %macro SAD_X3_2x8P_SSE2 4
  1085. movq m6, [r0+%1]
  1086. movq m3, [r1+%2]
  1087. movq m4, [r2+%2]
  1088. movq m5, [r3+%2]
  1089. movhps m6, [r0+%3]
  1090. movhps m3, [r1+%4]
  1091. movhps m4, [r2+%4]
  1092. movhps m5, [r3+%4]
  1093. psadbw m3, m6
  1094. psadbw m4, m6
  1095. psadbw m5, m6
  1096. paddw m0, m3
  1097. paddw m1, m4
  1098. paddw m2, m5
  1099. %endmacro
  1100. %macro SAD_X4_START_2x8P_SSE2 0
  1101. movq m4, [r0]
  1102. movq m0, [r1]
  1103. movq m1, [r2]
  1104. movq m2, [r3]
  1105. movq m3, [r4]
  1106. movhps m4, [r0+FENC_STRIDE]
  1107. movhps m0, [r1+r5]
  1108. movhps m1, [r2+r5]
  1109. movhps m2, [r3+r5]
  1110. movhps m3, [r4+r5]
  1111. psadbw m0, m4
  1112. psadbw m1, m4
  1113. psadbw m2, m4
  1114. psadbw m3, m4
  1115. %endmacro
  1116. %macro SAD_X4_2x8P_SSE2 4
  1117. movq m6, [r0+%1]
  1118. movq m4, [r1+%2]
  1119. movq m5, [r2+%2]
  1120. movhps m6, [r0+%3]
  1121. movhps m4, [r1+%4]
  1122. movhps m5, [r2+%4]
  1123. psadbw m4, m6
  1124. psadbw m5, m6
  1125. paddw m0, m4
  1126. paddw m1, m5
  1127. movq m4, [r3+%2]
  1128. movq m5, [r4+%2]
  1129. movhps m4, [r3+%4]
  1130. movhps m5, [r4+%4]
  1131. psadbw m4, m6
  1132. psadbw m5, m6
  1133. paddw m2, m4
  1134. paddw m3, m5
  1135. %endmacro
  1136. %macro SAD_X4_START_1x16P_SSE2 0
  1137. mova m3, [r0]
  1138. %if cpuflag(avx)
  1139. psadbw m0, m3, [r1]
  1140. psadbw m1, m3, [r2]
  1141. psadbw m2, m3, [r3]
  1142. psadbw m3, [r4]
  1143. %else
  1144. movu m0, [r1]
  1145. movu m1, [r2]
  1146. movu m2, [r3]
  1147. movu m4, [r4]
  1148. psadbw m0, m3
  1149. psadbw m1, m3
  1150. psadbw m2, m3
  1151. psadbw m3, m4
  1152. %endif
  1153. %endmacro
  1154. %macro SAD_X4_1x16P_SSE2 2
  1155. mova m6, [r0+%1]
  1156. %if cpuflag(avx)
  1157. psadbw m4, m6, [r1+%2]
  1158. psadbw m5, m6, [r2+%2]
  1159. %else
  1160. movu m4, [r1+%2]
  1161. movu m5, [r2+%2]
  1162. psadbw m4, m6
  1163. psadbw m5, m6
  1164. %endif
  1165. paddw m0, m4
  1166. paddw m1, m5
  1167. %if cpuflag(avx)
  1168. psadbw m4, m6, [r3+%2]
  1169. psadbw m5, m6, [r4+%2]
  1170. %else
  1171. movu m4, [r3+%2]
  1172. movu m5, [r4+%2]
  1173. psadbw m4, m6
  1174. psadbw m5, m6
  1175. %endif
  1176. paddw m2, m4
  1177. paddw m3, m5
  1178. %endmacro
  1179. %macro SAD_X4_4x16P_SSE2 2
  1180. %if %1==0
  1181. lea r6, [r5*3]
  1182. SAD_X4_START_1x16P_SSE2
  1183. %else
  1184. SAD_X4_1x16P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0
  1185. %endif
  1186. SAD_X4_1x16P_SSE2 FENC_STRIDE*(1+(%1&1)*4), r5*1
  1187. SAD_X4_1x16P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2
  1188. SAD_X4_1x16P_SSE2 FENC_STRIDE*(3+(%1&1)*4), r6
  1189. %if %1 != %2-1
  1190. %if (%1&1) != 0
  1191. add r0, 8*FENC_STRIDE
  1192. %endif
  1193. lea r1, [r1+4*r5]
  1194. lea r2, [r2+4*r5]
  1195. lea r3, [r3+4*r5]
  1196. lea r4, [r4+4*r5]
  1197. %endif
  1198. %endmacro
  1199. %macro SAD_X3_4x8P_SSE2 2
  1200. %if %1==0
  1201. lea t0, [r4*3]
  1202. SAD_X3_START_2x8P_SSE2
  1203. %else
  1204. SAD_X3_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r4*0, FENC_STRIDE*(1+(%1&1)*4), r4*1
  1205. %endif
  1206. SAD_X3_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r4*2, FENC_STRIDE*(3+(%1&1)*4), t0
  1207. %if %1 != %2-1
  1208. %if (%1&1) != 0
  1209. add r0, 8*FENC_STRIDE
  1210. %endif
  1211. lea r1, [r1+4*r4]
  1212. lea r2, [r2+4*r4]
  1213. lea r3, [r3+4*r4]
  1214. %endif
  1215. %endmacro
  1216. %macro SAD_X4_4x8P_SSE2 2
  1217. %if %1==0
  1218. lea r6, [r5*3]
  1219. SAD_X4_START_2x8P_SSE2
  1220. %else
  1221. SAD_X4_2x8P_SSE2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
  1222. %endif
  1223. SAD_X4_2x8P_SSE2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
  1224. %if %1 != %2-1
  1225. %if (%1&1) != 0
  1226. add r0, 8*FENC_STRIDE
  1227. %endif
  1228. lea r1, [r1+4*r5]
  1229. lea r2, [r2+4*r5]
  1230. lea r3, [r3+4*r5]
  1231. lea r4, [r4+4*r5]
  1232. %endif
  1233. %endmacro
  1234. %macro SAD_X3_END_SSE2 0
  1235. movifnidn r5, r5mp
  1236. %if cpuflag(ssse3)
  1237. packssdw m0, m1
  1238. packssdw m2, m2
  1239. phaddd m0, m2
  1240. mova [r5], m0
  1241. %else
  1242. movhlps m3, m0
  1243. movhlps m4, m1
  1244. movhlps m5, m2
  1245. paddw m0, m3
  1246. paddw m1, m4
  1247. paddw m2, m5
  1248. movd [r5+0], m0
  1249. movd [r5+4], m1
  1250. movd [r5+8], m2
  1251. %endif
  1252. RET
  1253. %endmacro
  1254. %macro SAD_X4_END_SSE2 0
  1255. mov r0, r6mp
  1256. %if cpuflag(ssse3)
  1257. packssdw m0, m1
  1258. packssdw m2, m3
  1259. phaddd m0, m2
  1260. mova [r0], m0
  1261. %else
  1262. psllq m1, 32
  1263. psllq m3, 32
  1264. paddw m0, m1
  1265. paddw m2, m3
  1266. movhlps m1, m0
  1267. movhlps m3, m2
  1268. paddw m0, m1
  1269. paddw m2, m3
  1270. movq [r0+0], m0
  1271. movq [r0+8], m2
  1272. %endif
  1273. RET
  1274. %endmacro
  1275. %macro SAD_X4_START_2x8P_SSSE3 0
  1276. movddup m4, [r0]
  1277. movq m0, [r1]
  1278. movq m1, [r3]
  1279. movhps m0, [r2]
  1280. movhps m1, [r4]
  1281. movddup m5, [r0+FENC_STRIDE]
  1282. movq m2, [r1+r5]
  1283. movq m3, [r3+r5]
  1284. movhps m2, [r2+r5]
  1285. movhps m3, [r4+r5]
  1286. psadbw m0, m4
  1287. psadbw m1, m4
  1288. psadbw m2, m5
  1289. psadbw m3, m5
  1290. paddw m0, m2
  1291. paddw m1, m3
  1292. %endmacro
  1293. %macro SAD_X4_2x8P_SSSE3 4
  1294. movddup m6, [r0+%1]
  1295. movq m2, [r1+%2]
  1296. movq m3, [r3+%2]
  1297. movhps m2, [r2+%2]
  1298. movhps m3, [r4+%2]
  1299. movddup m7, [r0+%3]
  1300. movq m4, [r1+%4]
  1301. movq m5, [r3+%4]
  1302. movhps m4, [r2+%4]
  1303. movhps m5, [r4+%4]
  1304. psadbw m2, m6
  1305. psadbw m3, m6
  1306. psadbw m4, m7
  1307. psadbw m5, m7
  1308. paddw m0, m2
  1309. paddw m1, m3
  1310. paddw m0, m4
  1311. paddw m1, m5
  1312. %endmacro
  1313. %macro SAD_X4_4x8P_SSSE3 2
  1314. %if %1==0
  1315. lea r6, [r5*3]
  1316. SAD_X4_START_2x8P_SSSE3
  1317. %else
  1318. SAD_X4_2x8P_SSSE3 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
  1319. %endif
  1320. SAD_X4_2x8P_SSSE3 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
  1321. %if %1 != %2-1
  1322. %if (%1&1) != 0
  1323. add r0, 8*FENC_STRIDE
  1324. %endif
  1325. lea r1, [r1+4*r5]
  1326. lea r2, [r2+4*r5]
  1327. lea r3, [r3+4*r5]
  1328. lea r4, [r4+4*r5]
  1329. %endif
  1330. %endmacro
  1331. %macro SAD_X4_END_SSSE3 0
  1332. mov r0, r6mp
  1333. packssdw m0, m1
  1334. mova [r0], m0
  1335. RET
  1336. %endmacro
  1337. %macro SAD_X3_START_2x16P_AVX2 0
  1338. movu m3, [r0] ; assumes FENC_STRIDE == 16
  1339. movu xm0, [r1]
  1340. movu xm1, [r2]
  1341. movu xm2, [r3]
  1342. vinserti128 m0, m0, [r1+r4], 1
  1343. vinserti128 m1, m1, [r2+r4], 1
  1344. vinserti128 m2, m2, [r3+r4], 1
  1345. psadbw m0, m3
  1346. psadbw m1, m3
  1347. psadbw m2, m3
  1348. %endmacro
  1349. %macro SAD_X3_2x16P_AVX2 3
  1350. movu m3, [r0+%1] ; assumes FENC_STRIDE == 16
  1351. movu xm4, [r1+%2]
  1352. movu xm5, [r2+%2]
  1353. movu xm6, [r3+%2]
  1354. vinserti128 m4, m4, [r1+%3], 1
  1355. vinserti128 m5, m5, [r2+%3], 1
  1356. vinserti128 m6, m6, [r3+%3], 1
  1357. psadbw m4, m3
  1358. psadbw m5, m3
  1359. psadbw m6, m3
  1360. paddw m0, m4
  1361. paddw m1, m5
  1362. paddw m2, m6
  1363. %endmacro
  1364. %macro SAD_X3_4x16P_AVX2 2
  1365. %if %1==0
  1366. lea t0, [r4*3]
  1367. SAD_X3_START_2x16P_AVX2
  1368. %else
  1369. SAD_X3_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r4*0, r4*1
  1370. %endif
  1371. SAD_X3_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r4*2, t0
  1372. %if %1 != %2-1
  1373. %if (%1&1) != 0
  1374. add r0, 8*FENC_STRIDE
  1375. %endif
  1376. lea r1, [r1+4*r4]
  1377. lea r2, [r2+4*r4]
  1378. lea r3, [r3+4*r4]
  1379. %endif
  1380. %endmacro
  1381. %macro SAD_X4_START_2x16P_AVX2 0
  1382. vbroadcasti128 m4, [r0]
  1383. vbroadcasti128 m5, [r0+FENC_STRIDE]
  1384. movu xm0, [r1]
  1385. movu xm1, [r2]
  1386. movu xm2, [r1+r5]
  1387. movu xm3, [r2+r5]
  1388. vinserti128 m0, m0, [r3], 1
  1389. vinserti128 m1, m1, [r4], 1
  1390. vinserti128 m2, m2, [r3+r5], 1
  1391. vinserti128 m3, m3, [r4+r5], 1
  1392. psadbw m0, m4
  1393. psadbw m1, m4
  1394. psadbw m2, m5
  1395. psadbw m3, m5
  1396. paddw m0, m2
  1397. paddw m1, m3
  1398. %endmacro
  1399. %macro SAD_X4_2x16P_AVX2 4
  1400. vbroadcasti128 m6, [r0+%1]
  1401. vbroadcasti128 m7, [r0+%3]
  1402. movu xm2, [r1+%2]
  1403. movu xm3, [r2+%2]
  1404. movu xm4, [r1+%4]
  1405. movu xm5, [r2+%4]
  1406. vinserti128 m2, m2, [r3+%2], 1
  1407. vinserti128 m3, m3, [r4+%2], 1
  1408. vinserti128 m4, m4, [r3+%4], 1
  1409. vinserti128 m5, m5, [r4+%4], 1
  1410. psadbw m2, m6
  1411. psadbw m3, m6
  1412. psadbw m4, m7
  1413. psadbw m5, m7
  1414. paddw m0, m2
  1415. paddw m1, m3
  1416. paddw m0, m4
  1417. paddw m1, m5
  1418. %endmacro
  1419. %macro SAD_X4_4x16P_AVX2 2
  1420. %if %1==0
  1421. lea r6, [r5*3]
  1422. SAD_X4_START_2x16P_AVX2
  1423. %else
  1424. SAD_X4_2x16P_AVX2 FENC_STRIDE*(0+(%1&1)*4), r5*0, FENC_STRIDE*(1+(%1&1)*4), r5*1
  1425. %endif
  1426. SAD_X4_2x16P_AVX2 FENC_STRIDE*(2+(%1&1)*4), r5*2, FENC_STRIDE*(3+(%1&1)*4), r6
  1427. %if %1 != %2-1
  1428. %if (%1&1) != 0
  1429. add r0, 8*FENC_STRIDE
  1430. %endif
  1431. lea r1, [r1+4*r5]
  1432. lea r2, [r2+4*r5]
  1433. lea r3, [r3+4*r5]
  1434. lea r4, [r4+4*r5]
  1435. %endif
  1436. %endmacro
  1437. %macro SAD_X3_END_AVX2 0
  1438. movifnidn r5, r5mp
  1439. packssdw m0, m1 ; 0 0 1 1 0 0 1 1
  1440. packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
  1441. phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
  1442. vextracti128 xm1, m0, 1
  1443. paddd xm0, xm1 ; 0 1 2 _
  1444. mova [r5], xm0
  1445. RET
  1446. %endmacro
  1447. %macro SAD_X4_END_AVX2 0
  1448. mov r0, r6mp
  1449. packssdw m0, m1 ; 0 0 1 1 2 2 3 3
  1450. vextracti128 xm1, m0, 1
  1451. phaddd xm0, xm1 ; 0 1 2 3
  1452. mova [r0], xm0
  1453. RET
  1454. %endmacro
  1455. ;-----------------------------------------------------------------------------
  1456. ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
  1457. ; uint8_t *pix2, intptr_t i_stride, int scores[3] )
  1458. ;-----------------------------------------------------------------------------
  1459. %macro SAD_X_SSE2 4
  1460. cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
  1461. %assign x 0
  1462. %rep %3/4
  1463. SAD_X%1_4x%2P_SSE2 x, %3/4
  1464. %assign x x+1
  1465. %endrep
  1466. SAD_X%1_END_SSE2
  1467. %endmacro
  1468. INIT_XMM sse2
  1469. SAD_X_SSE2 3, 16, 16, 7
  1470. SAD_X_SSE2 3, 16, 8, 7
  1471. SAD_X_SSE2 3, 8, 16, 7
  1472. SAD_X_SSE2 3, 8, 8, 7
  1473. SAD_X_SSE2 3, 8, 4, 7
  1474. SAD_X_SSE2 4, 16, 16, 7
  1475. SAD_X_SSE2 4, 16, 8, 7
  1476. SAD_X_SSE2 4, 8, 16, 7
  1477. SAD_X_SSE2 4, 8, 8, 7
  1478. SAD_X_SSE2 4, 8, 4, 7
  1479. INIT_XMM sse3
  1480. SAD_X_SSE2 3, 16, 16, 7
  1481. SAD_X_SSE2 3, 16, 8, 7
  1482. SAD_X_SSE2 4, 16, 16, 7
  1483. SAD_X_SSE2 4, 16, 8, 7
  1484. %macro SAD_X_SSSE3 3
  1485. cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,8
  1486. %assign x 0
  1487. %rep %3/4
  1488. SAD_X%1_4x%2P_SSSE3 x, %3/4
  1489. %assign x x+1
  1490. %endrep
  1491. SAD_X%1_END_SSSE3
  1492. %endmacro
  1493. INIT_XMM ssse3
  1494. SAD_X_SSE2 3, 16, 16, 7
  1495. SAD_X_SSE2 3, 16, 8, 7
  1496. SAD_X_SSE2 4, 16, 16, 7
  1497. SAD_X_SSE2 4, 16, 8, 7
  1498. SAD_X_SSSE3 4, 8, 16
  1499. SAD_X_SSSE3 4, 8, 8
  1500. SAD_X_SSSE3 4, 8, 4
  1501. INIT_XMM avx
  1502. SAD_X_SSE2 3, 16, 16, 6
  1503. SAD_X_SSE2 3, 16, 8, 6
  1504. SAD_X_SSE2 4, 16, 16, 7
  1505. SAD_X_SSE2 4, 16, 8, 7
  1506. %macro SAD_X_AVX2 4
  1507. cglobal pixel_sad_x%1_%2x%3, 2+%1,3+%1,%4
  1508. %assign x 0
  1509. %rep %3/4
  1510. SAD_X%1_4x%2P_AVX2 x, %3/4
  1511. %assign x x+1
  1512. %endrep
  1513. SAD_X%1_END_AVX2
  1514. %endmacro
  1515. INIT_YMM avx2
  1516. SAD_X_AVX2 3, 16, 16, 7
  1517. SAD_X_AVX2 3, 16, 8, 7
  1518. SAD_X_AVX2 4, 16, 16, 8
  1519. SAD_X_AVX2 4, 16, 8, 8
  1520. %macro SAD_X_W4_AVX512 2 ; x, h
  1521. cglobal pixel_sad_x%1_4x%2, %1+2,%1+3
  1522. mov t1d, 0xa
  1523. kmovb k1, t1d
  1524. lea t1, [3*t0]
  1525. kaddb k2, k1, k1
  1526. kshiftlb k3, k1, 2
  1527. %assign %%i 0
  1528. %rep %2/4
  1529. movu m6, [r0+%%i*64]
  1530. vmovddup m6 {k1}, [r0+%%i*64+32]
  1531. movd xmm2, [r1]
  1532. movd xmm4, [r1+t0]
  1533. vpbroadcastd xmm2 {k1}, [r1+2*t0]
  1534. vpbroadcastd xmm4 {k1}, [r1+t1]
  1535. vpbroadcastd xmm2 {k2}, [r2+t0]
  1536. vpbroadcastd xmm4 {k2}, [r2]
  1537. vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3
  1538. vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2
  1539. vpmovqd s1, m6 ; s0 s2 s1 s3
  1540. movd xmm3, [r3]
  1541. movd xmm5, [r3+t0]
  1542. vpbroadcastd xmm3 {k1}, [r3+2*t0]
  1543. vpbroadcastd xmm5 {k1}, [r3+t1]
  1544. %if %1 == 4
  1545. vpbroadcastd xmm3 {k2}, [r4+t0]
  1546. vpbroadcastd xmm5 {k2}, [r4]
  1547. vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3
  1548. vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2
  1549. %endif
  1550. %if %%i != %2/4-1
  1551. %assign %%j 1
  1552. %rep %1
  1553. lea r%+%%j, [r%+%%j+4*t0]
  1554. %assign %%j %%j+1
  1555. %endrep
  1556. %endif
  1557. pshufd s2, s1, q1032
  1558. psadbw xmm2, s1
  1559. psadbw xmm4, s2
  1560. psadbw xmm3, s1
  1561. psadbw xmm5, s2
  1562. %if %%i
  1563. paddd xmm0, xmm2
  1564. paddd xmm1, xmm3
  1565. paddd xmm0, xmm4
  1566. paddd xmm1, xmm5
  1567. %else
  1568. paddd xmm0, xmm2, xmm4
  1569. paddd xmm1, xmm3, xmm5
  1570. %endif
  1571. %assign %%i %%i+1
  1572. %endrep
  1573. %if %1 == 4
  1574. movifnidn t2, r6mp
  1575. %else
  1576. movifnidn t2, r5mp
  1577. %endif
  1578. packusdw xmm0, xmm1
  1579. mova [t2], xmm0
  1580. RET
  1581. %endmacro
  1582. %macro SAD_X_W8_AVX512 2 ; x, h
  1583. cglobal pixel_sad_x%1_8x%2, %1+2,%1+3
  1584. kxnorb k3, k3, k3
  1585. lea t1, [3*t0]
  1586. kaddb k1, k3, k3
  1587. kshiftlb k2, k3, 2
  1588. kshiftlb k3, k3, 3
  1589. %assign %%i 0
  1590. %rep %2/4
  1591. movddup m6, [r0+%%i*64] ; s0 s0 s1 s1
  1592. movq xm2, [r1]
  1593. movq xm4, [r1+2*t0]
  1594. vpbroadcastq xm2 {k1}, [r2]
  1595. vpbroadcastq xm4 {k1}, [r2+2*t0]
  1596. vpbroadcastq m2 {k2}, [r1+t0]
  1597. vpbroadcastq m4 {k2}, [r1+t1]
  1598. vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1
  1599. vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3
  1600. movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3
  1601. movq xm3, [r3]
  1602. movq xm5, [r3+2*t0]
  1603. %if %1 == 4
  1604. vpbroadcastq xm3 {k1}, [r4]
  1605. vpbroadcastq xm5 {k1}, [r4+2*t0]
  1606. %endif
  1607. vpbroadcastq m3 {k2}, [r3+t0]
  1608. vpbroadcastq m5 {k2}, [r3+t1]
  1609. %if %1 == 4
  1610. vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1
  1611. vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3
  1612. %endif
  1613. %if %%i != %2/4-1
  1614. %assign %%j 1
  1615. %rep %1
  1616. lea r%+%%j, [r%+%%j+4*t0]
  1617. %assign %%j %%j+1
  1618. %endrep
  1619. %endif
  1620. psadbw m2, m6
  1621. psadbw m4, m7
  1622. psadbw m3, m6
  1623. psadbw m5, m7
  1624. ACCUM paddd, 0, 2, %%i
  1625. ACCUM paddd, 1, 3, %%i
  1626. paddd m0, m4
  1627. paddd m1, m5
  1628. %assign %%i %%i+1
  1629. %endrep
  1630. %if %1 == 4
  1631. movifnidn t2, r6mp
  1632. %else
  1633. movifnidn t2, r5mp
  1634. %endif
  1635. packusdw m0, m1
  1636. vextracti128 xm1, m0, 1
  1637. paddd xm0, xm1
  1638. mova [t2], xm0
  1639. RET
  1640. %endmacro
  1641. %macro SAD_X_W16_AVX512 2 ; x, h
  1642. cglobal pixel_sad_x%1_16x%2, %1+2,%1+3
  1643. lea t1, [3*t0]
  1644. %assign %%i 0
  1645. %rep %2/4
  1646. mova m6, [r0+%%i*64] ; s0 s1 s2 s3
  1647. movu xm2, [r3]
  1648. movu xm4, [r3+t0]
  1649. %if %1 == 4
  1650. vinserti128 ym2, [r4+t0], 1
  1651. vinserti128 ym4, [r4], 1
  1652. %endif
  1653. vinserti32x4 m2, [r1+2*t0], 2
  1654. vinserti32x4 m4, [r1+t1], 2
  1655. vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3
  1656. vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2
  1657. vpermq m7, m6, q1032 ; s1 s0 s3 s2
  1658. movu xm3, [r1]
  1659. movu xm5, [r1+t0]
  1660. vinserti128 ym3, [r2+t0], 1
  1661. vinserti128 ym5, [r2], 1
  1662. vinserti32x4 m3, [r3+2*t0], 2
  1663. vinserti32x4 m5, [r3+t1], 2
  1664. %if %1 == 4
  1665. vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3
  1666. vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2
  1667. %endif
  1668. %if %%i != %2/4-1
  1669. %assign %%j 1
  1670. %rep %1
  1671. lea r%+%%j, [r%+%%j+4*t0]
  1672. %assign %%j %%j+1
  1673. %endrep
  1674. %endif
  1675. psadbw m2, m6
  1676. psadbw m4, m7
  1677. psadbw m3, m6
  1678. psadbw m5, m7
  1679. ACCUM paddd, 0, 2, %%i
  1680. ACCUM paddd, 1, 3, %%i
  1681. paddd m0, m4
  1682. paddd m1, m5
  1683. %assign %%i %%i+1
  1684. %endrep
  1685. %if %1 == 4
  1686. movifnidn t2, r6mp
  1687. %else
  1688. movifnidn t2, r5mp
  1689. %endif
  1690. mov t1d, 0x1111
  1691. kmovw k1, t1d
  1692. vshufi32x4 m0, m0, q1032
  1693. paddd m0, m1
  1694. punpckhqdq m1, m0, m0
  1695. paddd m0, m1
  1696. vpcompressd m0 {k1}{z}, m0
  1697. mova [t2], xm0
  1698. RET
  1699. %endmacro
  1700. ; t0 = stride, t1 = tmp/stride3, t2 = scores
  1701. %if WIN64
  1702. %define s1 xmm16 ; xmm6 and xmm7 reduces code size, but
  1703. %define s2 xmm17 ; they're callee-saved on win64
  1704. DECLARE_REG_TMP 4, 6, 0
  1705. %else
  1706. %define s1 xmm6
  1707. %define s2 xmm7
  1708. %if ARCH_X86_64
  1709. DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64
  1710. %else
  1711. DECLARE_REG_TMP 4, 5, 0
  1712. %endif
  1713. %endif
  1714. INIT_YMM avx512
  1715. SAD_X_W4_AVX512 3, 4 ; x3_4x4
  1716. SAD_X_W4_AVX512 3, 8 ; x3_4x8
  1717. SAD_X_W8_AVX512 3, 4 ; x3_8x4
  1718. SAD_X_W8_AVX512 3, 8 ; x3_8x8
  1719. SAD_X_W8_AVX512 3, 16 ; x3_8x16
  1720. INIT_ZMM avx512
  1721. SAD_X_W16_AVX512 3, 8 ; x3_16x8
  1722. SAD_X_W16_AVX512 3, 16 ; x3_16x16
  1723. DECLARE_REG_TMP 5, 6, 0
  1724. INIT_YMM avx512
  1725. SAD_X_W4_AVX512 4, 4 ; x4_4x4
  1726. SAD_X_W4_AVX512 4, 8 ; x4_4x8
  1727. SAD_X_W8_AVX512 4, 4 ; x4_8x4
  1728. SAD_X_W8_AVX512 4, 8 ; x4_8x8
  1729. SAD_X_W8_AVX512 4, 16 ; x4_8x16
  1730. INIT_ZMM avx512
  1731. SAD_X_W16_AVX512 4, 8 ; x4_16x8
  1732. SAD_X_W16_AVX512 4, 16 ; x4_16x16
  1733. ;=============================================================================
  1734. ; SAD cacheline split
  1735. ;=============================================================================
; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly halfway between two cachelines is ok, though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
; alignment between registers, much like on archs that have only aligned loads,
; except complicated by the fact that PALIGNR takes only an immediate, not
; a variable alignment.
; It is also possible to hoist the realignment to the macroblock level (keep
; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
; needed for that method often makes it slower.
; sad 16x16 costs on Core2:
; good offsets: 49 cycles (50/64 of all mvs)
; cacheline split: 234 cycles (14/64 of all mvs, amortized: +40 cycles)
; page split: 3600 cycles (14/4096 of all mvs, amortized: +11.5 cycles)
; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
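; As a minimal illustrative sketch of the register realignment described above
; (kept under %if 0 so it is never assembled; the split offset 5 is an
; arbitrary example, and r2 is assumed to already be rounded down to 16-byte
; alignment, as SAD16_CACHELINE_FUNC below does with "and r2, ~15"):
%if 0
    movdqa xmm3, [r2]    ; aligned load covering the low part of the data
    movdqa xmm1, [r2+16] ; aligned load covering the high part
    psrldq xmm3, 5       ; drop the 5 bytes below the wanted start
    pslldq xmm1, 16-5    ; keep only the 5 bytes that continue it
    por    xmm1, xmm3    ; xmm1 now equals an unaligned load of [r2+5]
%endif
; The SSE2 loops below do exactly this with immediate shift counts, generating
; one copy of the loop per possible offset; the SSSE3 loops fuse the two shifts
; and the OR into a single PALIGNR.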
; computed jump assumes this loop is exactly 80 bytes
%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
ALIGN 16
sad_w16_align%1_sse2:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    movdqa xmm3, [r2]
    movdqa xmm4, [r2+r3]
    pslldq xmm1, 16-%1
    pslldq xmm2, 16-%1
    psrldq xmm3, %1
    psrldq xmm4, %1
    por xmm1, xmm3
    por xmm2, xmm4
    psadbw xmm1, [r0]
    psadbw xmm2, [r0+r1]
    paddw xmm0, xmm1
    paddw xmm0, xmm2
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    dec r4
    jg sad_w16_align%1_sse2
    ret
%endmacro
; computed jump assumes this loop is exactly 64 bytes
%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
ALIGN 16
sad_w16_align%1_ssse3:
    movdqa xmm1, [r2+16]
    movdqa xmm2, [r2+r3+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r3], %1
    psadbw xmm1, [r0]
    psadbw xmm2, [r0+r1]
    paddw xmm0, xmm1
    paddw xmm0, xmm2
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    dec r4
    jg sad_w16_align%1_ssse3
    ret
%endmacro
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
cglobal pixel_sad_16x%2_cache64_%1
    mov eax, r2m
    and eax, 0x37
    cmp eax, 0x30
    jle pixel_sad_16x%2_sse2
    PROLOGUE 4,6
    mov r4d, r2d
    and r4d, 15
%ifidn %1, ssse3
    shl r4d, 6 ; code size = 64
%else
    lea r4, [r4*5]
    shl r4d, 4 ; code size = 80
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
    lea r5, [sad_w16_addr]
    add r5, r4
%else
    lea r5, [sad_w16_addr + r4]
%endif
    and r2, ~15
    mov r4d, %2/2
    pxor xmm0, xmm0
    call r5
    MOVHL xmm1, xmm0
    paddw xmm0, xmm1
    movd eax, xmm0
    RET
%endmacro
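; The computed jump above works because each loop copy occupies a fixed number
; of code bytes (64 for ssse3, 80 for sse2, as asserted next to the loop
; macros), so sad_w16_align2_%1 == sad_w16_align1_%1 + codesize and
; sad_w16_addr is the address a hypothetical align0 copy would have.
; A minimal sketch of the ssse3 case (kept under %if 0, ignoring the PIC
; variant; the real prologue reloads r4d with the row count before the call):
%if 0
    mov r4d, r2d
    and r4d, 15 ; misalignment of the reference pointer (nonzero on this path)
    shl r4d, 6  ; times 64 bytes of code per loop copy
    lea r5, [sad_w16_align1_ssse3-64+r4]
    call r5     ; lands on sad_w16_align<r2&15>_ssse3
%endif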
%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
    mov eax, r2m
    and eax, 0x17|%1|(%4>>1)
    cmp eax, 0x10|%1|(%4>>1)
    jle pixel_sad_%1x%2_mmx2
    and eax, 7
    shl eax, 3
    movd mm6, [sw_64]
    movd mm7, eax
    psubw mm6, mm7
    PROLOGUE 4,5
    and r2, ~7
    mov r4d, %3
    pxor mm0, mm0
%endmacro
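; Unlike PALIGNR/PSRLDQ, the MMX shifts take their count from a register
; (mm6/mm7 as set up above), so a single loop handles every misalignment and
; no computed jump is needed here. A minimal sketch of the combine used by the
; loops below (not assembled):
%if 0
    movq mm1, [r2]   ; aligned qword below the wanted start
    movq mm2, [r2+8] ; following aligned qword
    psrlq mm1, mm7   ; mm7 = 8*(original r2 argument & 7) bits: drop low bytes
    psllq mm2, mm6   ; mm6 = 64 - mm7 bits: keep the continuation bytes
    por mm1, mm2     ; mm1 == unaligned load of [r2 + (original r2 & 7)]
%endif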
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_16x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
    movq mm1, [r2]
    movq mm2, [r2+8]
    movq mm3, [r2+16]
    movq mm4, mm2
    psrlq mm1, mm7
    psllq mm2, mm6
    psllq mm3, mm6
    psrlq mm4, mm7
    por mm1, mm2
    por mm3, mm4
    psadbw mm1, [r0]
    psadbw mm3, [r0+8]
    paddw mm0, mm1
    paddw mm0, mm3
    add r2, r3
    add r0, r1
    dec r4
    jg .loop
    movd eax, mm0
    RET
%endmacro
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
cglobal pixel_sad_8x%1_cache%2_mmx2
    SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
    movq mm1, [r2+8]
    movq mm2, [r2+r3+8]
    movq mm3, [r2]
    movq mm4, [r2+r3]
    psllq mm1, mm6
    psllq mm2, mm6
    psrlq mm3, mm7
    psrlq mm4, mm7
    por mm1, mm3
    por mm2, mm4
    psadbw mm1, [r0]
    psadbw mm2, [r0+r1]
    paddw mm0, mm1
    paddw mm0, mm2
    lea r2, [r2+2*r3]
    lea r0, [r0+2*r1]
    dec r4
    jg .loop
    movd eax, mm0
    RET
%endmacro
; sad_x3/x4_cache64: check each mv.
; if they're all within a cacheline, use normal sad_x3/x4.
; otherwise, send them individually to sad_cache64.
%macro CHECK_SPLIT 3 ; pix, width, cacheline
    mov eax, %1
    and eax, 0x17|%2|(%3>>1)
    cmp eax, 0x10|%2|(%3>>1)
    jg .split
%endmacro
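; For example, CHECK_SPLIT r1m, 16, 64 expands to the same test used in
; SAD16_CACHELINE_FUNC above (sketch only, kept under %if 0):
%if 0
    mov eax, r1m
    and eax, 0x37 ; 0x17|16|(64>>1): offset mod 64, with bit 3 cleared
    cmp eax, 0x30 ; 0x10|16|(64>>1)
    jg .split     ; last 16-byte slot of the line and not 8-byte aligned
%endif
; i.e. the split path is taken only when that candidate's 16-byte loads would
; cross a 64-byte cacheline somewhere other than the tolerated 8-byte midpoint.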
%macro SADX3_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x3_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    jmp pixel_sad_x3_%1x%2_%4
.split:
%if ARCH_X86_64
    PROLOGUE 6,9
    push r3
    push r2
%if WIN64
    movsxd r4, r4d
    sub rsp, 40 ; shadow space and alignment
%endif
    mov r2, r1
    mov r1, FENC_STRIDE
    mov r3, r4
    mov r7, r0
    mov r8, r5
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8], eax
%if WIN64
    mov r2, [rsp+40+0*8]
%else
    pop r2
%endif
    mov r0, r7
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8+4], eax
%if WIN64
    mov r2, [rsp+40+1*8]
%else
    pop r2
%endif
    mov r0, r7
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8+8], eax
%if WIN64
    add rsp, 40+2*8
%endif
    RET
%else
    push edi
    mov edi, [esp+28]
    push dword [esp+24]
    push dword [esp+16]
    push dword 16
    push dword [esp+20]
    call pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+32]
    mov [edi], eax
    mov [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+36]
    mov [edi+4], eax
    mov [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov [edi+8], eax
    add esp, 16
    pop edi
    ret
%endif
%endmacro
%macro SADX4_CACHELINE_FUNC 6 ; width, height, cacheline, normal_ver, split_ver, name
cglobal pixel_sad_x4_%1x%2_cache%3_%6
    CHECK_SPLIT r1m, %1, %3
    CHECK_SPLIT r2m, %1, %3
    CHECK_SPLIT r3m, %1, %3
    CHECK_SPLIT r4m, %1, %3
    jmp pixel_sad_x4_%1x%2_%4
.split:
%if ARCH_X86_64
    PROLOGUE 6,9
    mov r8, r6mp
    push r4
    push r3
    push r2
%if WIN64
    sub rsp, 32 ; shadow space
%endif
    mov r2, r1
    mov r1, FENC_STRIDE
    mov r3, r5
    mov r7, r0
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8], eax
%if WIN64
    mov r2, [rsp+32+0*8]
%else
    pop r2
%endif
    mov r0, r7
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8+4], eax
%if WIN64
    mov r2, [rsp+32+1*8]
%else
    pop r2
%endif
    mov r0, r7
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8+8], eax
%if WIN64
    mov r2, [rsp+32+2*8]
%else
    pop r2
%endif
    mov r0, r7
    call pixel_sad_%1x%2_cache%3_%5
    mov [r8+12], eax
%if WIN64
    add rsp, 32+3*8
%endif
    RET
%else
    push edi
    mov edi, [esp+32]
    push dword [esp+28]
    push dword [esp+16]
    push dword 16
    push dword [esp+20]
    call pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+32]
    mov [edi], eax
    mov [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+36]
    mov [edi+4], eax
    mov [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov ecx, [esp+40]
    mov [edi+8], eax
    mov [esp+8], ecx
    call pixel_sad_%1x%2_cache%3_%5
    mov [edi+12], eax
    add esp, 16
    pop edi
    ret
%endif
%endmacro
%macro SADX34_CACHELINE_FUNC 1+
    SADX3_CACHELINE_FUNC %1
    SADX4_CACHELINE_FUNC %1
%endmacro

; instantiate the aligned sads

INIT_MMX
%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC_MMX2 8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2 4, 32
SAD8_CACHELINE_FUNC_MMX2 8, 32
SAD8_CACHELINE_FUNC_MMX2 16, 32
SAD16_CACHELINE_FUNC_MMX2 8, 64
SAD16_CACHELINE_FUNC_MMX2 16, 64
%endif ; !ARCH_X86_64
SAD8_CACHELINE_FUNC_MMX2 4, 64
SAD8_CACHELINE_FUNC_MMX2 8, 64
SAD8_CACHELINE_FUNC_MMX2 16, 64
%if ARCH_X86_64 == 0
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 8, 8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 8, 64, mmx2, mmx2, mmx2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSE2 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2, sse2
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2, sse2
%endif ; !ARCH_X86_64
SADX34_CACHELINE_FUNC 8, 16, 64, sse2, mmx2, sse2
SAD16_CACHELINE_FUNC ssse3, 8
SAD16_CACHELINE_FUNC ssse3, 16
%assign i 1
%rep 15
SAD16_CACHELINE_LOOP_SSSE3 i
%assign i i+1
%endrep
SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3
SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3