  1. ;*****************************************************************************
  2. ;* deblock-a.asm: x86 deblocking
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2018 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Fiona Glaser <fiona@x264.com>
  8. ;* Oskar Arvidsson <oskar@irock.se>
  9. ;*
  10. ;* This program is free software; you can redistribute it and/or modify
  11. ;* it under the terms of the GNU General Public License as published by
  12. ;* the Free Software Foundation; either version 2 of the License, or
  13. ;* (at your option) any later version.
  14. ;*
  15. ;* This program is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. ;* GNU General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU General Public License
  21. ;* along with this program; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  23. ;*
  24. ;* This program is also available under a commercial proprietary license.
  25. ;* For more information, contact us at licensing@x264.com.
  26. ;*****************************************************************************
  27. %include "x86inc.asm"
  28. %include "x86util.asm"
  29. SECTION_RODATA 64
  30. load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5
  31. dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9
  32. dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5
  33. dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9
  34. load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c
  35. dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c
  36. transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
  37. SECTION .text
  38. cextern pb_0
  39. cextern pb_1
  40. cextern pb_3
  41. cextern pb_a1
  42. cextern pw_2
  43. cextern pw_4
  44. cextern pw_00ff
  45. cextern pw_pixel_max
  46. cextern pb_unpackbd1
  47. %if HIGH_BIT_DEPTH
  48. ; out: %4 = |%1-%2|-%3
  49. ; clobbers: %5
  50. %macro ABS_SUB 5
  51. psubusw %5, %2, %1
  52. psubusw %4, %1, %2
  53. por %4, %5
  54. psubw %4, %3
  55. %endmacro
  56. ; out: %4 = |%1-%2|<%3
  57. %macro DIFF_LT 5
  58. psubusw %4, %2, %1
  59. psubusw %5, %1, %2
  60. por %5, %4 ; |%1-%2|
  61. pxor %4, %4
  62. psubw %5, %3 ; |%1-%2|-%3
  63. pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
  64. %endmacro
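; For reference: with unsigned saturating subtraction, max(a-b,0)|max(b-a,0)
; equals |a-b|, so ABS_SUB/DIFF_LT obtain the absolute difference without a
; dedicated instruction; DIFF_LT then evaluates "|a-b| < c" as "|a-b| - c < 0"
; with a signed compare, which is safe for the word-sized pixel range used here.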
  65. %macro LOAD_AB 4
  66. movd %1, %3
  67. movd %2, %4
  68. SPLATW %1, %1
  69. SPLATW %2, %2
  70. %endmacro
  71. ; in: %2=tc reg
  72. ; out: %1=splatted tc
  73. %macro LOAD_TC 2
  74. %if mmsize == 8
  75. pshufw %1, [%2-1], 0
  76. %else
  77. movd %1, [%2]
  78. punpcklbw %1, %1
  79. pshuflw %1, %1, q1100
  80. pshufd %1, %1, q1100
  81. %endif
  82. psraw %1, 8
  83. %endmacro
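; Note: each signed tc0[i] byte applies to a group of 4 pixels of the edge, so
; the byte is duplicated across the corresponding word lanes and sign-extended
; (byte self-unpack plus psraw 8); tc0[i] = -1 therefore yields all-ones words,
; which the callers use (via pcmpgtw/pandn or pmaxsw) to disable filtering for
; that group.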
  84. ; in: %1=p1, %2=p0, %3=q0, %4=q1
  85. ; %5=alpha, %6=beta, %7-%9=tmp
  86. ; out: %7=mask
  87. %macro LOAD_MASK 9
  88. ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
  89. ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
  90. pand %8, %9
  91. ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
  92. pxor %7, %7
  93. pand %8, %9
  94. pcmpgtw %7, %8
  95. %endmacro
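; The resulting mask is the per-sample H.264 filter condition, i.e. it is set
; only where all of
;   |p0-q0| < alpha,  |p1-p0| < beta,  |q1-q0| < beta
; hold: each ABS_SUB leaves a value that is negative exactly when its test
; passes, the pands keep the sign bit only if all three are negative, and
; pcmpgtw against zero turns that sign into a full word mask.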
  96. ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
  97. ; out: %1=p0', %2=q0'
  98. %macro DEBLOCK_P0_Q0 7
  99. psubw %3, %4
  100. pxor %7, %7
  101. paddw %3, [pw_4]
  102. psubw %7, %5
  103. psubw %6, %2, %1
  104. psllw %6, 2
  105. paddw %3, %6
  106. psraw %3, 3
  107. mova %6, [pw_pixel_max]
  108. CLIPW %3, %7, %5
  109. pxor %7, %7
  110. paddw %1, %3
  111. psubw %2, %3
  112. CLIPW %1, %7, %6
  113. CLIPW %2, %7, %6
  114. %endmacro
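; Scalar reference for the normal (bS<4) p0/q0 update, for illustration:
;   delta = Clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )
;   p0'   = Clip1( p0 + delta ),  q0' = Clip1( q0 - delta )
; The macro builds (p1-q1) + 4*(q0-p0) + 4 in %3, clips it to +/-tc (%5, the
; tc value already masked by the filter conditions) with CLIPW, then applies
; it and clips the results to [0, pixel_max].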
  115. ; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
  116. %macro LUMA_Q1 6
  117. pavgw %6, %3, %4 ; (p0+q0+1)>>1
  118. paddw %1, %6
  119. pxor %6, %6
  120. psraw %1, 1
  121. psubw %6, %5
  122. psubw %1, %2
  123. CLIPW %1, %6, %5
  124. paddw %1, %2
  125. %endmacro
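; Scalar reference for the p1/q1 update performed here (p side shown; the q
; side is symmetric), illustration only:
;   p1' = p1 + Clip3( -tc0, tc0, ((p2 + ((p0+q0+1)>>1)) >> 1) - p1 )
; %1 arrives holding p2 (the "x2" operand), %2 is p1, and %5 carries tc0
; already masked by the per-sample conditions.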
  126. %macro LUMA_DEBLOCK_ONE 3
  127. DIFF_LT m5, %1, bm, m4, m6
  128. pxor m6, m6
  129. mova %3, m4
  130. pcmpgtw m6, tcm
  131. pand m4, tcm
  132. pandn m6, m7
  133. pand m4, m6
  134. LUMA_Q1 m5, %2, m1, m2, m4, m6
  135. %endmacro
  136. %macro LUMA_H_STORE 2
  137. %if mmsize == 8
  138. movq [r0-4], m0
  139. movq [r0+r1-4], m1
  140. movq [r0+r1*2-4], m2
  141. movq [r0+%2-4], m3
  142. %else
  143. movq [r0-4], m0
  144. movhps [r0+r1-4], m0
  145. movq [r0+r1*2-4], m1
  146. movhps [%1-4], m1
  147. movq [%1+r1-4], m2
  148. movhps [%1+r1*2-4], m2
  149. movq [%1+%2-4], m3
  150. movhps [%1+r1*4-4], m3
  151. %endif
  152. %endmacro
  153. %macro DEBLOCK_LUMA 0
  154. ;-----------------------------------------------------------------------------
  155. ; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
  156. ;-----------------------------------------------------------------------------
  157. cglobal deblock_v_luma, 5,5,8,0-5*mmsize
  158. %define tcm [rsp]
  159. %define ms1 [rsp+mmsize]
  160. %define ms2 [rsp+mmsize*2]
  161. %define am [rsp+mmsize*3]
  162. %define bm [rsp+mmsize*4]
  163. add r1, r1
  164. LOAD_AB m4, m5, r2d, r3d
  165. mov r3, 32/mmsize
  166. mov r2, r0
  167. sub r0, r1
  168. mova am, m4
  169. sub r0, r1
  170. mova bm, m5
  171. sub r0, r1
  172. .loop:
  173. mova m0, [r0+r1]
  174. mova m1, [r0+r1*2]
  175. mova m2, [r2]
  176. mova m3, [r2+r1]
  177. LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
  178. LOAD_TC m6, r4
  179. mova tcm, m6
  180. mova m5, [r0]
  181. LUMA_DEBLOCK_ONE m1, m0, ms1
  182. mova [r0+r1], m5
  183. mova m5, [r2+r1*2]
  184. LUMA_DEBLOCK_ONE m2, m3, ms2
  185. mova [r2+r1], m5
  186. pxor m5, m5
  187. mova m6, tcm
  188. pcmpgtw m5, tcm
  189. psubw m6, ms1
  190. pandn m5, m7
  191. psubw m6, ms2
  192. pand m5, m6
  193. DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
  194. mova [r0+r1*2], m1
  195. mova [r2], m2
  196. add r0, mmsize
  197. add r2, mmsize
  198. add r4, mmsize/8
  199. dec r3
  200. jg .loop
  201. RET
  202. cglobal deblock_h_luma, 5,6,8,0-7*mmsize
  203. %define tcm [rsp]
  204. %define ms1 [rsp+mmsize]
  205. %define ms2 [rsp+mmsize*2]
  206. %define p1m [rsp+mmsize*3]
  207. %define p2m [rsp+mmsize*4]
  208. %define am [rsp+mmsize*5]
  209. %define bm [rsp+mmsize*6]
  210. add r1, r1
  211. LOAD_AB m4, m5, r2d, r3d
  212. mov r3, r1
  213. mova am, m4
  214. add r3, r1
  215. mov r5, 32/mmsize
  216. mova bm, m5
  217. add r3, r1
  218. %if mmsize == 16
  219. mov r2, r0
  220. add r2, r3
  221. %endif
  222. .loop:
  223. %if mmsize == 8
  224. movq m2, [r0-8] ; y q2 q1 q0
  225. movq m7, [r0+0]
  226. movq m5, [r0+r1-8]
  227. movq m3, [r0+r1+0]
  228. movq m0, [r0+r1*2-8]
  229. movq m6, [r0+r1*2+0]
  230. movq m1, [r0+r3-8]
  231. TRANSPOSE4x4W 2, 5, 0, 1, 4
  232. SWAP 2, 7
  233. movq m7, [r0+r3]
  234. TRANSPOSE4x4W 2, 3, 6, 7, 4
  235. %else
  236. movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
  237. movu m0, [r0+r1-8]
  238. movu m2, [r0+r1*2-8]
  239. movu m3, [r2-8]
  240. TRANSPOSE4x4W 5, 0, 2, 3, 6
  241. mova tcm, m3
  242. movu m4, [r2+r1-8]
  243. movu m1, [r2+r1*2-8]
  244. movu m3, [r2+r3-8]
  245. movu m7, [r2+r1*4-8]
  246. TRANSPOSE4x4W 4, 1, 3, 7, 6
  247. mova m6, tcm
  248. punpcklqdq m6, m7
  249. punpckhqdq m5, m4
  250. SBUTTERFLY qdq, 0, 1, 7
  251. SBUTTERFLY qdq, 2, 3, 7
  252. %endif
  253. mova p2m, m6
  254. LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
  255. LOAD_TC m6, r4
  256. mova tcm, m6
  257. LUMA_DEBLOCK_ONE m1, m0, ms1
  258. mova p1m, m5
  259. mova m5, p2m
  260. LUMA_DEBLOCK_ONE m2, m3, ms2
  261. mova p2m, m5
  262. pxor m5, m5
  263. mova m6, tcm
  264. pcmpgtw m5, tcm
  265. psubw m6, ms1
  266. pandn m5, m7
  267. psubw m6, ms2
  268. pand m5, m6
  269. DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
  270. mova m0, p1m
  271. mova m3, p2m
  272. TRANSPOSE4x4W 0, 1, 2, 3, 4
  273. LUMA_H_STORE r2, r3
  274. add r4, mmsize/8
  275. lea r0, [r0+r1*(mmsize/2)]
  276. lea r2, [r2+r1*(mmsize/2)]
  277. dec r5
  278. jg .loop
  279. RET
  280. %endmacro
  281. %if ARCH_X86_64
  282. ; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
  283. ; m12=alpha, m13=beta
  284. ; out: m0=p1', m3=q1', m1=p0', m2=q0'
  285. ; clobbers: m4, m5, m6, m7, m10, m11, m14
  286. %macro DEBLOCK_LUMA_INTER_SSE2 0
  287. LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
  288. LOAD_TC m6, r4
  289. DIFF_LT m8, m1, m13, m10, m4
  290. DIFF_LT m9, m2, m13, m11, m4
  291. pand m6, m7
  292. mova m14, m6
  293. pxor m4, m4
  294. pcmpgtw m6, m4
  295. pand m6, m14
  296. mova m5, m10
  297. pand m5, m6
  298. LUMA_Q1 m8, m0, m1, m2, m5, m4
  299. mova m5, m11
  300. pand m5, m6
  301. LUMA_Q1 m9, m3, m1, m2, m5, m4
  302. pxor m4, m4
  303. psubw m6, m10
  304. pcmpgtw m4, m14
  305. pandn m4, m7
  306. psubw m6, m11
  307. pand m4, m6
  308. DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
  309. SWAP 0, 8
  310. SWAP 3, 9
  311. %endmacro
  312. %macro DEBLOCK_LUMA_64 0
  313. cglobal deblock_v_luma, 5,5,15
  314. %define p2 m8
  315. %define p1 m0
  316. %define p0 m1
  317. %define q0 m2
  318. %define q1 m3
  319. %define q2 m9
  320. %define mask0 m7
  321. %define mask1 m10
  322. %define mask2 m11
  323. add r1, r1
  324. LOAD_AB m12, m13, r2d, r3d
  325. mov r2, r0
  326. sub r0, r1
  327. sub r0, r1
  328. sub r0, r1
  329. mov r3, 2
  330. .loop:
  331. mova p2, [r0]
  332. mova p1, [r0+r1]
  333. mova p0, [r0+r1*2]
  334. mova q0, [r2]
  335. mova q1, [r2+r1]
  336. mova q2, [r2+r1*2]
  337. DEBLOCK_LUMA_INTER_SSE2
  338. mova [r0+r1], p1
  339. mova [r0+r1*2], p0
  340. mova [r2], q0
  341. mova [r2+r1], q1
  342. add r0, mmsize
  343. add r2, mmsize
  344. add r4, 2
  345. dec r3
  346. jg .loop
  347. RET
  348. cglobal deblock_h_luma, 5,7,15
  349. add r1, r1
  350. LOAD_AB m12, m13, r2d, r3d
  351. mov r2, r1
  352. add r2, r1
  353. add r2, r1
  354. mov r5, r0
  355. add r5, r2
  356. mov r6, 2
  357. .loop:
  358. movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
  359. movu m0, [r0+r1-8]
  360. movu m2, [r0+r1*2-8]
  361. movu m9, [r5-8]
  362. movu m5, [r5+r1-8]
  363. movu m1, [r5+r1*2-8]
  364. movu m3, [r5+r2-8]
  365. movu m7, [r5+r1*4-8]
  366. TRANSPOSE4x4W 8, 0, 2, 9, 10
  367. TRANSPOSE4x4W 5, 1, 3, 7, 10
  368. punpckhqdq m8, m5
  369. SBUTTERFLY qdq, 0, 1, 10
  370. SBUTTERFLY qdq, 2, 3, 10
  371. punpcklqdq m9, m7
  372. DEBLOCK_LUMA_INTER_SSE2
  373. TRANSPOSE4x4W 0, 1, 2, 3, 4
  374. LUMA_H_STORE r5, r2
  375. add r4, 2
  376. lea r0, [r0+r1*8]
  377. lea r5, [r5+r1*8]
  378. dec r6
  379. jg .loop
  380. RET
  381. %endmacro
  382. INIT_XMM sse2
  383. DEBLOCK_LUMA_64
  384. INIT_XMM avx
  385. DEBLOCK_LUMA_64
  386. %endif
  387. %macro SWAPMOVA 2
  388. %ifnum sizeof%1
  389. SWAP %1, %2
  390. %else
  391. mova %1, %2
  392. %endif
  393. %endmacro
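; SWAPMOVA %1, %2: if the destination is a register (sizeof%1 is numeric) the
; copy is done by register renaming via SWAP; otherwise it falls back to a
; real mova store.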
  394. ; in: t0-t2: tmp registers
  395. ; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
  396. ; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
  397. %macro LUMA_INTRA_P012 12 ; p0..p3 in memory
  398. %if ARCH_X86_64
  399. paddw t0, %3, %2
  400. mova t2, %4
  401. paddw t2, %3
  402. %else
  403. mova t0, %3
  404. mova t2, %4
  405. paddw t0, %2
  406. paddw t2, %3
  407. %endif
  408. paddw t0, %1
  409. paddw t2, t2
  410. paddw t0, %5
  411. paddw t2, %9
  412. paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
  413. paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
  414. psrlw t2, 3
  415. psrlw t1, t0, 2
  416. psubw t2, %3
  417. psubw t1, %2
  418. pand t2, %8
  419. pand t1, %8
  420. paddw t2, %3
  421. paddw t1, %2
  422. SWAPMOVA %11, t1
  423. psubw t1, t0, %3
  424. paddw t0, t0
  425. psubw t1, %5
  426. psubw t0, %3
  427. paddw t1, %6
  428. paddw t1, %2
  429. paddw t0, %6
  430. psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
  431. psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
  432. pxor t0, t1
  433. pxor t1, %1
  434. pand t0, %8
  435. pand t1, %7
  436. pxor t0, t1
  437. pxor t0, %1
  438. SWAPMOVA %10, t0
  439. SWAPMOVA %12, t2
  440. %endmacro
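; For reference (p side shown; the q side reuses the macro with p/q swapped):
; where the extra strong-filter mask (%8) is set the outputs are
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
; where only the basic mask (%7) is set, p0' = (2*p1 + p0 + q1 + 2) >> 2, and
; elsewhere p0 is left unchanged; the pxor/pand/pxor tail blends these
; candidates without branches.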
  441. %macro LUMA_INTRA_INIT 1
  442. %define t0 m4
  443. %define t1 m5
  444. %define t2 m6
  445. %define t3 m7
  446. %assign i 4
  447. %rep %1
  448. CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
  449. %assign i i+1
  450. %endrep
  451. add r1, r1
  452. %endmacro
  453. ; in: %1-%3=tmp, %4=p2, %5=q2
  454. %macro LUMA_INTRA_INTER 5
  455. LOAD_AB t0, t1, r2d, r3d
  456. mova %1, t0
  457. LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
  458. %if ARCH_X86_64
  459. mova %2, t0 ; mask0
  460. psrlw t3, %1, 2
  461. %else
  462. mova t3, %1
  463. mova %2, t0 ; mask0
  464. psrlw t3, 2
  465. %endif
  466. paddw t3, [pw_2] ; alpha/4+2
  467. DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
  468. pand t2, %2
  469. mova t3, %5 ; q2
  470. mova %1, t2 ; mask1
  471. DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
  472. pand t2, %1
  473. mova t3, %4 ; p2
  474. mova %3, t2 ; mask1q
  475. DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
  476. pand t2, %1
  477. mova %1, t2 ; mask1p
  478. %endmacro
  479. %macro LUMA_H_INTRA_LOAD 0
  480. %if mmsize == 8
  481. movu t0, [r0-8]
  482. movu t1, [r0+r1-8]
  483. movu m0, [r0+r1*2-8]
  484. movu m1, [r0+r4-8]
  485. TRANSPOSE4x4W 4, 5, 0, 1, 2
  486. mova t4, t0 ; p3
  487. mova t5, t1 ; p2
  488. movu m2, [r0]
  489. movu m3, [r0+r1]
  490. movu t0, [r0+r1*2]
  491. movu t1, [r0+r4]
  492. TRANSPOSE4x4W 2, 3, 4, 5, 6
  493. mova t6, t0 ; q2
  494. mova t7, t1 ; q3
  495. %else
  496. movu t0, [r0-8]
  497. movu t1, [r0+r1-8]
  498. movu m0, [r0+r1*2-8]
  499. movu m1, [r0+r5-8]
  500. movu m2, [r4-8]
  501. movu m3, [r4+r1-8]
  502. movu t2, [r4+r1*2-8]
  503. movu t3, [r4+r5-8]
  504. TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
  505. mova t4, t0 ; p3
  506. mova t5, t1 ; p2
  507. mova t6, t2 ; q2
  508. mova t7, t3 ; q3
  509. %endif
  510. %endmacro
  511. ; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
  512. %macro LUMA_H_INTRA_STORE 9
  513. %if mmsize == 8
  514. TRANSPOSE4x4W %1, %2, %3, %4, %9
  515. movq [r0-8], m%1
  516. movq [r0+r1-8], m%2
  517. movq [r0+r1*2-8], m%3
  518. movq [r0+r4-8], m%4
  519. movq m%1, %8
  520. TRANSPOSE4x4W %5, %6, %7, %1, %9
  521. movq [r0], m%5
  522. movq [r0+r1], m%6
  523. movq [r0+r1*2], m%7
  524. movq [r0+r4], m%1
  525. %else
  526. TRANSPOSE2x4x4W %1, %2, %3, %4, %9
  527. movq [r0-8], m%1
  528. movq [r0+r1-8], m%2
  529. movq [r0+r1*2-8], m%3
  530. movq [r0+r5-8], m%4
  531. movhps [r4-8], m%1
  532. movhps [r4+r1-8], m%2
  533. movhps [r4+r1*2-8], m%3
  534. movhps [r4+r5-8], m%4
  535. %ifnum %8
  536. SWAP %1, %8
  537. %else
  538. mova m%1, %8
  539. %endif
  540. TRANSPOSE2x4x4W %5, %6, %7, %1, %9
  541. movq [r0], m%5
  542. movq [r0+r1], m%6
  543. movq [r0+r1*2], m%7
  544. movq [r0+r5], m%1
  545. movhps [r4], m%5
  546. movhps [r4+r1], m%6
  547. movhps [r4+r1*2], m%7
  548. movhps [r4+r5], m%1
  549. %endif
  550. %endmacro
  551. %if ARCH_X86_64
  552. ;-----------------------------------------------------------------------------
  553. ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
  554. ;-----------------------------------------------------------------------------
  555. %macro DEBLOCK_LUMA_INTRA_64 0
  556. cglobal deblock_v_luma_intra, 4,7,16
  557. %define t0 m1
  558. %define t1 m2
  559. %define t2 m4
  560. %define p2 m8
  561. %define p1 m9
  562. %define p0 m10
  563. %define q0 m11
  564. %define q1 m12
  565. %define q2 m13
  566. %define aa m5
  567. %define bb m14
  568. add r1, r1
  569. lea r4, [r1*4]
  570. lea r5, [r1*3] ; 3*stride
  571. neg r4
  572. add r4, r0 ; pix-4*stride
  573. mov r6, 2
  574. mova m0, [pw_2]
  575. LOAD_AB aa, bb, r2d, r3d
  576. .loop:
  577. mova p2, [r4+r1]
  578. mova p1, [r4+2*r1]
  579. mova p0, [r4+r5]
  580. mova q0, [r0]
  581. mova q1, [r0+r1]
  582. mova q2, [r0+2*r1]
  583. LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
  584. mova t2, aa
  585. psrlw t2, 2
  586. paddw t2, m0 ; alpha/4+2
  587. DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
  588. DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
  589. DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
  590. pand m6, m3
  591. pand m7, m6
  592. pand m6, t1
  593. LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
  594. LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
  595. add r0, mmsize
  596. add r4, mmsize
  597. dec r6
  598. jg .loop
  599. RET
  600. ;-----------------------------------------------------------------------------
  601. ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
  602. ;-----------------------------------------------------------------------------
  603. cglobal deblock_h_luma_intra, 4,7,16
  604. %define t0 m15
  605. %define t1 m14
  606. %define t2 m2
  607. %define q3 m5
  608. %define q2 m8
  609. %define q1 m9
  610. %define q0 m10
  611. %define p0 m11
  612. %define p1 m12
  613. %define p2 m13
  614. %define p3 m4
  615. %define spill [rsp]
  616. %assign pad 24-(stack_offset&15)
  617. SUB rsp, pad
  618. add r1, r1
  619. lea r4, [r1*4]
  620. lea r5, [r1*3] ; 3*stride
  621. add r4, r0 ; pix+4*stride
  622. mov r6, 2
  623. mova m0, [pw_2]
  624. .loop:
  625. movu q3, [r0-8]
  626. movu q2, [r0+r1-8]
  627. movu q1, [r0+r1*2-8]
  628. movu q0, [r0+r5-8]
  629. movu p0, [r4-8]
  630. movu p1, [r4+r1-8]
  631. movu p2, [r4+r1*2-8]
  632. movu p3, [r4+r5-8]
  633. TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
  634. LOAD_AB m1, m2, r2d, r3d
  635. LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
  636. psrlw m1, 2
  637. paddw m1, m0 ; alpha/4+2
  638. DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
  639. DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
  640. DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
  641. pand m6, m3
  642. pand m7, m6
  643. pand m6, t1
  644. mova spill, q3
  645. LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
  646. LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
  647. mova m7, spill
  648. LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
  649. lea r0, [r0+r1*8]
  650. lea r4, [r4+r1*8]
  651. dec r6
  652. jg .loop
  653. ADD rsp, pad
  654. RET
  655. %endmacro
  656. INIT_XMM sse2
  657. DEBLOCK_LUMA_INTRA_64
  658. INIT_XMM avx
  659. DEBLOCK_LUMA_INTRA_64
  660. %endif
  661. %macro DEBLOCK_LUMA_INTRA 0
  662. ;-----------------------------------------------------------------------------
  663. ; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
  664. ;-----------------------------------------------------------------------------
  665. cglobal deblock_v_luma_intra, 4,7,8,0-3*mmsize
  666. LUMA_INTRA_INIT 3
  667. lea r4, [r1*4]
  668. lea r5, [r1*3]
  669. neg r4
  670. add r4, r0
  671. mov r6, 32/mmsize
  672. .loop:
  673. mova m0, [r4+r1*2] ; p1
  674. mova m1, [r4+r5] ; p0
  675. mova m2, [r0] ; q0
  676. mova m3, [r0+r1] ; q1
  677. LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
  678. LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
  679. mova t3, [r0+r1*2] ; q2
  680. LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
  681. add r0, mmsize
  682. add r4, mmsize
  683. dec r6
  684. jg .loop
  685. RET
  686. ;-----------------------------------------------------------------------------
  687. ; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
  688. ;-----------------------------------------------------------------------------
  689. cglobal deblock_h_luma_intra, 4,7,8,0-8*mmsize
  690. LUMA_INTRA_INIT 8
  691. %if mmsize == 8
  692. lea r4, [r1*3]
  693. mov r5, 32/mmsize
  694. %else
  695. lea r4, [r1*4]
  696. lea r5, [r1*3] ; 3*stride
  697. add r4, r0 ; pix+4*stride
  698. mov r6, 32/mmsize
  699. %endif
  700. .loop:
  701. LUMA_H_INTRA_LOAD
  702. LUMA_INTRA_INTER t8, t9, t10, t5, t6
  703. LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
  704. mova t3, t6 ; q2
  705. LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
  706. mova m2, t4
  707. mova m0, t11
  708. mova m1, t5
  709. mova m3, t8
  710. mova m6, t6
  711. LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
  712. lea r0, [r0+r1*(mmsize/2)]
  713. %if mmsize == 8
  714. dec r5
  715. %else
  716. lea r4, [r4+r1*(mmsize/2)]
  717. dec r6
  718. %endif
  719. jg .loop
  720. RET
  721. %endmacro
  722. %if ARCH_X86_64 == 0
  723. INIT_MMX mmx2
  724. DEBLOCK_LUMA
  725. DEBLOCK_LUMA_INTRA
  726. INIT_XMM sse2
  727. DEBLOCK_LUMA
  728. DEBLOCK_LUMA_INTRA
  729. INIT_XMM avx
  730. DEBLOCK_LUMA
  731. DEBLOCK_LUMA_INTRA
  732. %endif
  733. %endif ; HIGH_BIT_DEPTH
  734. %if HIGH_BIT_DEPTH == 0
  735. ; expands to [base],...,[base+7*stride]
  736. %define PASS8ROWS(base, base3, stride, stride3) \
  737. [base], [base+stride], [base+stride*2], [base3], \
  738. [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
  739. %define PASS8ROWS(base, base3, stride, stride3, offset) \
  740. PASS8ROWS(base+offset, base3+offset, stride, stride3)
  741. ; in: 4 rows of 8 bytes in m0..m3
  742. ; out: 8 rows of 4 bytes in %1..%8
  743. %macro TRANSPOSE8x4B_STORE 8
  744. punpckhdq m4, m0, m0
  745. punpckhdq m5, m1, m1
  746. punpckhdq m6, m2, m2
  747. punpcklbw m0, m1
  748. punpcklbw m2, m3
  749. punpcklwd m1, m0, m2
  750. punpckhwd m0, m2
  751. movd %1, m1
  752. punpckhdq m1, m1
  753. movd %2, m1
  754. movd %3, m0
  755. punpckhdq m0, m0
  756. movd %4, m0
  757. punpckhdq m3, m3
  758. punpcklbw m4, m5
  759. punpcklbw m6, m3
  760. punpcklwd m5, m4, m6
  761. punpckhwd m4, m6
  762. movd %5, m5
  763. punpckhdq m5, m5
  764. movd %6, m5
  765. movd %7, m4
  766. punpckhdq m4, m4
  767. movd %8, m4
  768. %endmacro
  769. ; in: 8 rows of 4 bytes in %9..%10
  770. ; out: 8 rows of 4 bytes in %1..%8
  771. %macro STORE_8x4B 10
  772. movd %1, %9
  773. pextrd %2, %9, 1
  774. pextrd %3, %9, 2
  775. pextrd %4, %9, 3
  776. movd %5, %10
  777. pextrd %6, %10, 1
  778. pextrd %7, %10, 2
  779. pextrd %8, %10, 3
  780. %endmacro
  781. ; in: 4 rows of 4 words in %1..%4
  782. ; out: 4 rows of 4 words in m0..m3
  783. ; clobbers: m4
  784. %macro TRANSPOSE4x4W_LOAD 4-8
  785. %if mmsize==8
  786. SWAP 1, 4, 2, 3
  787. movq m0, %1
  788. movq m1, %2
  789. movq m2, %3
  790. movq m3, %4
  791. TRANSPOSE4x4W 0, 1, 2, 3, 4
  792. %else
  793. movq m0, %1
  794. movq m2, %2
  795. movq m1, %3
  796. movq m3, %4
  797. punpcklwd m0, m2
  798. punpcklwd m1, m3
  799. mova m2, m0
  800. punpckldq m0, m1
  801. punpckhdq m2, m1
  802. MOVHL m1, m0
  803. MOVHL m3, m2
  804. %endif
  805. %endmacro
  806. ; in: 2 rows of 4 words in m1..m2
  807. ; out: 4 rows of 2 words in %1..%4
  808. ; clobbers: m0, m1
  809. %macro TRANSPOSE4x2W_STORE 4-8
  810. %if mmsize==8
  811. punpckhwd m0, m1, m2
  812. punpcklwd m1, m2
  813. %else
  814. punpcklwd m1, m2
  815. MOVHL m0, m1
  816. %endif
  817. movd %3, m0
  818. movd %1, m1
  819. psrlq m1, 32
  820. psrlq m0, 32
  821. movd %2, m1
  822. movd %4, m0
  823. %endmacro
  824. ; in: 4/8 rows of 4 words in %1..%8
  825. ; out: 4 rows of 4/8 words in m0..m3
  826. ; clobbers: m4, m5, m6, m7
  827. %macro TRANSPOSE4x8W_LOAD 8
  828. %if mmsize==8
  829. TRANSPOSE4x4W_LOAD %1, %2, %3, %4
  830. %else
  831. movq m0, %1
  832. movq m2, %2
  833. movq m1, %3
  834. movq m3, %4
  835. punpcklwd m0, m2
  836. punpcklwd m1, m3
  837. punpckhdq m2, m0, m1
  838. punpckldq m0, m1
  839. movq m4, %5
  840. movq m6, %6
  841. movq m5, %7
  842. movq m7, %8
  843. punpcklwd m4, m6
  844. punpcklwd m5, m7
  845. punpckhdq m6, m4, m5
  846. punpckldq m4, m5
  847. punpckhqdq m1, m0, m4
  848. punpckhqdq m3, m2, m6
  849. punpcklqdq m0, m4
  850. punpcklqdq m2, m6
  851. %endif
  852. %endmacro
  853. ; in: 2 rows of 4/8 words in m1..m2
  854. ; out: 4/8 rows of 2 words in %1..%8
  855. ; clobbers: m0, m1
  856. %macro TRANSPOSE8x2W_STORE 8
  857. %if mmsize==8
  858. TRANSPOSE4x2W_STORE %1, %2, %3, %4
  859. %else
  860. punpckhwd m0, m1, m2
  861. punpcklwd m1, m2
  862. movd %5, m0
  863. movd %1, m1
  864. psrldq m1, 4
  865. psrldq m0, 4
  866. movd %2, m1
  867. movd %6, m0
  868. psrldq m1, 4
  869. psrldq m0, 4
  870. movd %3, m1
  871. movd %7, m0
  872. psrldq m1, 4
  873. psrldq m0, 4
  874. movd %4, m1
  875. movd %8, m0
  876. %endif
  877. %endmacro
  878. %macro SBUTTERFLY3 4
  879. punpckh%1 %4, %2, %3
  880. punpckl%1 %2, %3
  881. %endmacro
  882. ; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
  883. ; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
  884. %macro TRANSPOSE6x8_MEM 9
  885. RESET_MM_PERMUTATION
  886. %if cpuflag(avx)
  887. ; input:
  888. ; _ABCDEF_
  889. ; _GHIJKL_
  890. ; _MNOPQR_
  891. ; _STUVWX_
  892. ; _YZabcd_
  893. ; _efghij_
  894. ; _klmnop_
  895. ; _qrstuv_
  896. movh m0, %1
  897. movh m2, %2
  898. movh m1, %3
  899. movh m3, %4
  900. punpcklbw m0, m2 ; __ AG BH CI DJ EK FL __
  901. punpcklbw m1, m3 ; __ MS NT OU PV QW RX __
  902. movh m2, %5
  903. movh m3, %6
  904. punpcklbw m2, m3 ; __ Ye Zf ag bh ci dj __
  905. movh m3, %7
  906. movh m4, %8
  907. punpcklbw m3, m4 ; __ kq lr ms nt ou pv __
  908. SBUTTERFLY wd, 0, 1, 4 ; __ __ AG MS BH NT CI OU
  909. ; DJ PV EK QW FL RX __ __
  910. SBUTTERFLY wd, 2, 3, 4 ; __ __ Ye kq Zf lr ag ms
  911. ; bh nt ci ou dj pv __ __
  912. SBUTTERFLY dq, 0, 2, 4 ; __ __ __ __ AG MS Ye kq
  913. ; BH NT Zf lr CI OU ag ms
  914. SBUTTERFLY dq, 1, 3, 4 ; DJ PV bh nt EK QW ci ou
  915. ; FL RX dj pv __ __ __ __
  916. movhps [%9+0x00], m0
  917. movh [%9+0x10], m2
  918. movhps [%9+0x20], m2
  919. movh [%9+0x30], m1
  920. movhps [%9+0x40], m1
  921. movh [%9+0x50], m3
  922. %else
  923. movq m0, %1
  924. movq m1, %2
  925. movq m2, %3
  926. movq m3, %4
  927. movq m4, %5
  928. movq m5, %6
  929. movq m6, %7
  930. SBUTTERFLY bw, 0, 1, 7
  931. SBUTTERFLY bw, 2, 3, 7
  932. SBUTTERFLY bw, 4, 5, 7
  933. movq [%9+0x10], m3
  934. SBUTTERFLY3 bw, m6, %8, m7
  935. SBUTTERFLY wd, 0, 2, 3
  936. SBUTTERFLY wd, 4, 6, 3
  937. punpckhdq m0, m4
  938. movq [%9+0x00], m0
  939. SBUTTERFLY3 wd, m1, [%9+0x10], m3
  940. SBUTTERFLY wd, 5, 7, 0
  941. SBUTTERFLY dq, 1, 5, 0
  942. SBUTTERFLY dq, 2, 6, 0
  943. punpckldq m3, m7
  944. movq [%9+0x10], m2
  945. movq [%9+0x20], m6
  946. movq [%9+0x30], m1
  947. movq [%9+0x40], m5
  948. movq [%9+0x50], m3
  949. %endif
  950. RESET_MM_PERMUTATION
  951. %endmacro
  952. ; in: 8 rows of 8 in %1..%8
  953. ; out: 8 rows of 8 in %9..%16
  954. %macro TRANSPOSE8x8_MEM 16
  955. RESET_MM_PERMUTATION
  956. %if cpuflag(avx)
  957. movh m0, %1
  958. movh m4, %2
  959. movh m1, %3
  960. movh m5, %4
  961. movh m2, %5
  962. movh m3, %7
  963. punpcklbw m0, m4
  964. punpcklbw m1, m5
  965. movh m4, %6
  966. movh m5, %8
  967. punpcklbw m2, m4
  968. punpcklbw m3, m5
  969. SBUTTERFLY wd, 0, 1, 4
  970. SBUTTERFLY wd, 2, 3, 4
  971. SBUTTERFLY dq, 0, 2, 4
  972. SBUTTERFLY dq, 1, 3, 4
  973. movh %9, m0
  974. movhps %10, m0
  975. movh %11, m2
  976. movhps %12, m2
  977. movh %13, m1
  978. movhps %14, m1
  979. movh %15, m3
  980. movhps %16, m3
  981. %else
  982. movq m0, %1
  983. movq m1, %2
  984. movq m2, %3
  985. movq m3, %4
  986. movq m4, %5
  987. movq m5, %6
  988. movq m6, %7
  989. SBUTTERFLY bw, 0, 1, 7
  990. SBUTTERFLY bw, 2, 3, 7
  991. SBUTTERFLY bw, 4, 5, 7
  992. SBUTTERFLY3 bw, m6, %8, m7
  993. movq %9, m5
  994. SBUTTERFLY wd, 0, 2, 5
  995. SBUTTERFLY wd, 4, 6, 5
  996. SBUTTERFLY wd, 1, 3, 5
  997. movq %11, m6
  998. movq m6, %9
  999. SBUTTERFLY wd, 6, 7, 5
  1000. SBUTTERFLY dq, 0, 4, 5
  1001. SBUTTERFLY dq, 1, 6, 5
  1002. movq %9, m0
  1003. movq %10, m4
  1004. movq %13, m1
  1005. movq %14, m6
  1006. SBUTTERFLY3 dq, m2, %11, m0
  1007. SBUTTERFLY dq, 3, 7, 4
  1008. movq %11, m2
  1009. movq %12, m0
  1010. movq %15, m3
  1011. movq %16, m7
  1012. %endif
  1013. RESET_MM_PERMUTATION
  1014. %endmacro
  1015. ; out: %4 = |%1-%2|>%3
  1016. ; clobbers: %5
  1017. %macro DIFF_GT 5
  1018. %if avx_enabled == 0
  1019. mova %5, %2
  1020. mova %4, %1
  1021. psubusb %5, %1
  1022. psubusb %4, %2
  1023. %else
  1024. psubusb %5, %2, %1
  1025. psubusb %4, %1, %2
  1026. %endif
  1027. por %4, %5
  1028. psubusb %4, %3
  1029. %endmacro
  1030. ; out: %4 = 0xFF where |%1-%2| <= %3 (the trailing pcmpeqb inverts the > test)
  1031. ; clobbers: %5
  1032. %macro DIFF_GT2 5-6
  1033. %if %0<6
  1034. psubusb %4, %1, %2
  1035. psubusb %5, %2, %1
  1036. %else
  1037. mova %4, %1
  1038. mova %5, %2
  1039. psubusb %4, %2
  1040. psubusb %5, %1
  1041. %endif
  1042. psubusb %5, %3
  1043. psubusb %4, %3
  1044. pcmpeqb %4, %5
  1045. %endmacro
  1046. ; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha %2=beta
  1047. ; out: m5=beta-1, m7=mask, %3=alpha-1
  1048. ; clobbers: m4,m6
  1049. %macro LOAD_MASK 2-3
  1050. %if cpuflag(ssse3)
  1051. movd m4, %1
  1052. movd m5, %2
  1053. pxor m6, m6
  1054. pshufb m4, m6
  1055. pshufb m5, m6
  1056. %else
  1057. movd m4, %1
  1058. movd m5, %2
  1059. punpcklbw m4, m4
  1060. punpcklbw m5, m5
  1061. SPLATW m4, m4
  1062. SPLATW m5, m5
  1063. %endif
  1064. mova m6, [pb_1]
  1065. psubusb m4, m6 ; alpha - 1
  1066. psubusb m5, m6 ; beta - 1
  1067. %if %0>2
  1068. mova %3, m4
  1069. %endif
  1070. DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
  1071. DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
  1072. por m7, m4
  1073. DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
  1074. por m7, m4
  1075. pxor m6, m6
  1076. pcmpeqb m7, m6
  1077. %endmacro
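; Byte-domain counterpart of the 16-bit LOAD_MASK above: working with alpha-1
; and beta-1 lets each "x < thresh" test be a saturating subtraction (zero
; exactly when x <= thresh-1), so after OR-ing the three results the final
; pcmpeqb against zero is set only where all of
;   |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta
; hold.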
  1078. ; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
  1079. ; out: m1=p0' m2=q0'
  1080. ; clobbers: m0,3-6
  1081. %macro DEBLOCK_P0_Q0 0
  1082. pxor m5, m1, m2 ; p0^q0
  1083. pand m5, [pb_1] ; (p0^q0)&1
  1084. pcmpeqb m4, m4
  1085. pxor m3, m4
  1086. pavgb m3, m0 ; (p1 - q1 + 256)>>1
  1087. pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
  1088. pxor m4, m1
  1089. pavgb m4, m2 ; (q0 - p0 + 256)>>1
  1090. pavgb m3, m5
  1091. paddusb m3, m4 ; d+128+33
  1092. mova m6, [pb_a1]
  1093. psubusb m6, m3
  1094. psubusb m3, [pb_a1]
  1095. pminub m6, m7
  1096. pminub m3, m7
  1097. psubusb m1, m6
  1098. psubusb m2, m3
  1099. paddusb m1, m3
  1100. paddusb m2, m6
  1101. %endmacro
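; 8-bit variant of DEBLOCK_P0_Q0: the delta is built with biased byte averages
; (pavgb) instead of widening to words, ending up as d+128+33 in m3, then split
; into a positive and a negative part, each clipped to tc with pminub, and
; applied with saturating adds/subs (which also give the final 0..255 clip).
; Rough scalar equivalent, for illustration only:
;   d   = Clip3( -tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3 )
;   p0' = Clip1( p0 + d ),  q0' = Clip1( q0 - d )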
  1102. ; in: m1=p0 m2=q0
  1103. ; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
  1104. ; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
  1105. ; clobbers: q2, tmp, tc0
  1106. %macro LUMA_Q1 6
  1107. pavgb %6, m1, m2
  1108. pavgb %2, %6 ; avg(p2,avg(p0,q0))
  1109. pxor %6, %3
  1110. pand %6, [pb_1] ; (p2^avg(p0,q0))&1
  1111. psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
  1112. psubusb %6, %1, %5
  1113. paddusb %5, %1
  1114. pmaxub %2, %6
  1115. pminub %2, %5
  1116. mova %4, %2
  1117. %endmacro
  1118. %if ARCH_X86_64
  1119. ;-----------------------------------------------------------------------------
  1120. ; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
  1121. ;-----------------------------------------------------------------------------
  1122. %macro DEBLOCK_LUMA 0
  1123. cglobal deblock_v_luma, 5,5,10
  1124. movd m8, [r4] ; tc0
  1125. lea r4, [r1*3]
  1126. neg r4
  1127. add r4, r0 ; pix-3*stride
  1128. mova m0, [r4+r1] ; p1
  1129. mova m1, [r4+2*r1] ; p0
  1130. mova m2, [r0] ; q0
  1131. mova m3, [r0+r1] ; q1
  1132. LOAD_MASK r2d, r3d
  1133. %if cpuflag(avx)
  1134. pshufb m8, [pb_unpackbd1]
  1135. pblendvb m9, m7, m6, m8
  1136. %else
  1137. punpcklbw m8, m8
  1138. punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
  1139. pcmpeqb m9, m9
  1140. pcmpeqb m9, m8
  1141. pandn m9, m7
  1142. %endif
  1143. pand m8, m9
  1144. mova m3, [r4] ; p2
  1145. DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
  1146. pand m6, m9
  1147. psubb m7, m8, m6 ; tc++
  1148. pand m6, m8
  1149. LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
  1150. mova m4, [r0+2*r1] ; q2
  1151. DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
  1152. pand m6, m9
  1153. pand m8, m6
  1154. psubb m7, m6
  1155. mova m3, [r0+r1]
  1156. LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
  1157. DEBLOCK_P0_Q0
  1158. mova [r4+2*r1], m1
  1159. mova [r0], m2
  1160. RET
  1161. ;-----------------------------------------------------------------------------
  1162. ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
  1163. ;-----------------------------------------------------------------------------
  1164. %if cpuflag(avx)
  1165. INIT_XMM cpuname
  1166. %else
  1167. INIT_MMX cpuname
  1168. %endif
  1169. cglobal deblock_h_luma, 5,9,0,0x60+16*WIN64
  1170. lea r8, [r1*3]
  1171. lea r6, [r0-4]
  1172. lea r5, [r0-4+r8]
  1173. %xdefine pix_tmp rsp+0x30*WIN64 ; shadow space + r4
  1174. ; transpose 6x16 -> tmp space
  1175. TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp
  1176. lea r6, [r6+r1*8]
  1177. lea r5, [r5+r1*8]
  1178. TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8
  1179. ; vertical filter
  1180. ; alpha, beta, tc0 are still in r2d, r3d, r4
  1181. ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
  1182. mov r7, r1
  1183. lea r0, [pix_tmp+0x30]
  1184. mov r1d, 0x10
  1185. %if WIN64
  1186. mov [rsp+0x20], r4
  1187. %endif
  1188. call deblock_v_luma
  1189. ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
  1190. add r6, 2
  1191. add r5, 2
  1192. %if cpuflag(sse4)
  1193. mova m0, [pix_tmp+0x10]
  1194. mova m1, [pix_tmp+0x20]
  1195. mova m2, [pix_tmp+0x30]
  1196. mova m3, [pix_tmp+0x40]
  1197. SBUTTERFLY bw, 0, 1, 4
  1198. SBUTTERFLY bw, 2, 3, 4
  1199. SBUTTERFLY wd, 0, 2, 4
  1200. SBUTTERFLY wd, 1, 3, 4
  1201. STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m1, m3
  1202. shl r7, 3
  1203. sub r6, r7
  1204. sub r5, r7
  1205. shr r7, 3
  1206. STORE_8x4B PASS8ROWS(r6, r5, r7, r8), m0, m2
  1207. %else
  1208. movq m0, [pix_tmp+0x18]
  1209. movq m1, [pix_tmp+0x28]
  1210. movq m2, [pix_tmp+0x38]
  1211. movq m3, [pix_tmp+0x48]
  1212. TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
  1213. shl r7, 3
  1214. sub r6, r7
  1215. sub r5, r7
  1216. shr r7, 3
  1217. movq m0, [pix_tmp+0x10]
  1218. movq m1, [pix_tmp+0x20]
  1219. movq m2, [pix_tmp+0x30]
  1220. movq m3, [pix_tmp+0x40]
  1221. TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
  1222. %endif
  1223. RET
  1224. %endmacro
  1225. INIT_XMM sse2
  1226. DEBLOCK_LUMA
  1227. INIT_XMM avx
  1228. DEBLOCK_LUMA
  1229. %else
  1230. %macro DEBLOCK_LUMA 2
  1231. ;-----------------------------------------------------------------------------
  1232. ; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
  1233. ;-----------------------------------------------------------------------------
  1234. cglobal deblock_%1_luma, 5,5,8,2*%2
  1235. lea r4, [r1*3]
  1236. neg r4
  1237. add r4, r0 ; pix-3*stride
  1238. mova m0, [r4+r1] ; p1
  1239. mova m1, [r4+2*r1] ; p0
  1240. mova m2, [r0] ; q0
  1241. mova m3, [r0+r1] ; q1
  1242. LOAD_MASK r2d, r3d
  1243. mov r3, r4mp
  1244. movd m4, [r3] ; tc0
  1245. %if cpuflag(avx)
  1246. pshufb m4, [pb_unpackbd1]
  1247. mova [esp+%2], m4 ; tc
  1248. pblendvb m4, m7, m6, m4
  1249. %else
  1250. punpcklbw m4, m4
  1251. punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
  1252. mova [esp+%2], m4 ; tc
  1253. pcmpeqb m3, m3
  1254. pcmpgtb m4, m3
  1255. pand m4, m7
  1256. %endif
  1257. mova [esp], m4 ; mask
  1258. mova m3, [r4] ; p2
  1259. DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
  1260. pand m6, m4
  1261. pand m4, [esp+%2] ; tc
  1262. psubb m7, m4, m6
  1263. pand m6, m4
  1264. LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
  1265. mova m4, [r0+2*r1] ; q2
  1266. DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
  1267. mova m5, [esp] ; mask
  1268. pand m6, m5
  1269. mova m5, [esp+%2] ; tc
  1270. pand m5, m6
  1271. psubb m7, m6
  1272. mova m3, [r0+r1]
  1273. LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
  1274. DEBLOCK_P0_Q0
  1275. mova [r4+2*r1], m1
  1276. mova [r0], m2
  1277. RET
  1278. ;-----------------------------------------------------------------------------
  1279. ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
  1280. ;-----------------------------------------------------------------------------
  1281. %if cpuflag(avx)
  1282. INIT_XMM cpuname
  1283. %else
  1284. INIT_MMX cpuname
  1285. %endif
  1286. cglobal deblock_h_luma, 1,5,8,0x60+12
  1287. mov r3, r1m
  1288. lea r4, [r3*3]
  1289. sub r0, 4
  1290. lea r1, [r0+r4]
  1291. %define pix_tmp esp+12
  1292. ; esp is intentionally misaligned to make it aligned after pushing the arguments for deblock_%1_luma.
  1293. ; transpose 6x16 -> tmp space
  1294. TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
  1295. lea r0, [r0+r3*8]
  1296. lea r1, [r1+r3*8]
  1297. TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
  1298. ; vertical filter
  1299. lea r0, [pix_tmp+0x30]
  1300. PUSH dword r4m
  1301. PUSH dword r3m
  1302. PUSH dword r2m
  1303. PUSH dword 16
  1304. PUSH dword r0
  1305. call deblock_%1_luma
  1306. %ifidn %1, v8
  1307. add dword [esp ], 8 ; pix_tmp+0x38
  1308. add dword [esp+16], 2 ; tc0+2
  1309. call deblock_%1_luma
  1310. %endif
  1311. ADD esp, 20
  1312. ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
  1313. mov r0, r0mp
  1314. sub r0, 2
  1315. lea r1, [r0+r4]
  1316. %if cpuflag(avx)
  1317. mova m0, [pix_tmp+0x10]
  1318. mova m1, [pix_tmp+0x20]
  1319. mova m2, [pix_tmp+0x30]
  1320. mova m3, [pix_tmp+0x40]
  1321. SBUTTERFLY bw, 0, 1, 4
  1322. SBUTTERFLY bw, 2, 3, 4
  1323. SBUTTERFLY wd, 0, 2, 4
  1324. SBUTTERFLY wd, 1, 3, 4
  1325. STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m0, m2
  1326. lea r0, [r0+r3*8]
  1327. lea r1, [r1+r3*8]
  1328. STORE_8x4B PASS8ROWS(r0, r1, r3, r4), m1, m3
  1329. %else
  1330. movq m0, [pix_tmp+0x10]
  1331. movq m1, [pix_tmp+0x20]
  1332. movq m2, [pix_tmp+0x30]
  1333. movq m3, [pix_tmp+0x40]
  1334. TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
  1335. lea r0, [r0+r3*8]
  1336. lea r1, [r1+r3*8]
  1337. movq m0, [pix_tmp+0x18]
  1338. movq m1, [pix_tmp+0x28]
  1339. movq m2, [pix_tmp+0x38]
  1340. movq m3, [pix_tmp+0x48]
  1341. TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
  1342. %endif
  1343. RET
  1344. %endmacro ; DEBLOCK_LUMA
  1345. INIT_MMX mmx2
  1346. DEBLOCK_LUMA v8, 8
  1347. INIT_XMM sse2
  1348. DEBLOCK_LUMA v, 16
  1349. INIT_XMM avx
  1350. DEBLOCK_LUMA v, 16
  1351. %endif ; ARCH
  1352. %macro LUMA_INTRA_P012 4 ; p0..p3 in memory
  1353. %if ARCH_X86_64
  1354. pavgb t0, p2, p1
  1355. pavgb t1, p0, q0
  1356. %else
  1357. mova t0, p2
  1358. mova t1, p0
  1359. pavgb t0, p1
  1360. pavgb t1, q0
  1361. %endif
  1362. pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
  1363. mova t5, t1
  1364. %if ARCH_X86_64
  1365. paddb t2, p2, p1
  1366. paddb t3, p0, q0
  1367. %else
  1368. mova t2, p2
  1369. mova t3, p0
  1370. paddb t2, p1
  1371. paddb t3, q0
  1372. %endif
  1373. paddb t2, t3
  1374. mova t3, t2
  1375. mova t4, t2
  1376. psrlw t2, 1
  1377. pavgb t2, mpb_0
  1378. pxor t2, t0
  1379. pand t2, mpb_1
  1380. psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
  1381. %if ARCH_X86_64
  1382. pavgb t1, p2, q1
  1383. psubb t2, p2, q1
  1384. %else
  1385. mova t1, p2
  1386. mova t2, p2
  1387. pavgb t1, q1
  1388. psubb t2, q1
  1389. %endif
  1390. paddb t3, t3
  1391. psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
  1392. pand t2, mpb_1
  1393. psubb t1, t2
  1394. pavgb t1, p1
  1395. pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
  1396. psrlw t3, 2
  1397. pavgb t3, mpb_0
  1398. pxor t3, t1
  1399. pand t3, mpb_1
  1400. psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
  1401. pxor t3, p0, q1
  1402. pavgb t2, p0, q1
  1403. pand t3, mpb_1
  1404. psubb t2, t3
  1405. pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4
  1406. pxor t1, t2
  1407. pxor t2, p0
  1408. pand t1, mask1p
  1409. pand t2, mask0
  1410. pxor t1, t2
  1411. pxor t1, p0
  1412. mova %1, t1 ; store p0
  1413. mova t1, %4 ; p3
  1414. paddb t2, t1, p2
  1415. pavgb t1, p2
  1416. pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
  1417. paddb t2, t2
  1418. paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
  1419. psrlw t2, 2
  1420. pavgb t2, mpb_0
  1421. pxor t2, t1
  1422. pand t2, mpb_1
  1423. psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
  1424. pxor t0, p1
  1425. pxor t1, p2
  1426. pand t0, mask1p
  1427. pand t1, mask1p
  1428. pxor t0, p1
  1429. pxor t1, p2
  1430. mova %2, t0 ; store p1
  1431. mova %3, t1 ; store p2
  1432. %endmacro
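; Byte-domain version of the strong (intra) filter: the same p0'/p1'/p2'
; outputs as the 16-bit LUMA_INTRA_P012 above, but the sums are formed with
; rounded pavgb averages, the dropped low bit is patched back via the
; (x^y)&mpb_1 correction terms, and the same xor/and blend against
; mask0/mask1p picks between the strong and fallback p0 results.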
  1433. %macro LUMA_INTRA_SWAP_PQ 0
  1434. %define q1 m0
  1435. %define q0 m1
  1436. %define p0 m2
  1437. %define p1 m3
  1438. %define p2 q2
  1439. %define mask1p mask1q
  1440. %endmacro
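; Renames the p*/q* and mask defines so the q side of the edge can be filtered
; by running the same LUMA_INTRA_P012 body a second time.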
  1441. %macro DEBLOCK_LUMA_INTRA 1
  1442. %define p1 m0
  1443. %define p0 m1
  1444. %define q0 m2
  1445. %define q1 m3
  1446. %define t0 m4
  1447. %define t1 m5
  1448. %define t2 m6
  1449. %define t3 m7
  1450. %if ARCH_X86_64
  1451. %define p2 m8
  1452. %define q2 m9
  1453. %define t4 m10
  1454. %define t5 m11
  1455. %define mask0 m12
  1456. %define mask1p m13
  1457. %if WIN64
  1458. %define mask1q [rsp]
  1459. %else
  1460. %define mask1q [rsp-24]
  1461. %endif
  1462. %define mpb_0 m14
  1463. %define mpb_1 m15
  1464. %else
  1465. %define spill(x) [esp+16*x]
  1466. %define p2 [r4+r1]
  1467. %define q2 [r0+2*r1]
  1468. %define t4 spill(0)
  1469. %define t5 spill(1)
  1470. %define mask0 spill(2)
  1471. %define mask1p spill(3)
  1472. %define mask1q spill(4)
  1473. %define mpb_0 [pb_0]
  1474. %define mpb_1 [pb_1]
  1475. %endif
  1476. ;-----------------------------------------------------------------------------
  1477. ; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
  1478. ;-----------------------------------------------------------------------------
  1479. cglobal deblock_%1_luma_intra, 4,6,16,0-(1-ARCH_X86_64)*0x50-WIN64*0x10
  1480. lea r4, [r1*4]
  1481. lea r5, [r1*3] ; 3*stride
  1482. neg r4
  1483. add r4, r0 ; pix-4*stride
  1484. mova p1, [r4+2*r1]
  1485. mova p0, [r4+r5]
  1486. mova q0, [r0]
  1487. mova q1, [r0+r1]
  1488. %if ARCH_X86_64
  1489. pxor mpb_0, mpb_0
  1490. mova mpb_1, [pb_1]
  1491. LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
  1492. SWAP 7, 12 ; m12=mask0
  1493. pavgb t5, mpb_0
  1494. pavgb t5, mpb_1 ; alpha/4+1
  1495. movdqa p2, [r4+r1]
  1496. movdqa q2, [r0+2*r1]
  1497. DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
  1498. DIFF_GT2 p0, p2, m5, t2, t5, 1 ; mask1 = |p2-p0| > beta-1
  1499. DIFF_GT2 q0, q2, m5, t4, t5, 1 ; t4 = |q2-q0| > beta-1
  1500. pand t0, mask0
  1501. pand t4, t0
  1502. pand t2, t0
  1503. mova mask1q, t4
  1504. mova mask1p, t2
  1505. %else
  1506. LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
  1507. mova m4, t5
  1508. mova mask0, m7
  1509. pavgb m4, [pb_0]
  1510. pavgb m4, [pb_1] ; alpha/4+1
  1511. DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
  1512. pand m6, mask0
  1513. DIFF_GT2 p0, p2, m5, m4, m7, 1 ; m4 = |p2-p0| > beta-1
  1514. pand m4, m6
  1515. mova mask1p, m4
  1516. DIFF_GT2 q0, q2, m5, m4, m7, 1 ; m4 = |q2-q0| > beta-1
  1517. pand m4, m6
  1518. mova mask1q, m4
  1519. %endif
  1520. LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
  1521. LUMA_INTRA_SWAP_PQ
  1522. LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
  1523. .end:
  1524. REP_RET
  1525. %if cpuflag(avx)
  1526. INIT_XMM cpuname
  1527. %else
  1528. INIT_MMX cpuname
  1529. %endif
  1530. %if ARCH_X86_64
  1531. ;-----------------------------------------------------------------------------
  1532. ; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
  1533. ;-----------------------------------------------------------------------------
  1534. cglobal deblock_h_luma_intra, 4,9,0,0x80
  1535. lea r8, [r1*3]
  1536. lea r6, [r0-4]
  1537. lea r5, [r0-4+r8]
  1538. %if WIN64
  1539. %define pix_tmp rsp+0x20 ; shadow space
  1540. %else
  1541. %define pix_tmp rsp
  1542. %endif
  1543. ; transpose 8x16 -> tmp space
  1544. TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
  1545. lea r6, [r6+r1*8]
  1546. lea r5, [r5+r1*8]
  1547. TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
  1548. mov r7, r1
  1549. lea r0, [pix_tmp+0x40]
  1550. mov r1, 0x10
  1551. call deblock_v_luma_intra
  1552. ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
  1553. lea r5, [r6+r8]
  1554. TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
  1555. shl r7, 3
  1556. sub r6, r7
  1557. sub r5, r7
  1558. shr r7, 3
  1559. TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
  1560. RET
  1561. %else
  1562. cglobal deblock_h_luma_intra, 2,4,8,0x80
  1563. lea r3, [r1*3]
  1564. sub r0, 4
  1565. lea r2, [r0+r3]
  1566. %define pix_tmp rsp
  1567. ; transpose 8x16 -> tmp space
  1568. TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
  1569. lea r0, [r0+r1*8]
  1570. lea r2, [r2+r1*8]
  1571. TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
  1572. lea r0, [pix_tmp+0x40]
  1573. PUSH dword r3m
  1574. PUSH dword r2m
  1575. PUSH dword 16
  1576. PUSH r0
  1577. call deblock_%1_luma_intra
  1578. %ifidn %1, v8
  1579. add dword [rsp], 8 ; pix_tmp+8
  1580. call deblock_%1_luma_intra
  1581. %endif
  1582. ADD esp, 16
  1583. mov r1, r1m
  1584. mov r0, r0mp
  1585. lea r3, [r1*3]
  1586. sub r0, 4
  1587. lea r2, [r0+r3]
  1588. ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
  1589. TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
  1590. lea r0, [r0+r1*8]
  1591. lea r2, [r2+r1*8]
  1592. TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
  1593. RET
  1594. %endif ; ARCH_X86_64
  1595. %endmacro ; DEBLOCK_LUMA_INTRA
  1596. INIT_XMM sse2
  1597. DEBLOCK_LUMA_INTRA v
  1598. INIT_XMM avx
  1599. DEBLOCK_LUMA_INTRA v
  1600. %if ARCH_X86_64 == 0
  1601. INIT_MMX mmx2
  1602. DEBLOCK_LUMA_INTRA v8
  1603. %endif
  1604. %endif ; !HIGH_BIT_DEPTH
  1605. %if HIGH_BIT_DEPTH
  1606. ; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
  1607. ; out: %1=p0', %2=q0'
  1608. %macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
  1609. mova %6, [pw_2]
  1610. paddw %6, %3
  1611. paddw %6, %4
  1612. paddw %7, %6, %2
  1613. paddw %6, %1
  1614. paddw %6, %3
  1615. paddw %7, %4
  1616. psraw %6, 2
  1617. psraw %7, 2
  1618. psubw %6, %1
  1619. psubw %7, %2
  1620. pand %6, %5
  1621. pand %7, %5
  1622. paddw %1, %6
  1623. paddw %2, %7
  1624. %endmacro
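; Scalar reference for the chroma intra filter computed above (illustration):
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2
; applied only where the mask (%5) from LOAD_MASK is set.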
  1625. ; out: m0-m3
  1626. ; clobbers: m4-m7
%macro CHROMA_H_LOAD 0-1
    movq m0, [r0-8]    ; p1 p1 p0 p0
    movq m2, [r0]      ; q0 q0 q1 q1
    movq m5, [r0+r1-8]
    movq m7, [r0+r1]
%if mmsize == 8
    mova m1, m0
    mova m3, m2
    punpckldq m0, m5   ; p1
    punpckhdq m1, m5   ; p0
    punpckldq m2, m7   ; q0
    punpckhdq m3, m7   ; q1
%else
    movq m4, [r0+r1*2-8]
    movq m6, [r0+r1*2]
    movq m1, [r0+%1-8]
    movq m3, [r0+%1]
    punpckldq m0, m5   ; p1 ... p0 ...
    punpckldq m2, m7   ; q0 ... q1 ...
    punpckldq m4, m1
    punpckldq m6, m3
    punpckhqdq m1, m0, m4 ; p0
    punpcklqdq m0, m4     ; p1
    punpckhqdq m3, m2, m6 ; q1
    punpcklqdq m2, m6     ; q0
%endif
%endmacro

%macro CHROMA_V_LOAD 1
    mova m0, [r0]    ; p1
    mova m1, [r0+r1] ; p0
    mova m2, [%1]    ; q0
    mova m3, [%1+r1] ; q1
%endmacro

; clobbers: m1, m2, m3
%macro CHROMA_H_STORE 0-1
    SBUTTERFLY dq, 1, 2, 3
%if mmsize == 8
    movq [r0-4], m1
    movq [r0+r1-4], m2
%else
    movq [r0-4], m1
    movq [r0+r1*2-4], m2
    movhps [r0+r1-4], m1
    movhps [r0+%1-4], m2
%endif
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

%macro DEBLOCK_CHROMA 0
cglobal deblock_inter_body
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor m4, m4
    LOAD_TC m6, r4
    pmaxsw m6, m4
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,7,8
    FIX_STRIDES r1
    mov r5, r0
    sub r0, r1
    sub r0, r1
    mov r6, 32/mmsize
.loop:
    CHROMA_V_LOAD r5
    call deblock_inter_body
    CHROMA_V_STORE
    add r0, mmsize
    add r5, mmsize
    add r4, mmsize/8
    dec r6
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
    add r1, r1
    mov r5, 32/mmsize
%if mmsize == 16
    lea r6, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r6
    call deblock_inter_body
    CHROMA_H_STORE r6
    lea r0, [r0+r1*(mmsize/4)]
    add r4, mmsize/8
    dec r5
    jg .loop
    RET

cglobal deblock_intra_body
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8
    add r1, r1
    mov r5, 32/mmsize
    movd m5, r3d
    mov r4, r0
    sub r0, r1
    sub r0, r1
    SPLATW m5, m5
.loop:
    CHROMA_V_LOAD r4
    call deblock_intra_body
    CHROMA_V_STORE
    add r0, mmsize
    add r4, mmsize
    dec r5
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
    add r1, r1
    mov r4, 32/mmsize
%if mmsize == 16
    lea r5, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r5
    call deblock_intra_body
    CHROMA_H_STORE r5
    lea r0, [r0+r1*(mmsize/4)]
    dec r4
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
    add r1, r1
%if mmsize == 8
    mov r4, 16/mmsize
.loop:
%else
    lea r5, [r1*3]
%endif
    CHROMA_H_LOAD r5
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r5
%if mmsize == 8
    lea r0, [r0+r1*(mmsize/4)]
    dec r4
    jg .loop
%endif
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_mbaff, 5,7,8
    add r1, r1
    lea r6, [r1*3]
%if mmsize == 8
    mov r5, 16/mmsize
.loop:
%endif
    CHROMA_H_LOAD r6
    LOAD_AB m4, m5, r2d, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
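    ; sign-extend the int8 tc0 values to words: punpcklbw doubles each byte,
    ; psraw 8 completes the sign extension, and punpcklwd pairs each tc value
    ; with two samples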
    movd m6, [r4]
    punpcklbw m6, m6
    psraw m6, 8
    punpcklwd m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r6
%if mmsize == 8
    lea r0, [r0+r1*(mmsize/4)]
    add r4, mmsize/4
    dec r5
    jg .loop
%endif
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
    add r1, r1
    mov r4, 64/mmsize
%if mmsize == 16
    lea r5, [r1*3]
%endif
.loop:
    CHROMA_H_LOAD r5
    call deblock_intra_body
    CHROMA_H_STORE r5
    lea r0, [r0+r1*(mmsize/4)]
    dec r4
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
    add r1, r1
    mov r5, 64/mmsize
    lea r6, [r1*3]
.loop:
    CHROMA_H_LOAD r6
    LOAD_AB m4, m5, r2m, r3d
    LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor m4, m4
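    ; tc0 for this row pair: loading from [r4-1] puts the wanted byte in the
    ; high half of word 0, so psraw 8 sign-extends it without a punpcklbw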
    movd m6, [r4-1]
    psraw m6, 8
    SPLATW m6, m6
    pmaxsw m6, m4
    pand m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_H_STORE r6
    lea r0, [r0+r1*(mmsize/4)]
%if mmsize == 16
    inc r4
%else
    mov r2, r5
    and r2, 1
    add r4, r2 ; increment once every 2 iterations
%endif
    dec r5
    jg .loop
    RET
%endmacro ; DEBLOCK_CHROMA

%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
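; CHROMA_V_START points t5 two rows above the edge, so [t5]/[t5+r1] are the p1/p0
; rows and [r0]/[r0+r1] are q0/q1; CHROMA_H_START points t5 at pix-4 and r0 three
; rows further down for the transposed loads. The mmx versions loop over two
; 8-byte halves via the r0m counter.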
%macro CHROMA_V_START 0
    mov t5, r0
    sub t5, r1
    sub t5, r1
%if mmsize==8
    mov dword r0m, 2
.loop:
%endif
%endmacro

%macro CHROMA_H_START 0
    sub r0, 4
    lea t6, [r1*3]
    mov t5, r0
    add r0, t6
%endmacro

%macro CHROMA_V_LOOP 1
%if mmsize==8
    add r0, 8
    add t5, 8
%if %1
    add r4, 2
%endif
    dec dword r0m
    jg .loop
%endif
%endmacro

%macro CHROMA_H_LOOP 1
%if mmsize==8
    lea r0, [r0+r1*4]
    lea t5, [t5+r1*4]
%if %1
    add r4, 2
%endif
    dec dword r0m
    jg .loop
%endif
%endmacro

%define t5 r5
%define t6 r6

%macro DEBLOCK_CHROMA 0
cglobal chroma_inter_body
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
    punpcklbw m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    ret

;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,6,8
    CHROMA_V_START
    mova m0, [t5]
    mova m1, [t5+r1]
    mova m2, [r0]
    mova m3, [r0+r1]
    call chroma_inter_body
    mova [t5+r1], m1
    mova [r0], m2
    CHROMA_V_LOOP 1
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
    CHROMA_H_START
%if mmsize==8
    mov dword r0m, 2
.loop:
%endif
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call chroma_inter_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    CHROMA_H_LOOP 1
    RET
%endmacro ; DEBLOCK_CHROMA

INIT_XMM sse2
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif

;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
cglobal deblock_h_chroma_mbaff, 5,7,8
    CHROMA_H_START
    TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_H_CHROMA_420_MBAFF
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_H_CHROMA_420_MBAFF
%endif

%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,8,8
%if ARCH_X86_64
%define cntr r7
%else
%define cntr dword r0m
%endif
    CHROMA_H_START
    mov cntr, 32/mmsize
.loop:
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    LOAD_MASK r2d, r3d
    movd m6, [r4] ; tc0
    punpcklbw m6, m6
%if mmsize == 16
    punpcklbw m6, m6
    punpcklbw m6, m6
%else
    pshufw m6, m6, q0000
%endif
    pand m7, m6
    DEBLOCK_P0_Q0
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    lea r0, [r0+r1*(mmsize/2)]
    lea t5, [t5+r1*(mmsize/2)]
    add r4, mmsize/8
    dec cntr
    jg .loop
    RET
%endmacro

INIT_MMX mmx2
DEBLOCK_H_CHROMA_422
INIT_XMM sse2
DEBLOCK_H_CHROMA_422
INIT_XMM avx
DEBLOCK_H_CHROMA_422

; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
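; pavgb rounds up, so avg(p0,q1) - ((p0^q1)&1) gives the truncating (p0+q1)>>1;
; feeding that into a second pavgb with p1 yields exactly (p0 + q1 + 2*p1 + 2) >> 2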
%macro CHROMA_INTRA_P0 3
    pxor m4, %1, %3
    pand m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb %1, %3
    psubusb %1, m4
    pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

%define t5 r4
%define t6 r5

%macro DEBLOCK_CHROMA_INTRA_BODY 0
cglobal chroma_intra_body
    LOAD_MASK r2d, r3d
    mova m5, m1
    mova m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb m1, m5
    psubb m2, m6
    pand m1, m7
    pand m2, m7
    paddb m1, m5
    paddb m2, m6
    ret
%endmacro

%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,5,8
    CHROMA_V_START
    mova m0, [t5]
    mova m1, [t5+r1]
    mova m2, [r0]
    mova m3, [r0+r1]
    call chroma_intra_body
    mova [t5+r1], m1
    mova [r0], m2
    CHROMA_V_LOOP 0
    RET

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
    CHROMA_H_START
%if mmsize==8
    mov dword r0m, 2
.loop:
%endif
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call chroma_intra_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    CHROMA_H_LOOP 0
    RET

cglobal deblock_h_chroma_422_intra, 4,7,8
    CHROMA_H_START
    mov r6d, 32/mmsize
.loop:
    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call chroma_intra_body
    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    lea r0, [r0+r1*(mmsize/2)]
    lea t5, [t5+r1*(mmsize/2)]
    dec r6d
    jg .loop
    RET
%endmacro ; DEBLOCK_CHROMA_INTRA

INIT_XMM sse2
DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_XMM avx
DEBLOCK_CHROMA_INTRA_BODY
DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA_BODY
%if ARCH_X86_64 == 0
DEBLOCK_CHROMA_INTRA
%endif

;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
    CHROMA_H_START
    TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
    call chroma_intra_body
    TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
    RET
%endif ; !HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
;                               uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------
%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
%define mv r2+scan8start*4
%define bs0 r3
%define bs1 r3+32
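; Per 4x4 edge, the code below produces bs = 2 if either adjacent block has
; nonzero coefficients, else 1 if the blocks differ in reference or their mv
; delta reaches the limit, else 0. In DEBLOCK_STRENGTH_XMM, m4/bs0 accumulates
; the comparisons against the left neighbours (vertical edges, stored
; transposed) and m5/bs1 those against the top neighbours.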
%macro LOAD_BYTES_XMM 2 ; src, aligned
%if %2
    mova m2, [%1-4]
    mova m1, [%1+12]
%else
    movu m2, [%1-4]
    movu m1, [%1+12]
%endif
    psllq m0, m2, 8
    shufps m2, m1, q3131 ; cur nnz, all rows
    psllq m1, 8
    shufps m0, m1, q3131 ; left neighbors
%if cpuflag(avx) || (%2 && cpuflag(ssse3))
    palignr m1, m2, [%1-20], 12
%else
    pslldq m1, m2, 4
    movd m3, [%1-8]
    por m1, m3 ; top neighbors
%endif
%endmacro

%if UNIX64
    DECLARE_REG_TMP 5
%else
    DECLARE_REG_TMP 4
%endif

%macro DEBLOCK_STRENGTH_XMM 0
cglobal deblock_strength, 5,5,7
    ; Prepare mv comparison register
    shl r4d, 8
    add r4d, 3 - (1<<8)
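    ; r4d now holds ((mvy_limit-1)<<8) | 3: once the mv deltas are packed to
    ; bytes (x in the low lane, y in the high lane of each pair), a single
    ; psubusb against the splatted word leaves nonzero bytes exactly where
    ; |dx| >= 4 (one full pel in quarter-pel units) or |dy| >= mvy_limit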
    movd m6, r4d
    movifnidn t0d, r5m
    SPLATW m6, m6
    pxor m4, m4 ; bs0
    pxor m5, m5 ; bs1
.lists:
    ; Check refs
    LOAD_BYTES_XMM ref, 0
    pxor m0, m2
    pxor m1, m2
    por m4, m0
    por m5, m1

    ; Check mvs
%if cpuflag(ssse3) && notcpuflag(avx)
    mova m0, [mv+4*8*0]
    mova m1, [mv+4*8*1]
    palignr m3, m0, [mv+4*8*0-16], 12
    palignr m2, m1, [mv+4*8*1-16], 12
    psubw m0, m3
    psubw m1, m2
    packsswb m0, m1
    mova m2, [mv+4*8*2]
    mova m1, [mv+4*8*3]
    palignr m3, m2, [mv+4*8*2-16], 12
    psubw m2, m3
    palignr m3, m1, [mv+4*8*3-16], 12
    psubw m1, m3
    packsswb m2, m1
%else
    movu m0, [mv-4+4*8*0]
    movu m1, [mv-4+4*8*1]
    movu m2, [mv-4+4*8*2]
    movu m3, [mv-4+4*8*3]
    psubw m0, [mv+4*8*0]
    psubw m1, [mv+4*8*1]
    psubw m2, [mv+4*8*2]
    psubw m3, [mv+4*8*3]
    packsswb m0, m1
    packsswb m2, m3
%endif
    ABSB m0, m1
    ABSB m2, m3
    psubusb m0, m6
    psubusb m2, m6
    packsswb m0, m2
    por m4, m0
    mova m0, [mv+4*8*-1]
    mova m1, [mv+4*8* 0]
    mova m2, [mv+4*8* 1]
    mova m3, [mv+4*8* 2]
    psubw m0, m1
    psubw m1, m2
    psubw m2, m3
    psubw m3, [mv+4*8* 3]
    packsswb m0, m1
    packsswb m2, m3
    ABSB m0, m1
    ABSB m2, m3
    psubusb m0, m6
    psubusb m2, m6
    packsswb m0, m2
    por m5, m0
    add r1, 40
    add r2, 4*8*5
    dec t0d
    jge .lists

    ; Check nnz
    LOAD_BYTES_XMM nnz, 1
    por m0, m2
    por m1, m2
    mova m6, [pb_1]
    pminub m0, m6
    pminub m1, m6
    pminub m4, m6 ; mv ? 1 : 0
    pminub m5, m6
    paddb m0, m0 ; nnz ? 2 : 0
    paddb m1, m1
    pmaxub m4, m0
    pmaxub m5, m1
%if cpuflag(ssse3)
    pshufb m4, [transpose_shuf]
%else
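    ; no SSSE3: emulate the pshufb with a 4x4 byte transpose built from two
    ; interleave passes, giving bs0 the same layout as the transpose_shuf path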
    movhlps m3, m4
    punpcklbw m4, m3
    movhlps m3, m4
    punpcklbw m4, m3
%endif
    mova [bs1], m5
    mova [bs0], m4
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_STRENGTH_XMM
INIT_XMM ssse3
DEBLOCK_STRENGTH_XMM
INIT_XMM avx
DEBLOCK_STRENGTH_XMM

%macro LOAD_BYTES_YMM 1
    movu m0, [%1-4]         ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
    pshufb m0, m6           ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
    vpermq m1, m0, q3131    ; FGHI KLMN PQRS UVWX x2
    vpbroadcastd m2, [%1-8] ; ABCD ....
    vpblendd m0, m0, m2, 0x80
    vpermd m0, m7, m0       ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
%endmacro

INIT_YMM avx2
cglobal deblock_strength, 5,5,8
    mova m6, [load_bytes_ymm_shuf]
    ; Prepare mv comparison register
    shl r4d, 8
    add r4d, 3 - (1<<8)
    movd xm5, r4d
    movifnidn t0d, r5m
    vpbroadcastw m5, xm5
    psrld m7, m6, 4
    pxor m4, m4 ; bs0,bs1
.lists:
    ; Check refs
    LOAD_BYTES_YMM ref
    pxor m0, m1
    por m4, m0

    ; Check mvs
    movu xm0, [mv+0*4*8-4]
    vinserti128 m0, m0, [mv-1*4*8 ], 1
    vbroadcasti128 m2, [mv+0*4*8 ]
    vinserti128 m1, m2, [mv+1*4*8-4], 0
    psubw m0, m2
    vbroadcasti128 m2, [mv+1*4*8 ]
    psubw m1, m2
    packsswb m0, m1
    vinserti128 m1, m2, [mv+2*4*8-4], 0
    vbroadcasti128 m3, [mv+2*4*8 ]
    vinserti128 m2, m3, [mv+3*4*8-4], 0
    psubw m1, m3
    vbroadcasti128 m3, [mv+3*4*8 ]
    psubw m2, m3
    packsswb m1, m2
    pabsb m0, m0
    pabsb m1, m1
    psubusb m0, m5
    psubusb m1, m5
    packsswb m0, m1
    por m4, m0
    add r1, 40
    add r2, 4*8*5
    dec t0d
    jge .lists

    ; Check nnz
    LOAD_BYTES_YMM nnz
    mova m2, [pb_1]
    por m0, m1
    pminub m0, m2
    pminub m4, m2 ; mv ? 1 : 0
    paddb m0, m0 ; nnz ? 2 : 0
    pmaxub m0, m4
    vextracti128 [bs1], m0, 1
    pshufb xm0, [transpose_shuf]
    mova [bs0], xm0
    RET

%macro LOAD_BYTES_ZMM 1
    vpermd m1, m6, [%1-12]
    pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX
%endmacro         ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX

INIT_ZMM avx512
cglobal deblock_strength, 5,5
    mova m6, [load_bytes_zmm_shuf]
    shl r4d, 8
    add r4d, 3 - (1<<8)
    vpbroadcastw m5, r4d
    mov r4d, 0x34cc34cc ; {1,-1} * 11001100b
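    ; dual-purpose constant: as signed bytes it is a repeating {-0x34,+0x34}
    ; pattern, so the pmaddubsw below turns each byte pair of neighbouring refs
    ; into 0x34*(difference), nonzero iff the references differ; its low 8 bits
    ; (11001100b) also serve as the k1 merge mask for the mv gathers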
    kmovb k1, r4d
    vpbroadcastd m4, r4d
    movifnidn t0d, r5m
    psrld m7, m6, 4
    pxor xm3, xm3
.lists:
    vbroadcasti64x2 m2, [mv+32]
    vinserti64x2 m0, m2, [mv-32], 2
    vbroadcasti64x2 m1, [mv+ 0]
    vinserti64x2 m0, m0, [mv- 4], 0
    vbroadcasti64x2 m1 {k1}, [mv+64]
    vinserti64x2 m0, m0, [mv+60], 1
    psubw m0, m1
    vinserti64x2 m1, m1, [mv+28], 0
    vbroadcasti64x2 m2 {k1}, [mv+96]
    vinserti64x2 m1, m1, [mv+92], 1
    psubw m1, m2
    packsswb m0, m1
    pabsb m0, m0
    psubusb m0, m5
    LOAD_BYTES_ZMM ref
    pmaddubsw m1, m4 ; E-F F-G G-H H-I ...
    vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1
    add r1, 40
    add r2, 4*8*5
    dec t0d
    jge .lists

    LOAD_BYTES_ZMM nnz
    mova ym2, [pb_1]
    vptestmw k1, m1, m1
    vptestmw k2, m3, m3
    vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 2 : 0
    vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0
    vextracti128 [bs1], ym0, 1
    pshufb xm0, [transpose_shuf]
    mova [bs0], xm0
    RET