  1. ;*****************************************************************************
  2. ;* mc-a.asm: x86 motion compensation
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2003-2018 x264 project
  5. ;*
  6. ;* Authors: Loren Merritt <lorenm@u.washington.edu>
  7. ;* Fiona Glaser <fiona@x264.com>
  8. ;* Laurent Aimar <fenrir@via.ecp.fr>
  9. ;* Dylan Yudaken <dyudaken@gmail.com>
  10. ;* Holger Lubitz <holger@lubitz.org>
  11. ;* Min Chen <chenm001@163.com>
  12. ;* Oskar Arvidsson <oskar@irock.se>
  13. ;*
  14. ;* This program is free software; you can redistribute it and/or modify
  15. ;* it under the terms of the GNU General Public License as published by
  16. ;* the Free Software Foundation; either version 2 of the License, or
  17. ;* (at your option) any later version.
  18. ;*
  19. ;* This program is distributed in the hope that it will be useful,
  20. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  22. ;* GNU General Public License for more details.
  23. ;*
  24. ;* You should have received a copy of the GNU General Public License
  25. ;* along with this program; if not, write to the Free Software
  26. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  27. ;*
  28. ;* This program is also available under a commercial proprietary license.
  29. ;* For more information, contact us at licensing@x264.com.
  30. ;*****************************************************************************
  31. %include "x86inc.asm"
  32. %include "x86util.asm"
  33. SECTION_RODATA 32
  34. ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
  35. ch_shuf_adj: times 8 db 0
  36. times 8 db 2
  37. times 8 db 4
  38. times 8 db 6
  39. sq_1: times 1 dq 1
  40. SECTION .text
  41. cextern pb_0
  42. cextern pw_1
  43. cextern pw_4
  44. cextern pw_8
  45. cextern pw_32
  46. cextern pw_64
  47. cextern pw_512
  48. cextern pw_00ff
  49. cextern pw_pixel_max
  50. cextern sw_64
  51. cextern pd_32
  52. cextern deinterleave_shufd
  53. ;=============================================================================
  54. ; implicit weighted biprediction
  55. ;=============================================================================
  56. ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
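; (For reference, a rough sketch of what this computes per pixel:
;    dst[x] = clip( ( src1[x]*i_weight + src2[x]*(64-i_weight) + 32 ) >> 6 )
;  with i_weight taken from r6m and applied to the first source.)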
  57. %if WIN64
  58. DECLARE_REG_TMP 0,1,2,3,4,5,4,5
  59. %macro AVG_START 0-1 0
  60. PROLOGUE 6,7,%1
  61. %endmacro
  62. %elif UNIX64
  63. DECLARE_REG_TMP 0,1,2,3,4,5,7,8
  64. %macro AVG_START 0-1 0
  65. PROLOGUE 6,9,%1
  66. %endmacro
  67. %else
  68. DECLARE_REG_TMP 1,2,3,4,5,6,1,2
  69. %macro AVG_START 0-1 0
  70. PROLOGUE 0,7,%1
  71. mov t0, r0m
  72. mov t1, r1m
  73. mov t2, r2m
  74. mov t3, r3m
  75. mov t4, r4m
  76. mov t5, r5m
  77. %endmacro
  78. %endif
  79. %macro AVG_END 0-1 2 ; rows
  80. lea t2, [t2+t3*2*SIZEOF_PIXEL]
  81. lea t4, [t4+t5*2*SIZEOF_PIXEL]
  82. lea t0, [t0+t1*2*SIZEOF_PIXEL]
  83. sub eax, %1
  84. jg .height_loop
  85. RET
  86. %endmacro
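; Register roles used by the AVG_* helpers: t0/t1 = dst/dst_stride,
; t2/t3 = src1/src1_stride, t4/t5 = src2/src2_stride; eax holds the remaining
; row count, decremented two (or four) rows at a time by AVG_END.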
  87. %if HIGH_BIT_DEPTH
  88. %macro BIWEIGHT_MMX 2
  89. movh m0, %1
  90. movh m1, %2
  91. punpcklwd m0, m1
  92. pmaddwd m0, m3
  93. paddd m0, m4
  94. psrad m0, 6
  95. %endmacro
  96. %macro BIWEIGHT_START_MMX 0
  97. movzx t6d, word r6m
  98. mov t7d, 64
  99. sub t7d, t6d
  100. shl t7d, 16
  101. add t6d, t7d
  102. movd m3, t6d
  103. SPLATD m3, m3
  104. mova m4, [pd_32]
  105. pxor m5, m5
  106. %endmacro
  107. %else ;!HIGH_BIT_DEPTH
  108. %macro BIWEIGHT_MMX 2
  109. movh m0, %1
  110. movh m1, %2
  111. punpcklbw m0, m5
  112. punpcklbw m1, m5
  113. pmullw m0, m2
  114. pmullw m1, m3
  115. paddw m0, m1
  116. paddw m0, m4
  117. psraw m0, 6
  118. %endmacro
  119. %macro BIWEIGHT_START_MMX 0
  120. movd m2, r6m
  121. SPLATW m2, m2 ; weight_dst
  122. mova m3, [pw_64]
  123. psubw m3, m2 ; weight_src
  124. mova m4, [pw_32] ; rounding
  125. pxor m5, m5
  126. %endmacro
  127. %endif ;HIGH_BIT_DEPTH
  128. %macro BIWEIGHT_SSSE3 2
  129. movh m0, %1
  130. movh m1, %2
  131. punpcklbw m0, m1
  132. pmaddubsw m0, m3
  133. pmulhrsw m0, m4
  134. %endmacro
  135. %macro BIWEIGHT_START_SSSE3 0
  136. movzx t6d, byte r6m ; FIXME x86_64
  137. %if mmsize > 16
  138. vbroadcasti128 m4, [pw_512]
  139. %else
  140. mova m4, [pw_512]
  141. %endif
  142. lea t7d, [t6+(64<<8)]
  143. shl t6d, 8
  144. sub t7d, t6d
  145. %if cpuflag(avx512)
  146. vpbroadcastw m3, t7d
  147. %else
  148. movd xm3, t7d
  149. %if cpuflag(avx2)
  150. vpbroadcastw m3, xm3
  151. %else
  152. SPLATW m3, m3 ; weight_dst,src
  153. %endif
  154. %endif
  155. %endmacro
  156. %if HIGH_BIT_DEPTH
  157. %macro BIWEIGHT_ROW 4
  158. BIWEIGHT [%2], [%3]
  159. %if %4==mmsize/4
  160. packssdw m0, m0
  161. CLIPW m0, m5, m7
  162. movh [%1], m0
  163. %else
  164. SWAP 0, 6
  165. BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
  166. packssdw m6, m0
  167. CLIPW m6, m5, m7
  168. mova [%1], m6
  169. %endif
  170. %endmacro
  171. %else ;!HIGH_BIT_DEPTH
  172. %macro BIWEIGHT_ROW 4
  173. BIWEIGHT [%2], [%3]
  174. %if %4==mmsize/2
  175. packuswb m0, m0
  176. movh [%1], m0
  177. %else
  178. SWAP 0, 6
  179. BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
  180. packuswb m6, m0
  181. mova [%1], m6
  182. %endif
  183. %endmacro
  184. %endif ;HIGH_BIT_DEPTH
  185. ;-----------------------------------------------------------------------------
  186. ; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
  187. ;-----------------------------------------------------------------------------
  188. %macro AVG_WEIGHT 1-2 0
  189. cglobal pixel_avg_weight_w%1
  190. BIWEIGHT_START
  191. AVG_START %2
  192. %if HIGH_BIT_DEPTH
  193. mova m7, [pw_pixel_max]
  194. %endif
  195. .height_loop:
  196. %if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
  197. BIWEIGHT [t2], [t4]
  198. SWAP 0, 6
  199. BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
  200. %if HIGH_BIT_DEPTH
  201. packssdw m6, m0
  202. CLIPW m6, m5, m7
  203. %else ;!HIGH_BIT_DEPTH
  204. packuswb m6, m0
  205. %endif ;HIGH_BIT_DEPTH
  206. movlps [t0], m6
  207. movhps [t0+SIZEOF_PIXEL*t1], m6
  208. %else
  209. %assign x 0
  210. %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
  211. BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
  212. BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
  213. %assign x x+mmsize
  214. %endrep
  215. %endif
  216. AVG_END
  217. %endmacro
  218. %define BIWEIGHT BIWEIGHT_MMX
  219. %define BIWEIGHT_START BIWEIGHT_START_MMX
  220. INIT_MMX mmx2
  221. AVG_WEIGHT 4
  222. AVG_WEIGHT 8
  223. AVG_WEIGHT 16
  224. %if HIGH_BIT_DEPTH
  225. INIT_XMM sse2
  226. AVG_WEIGHT 4, 8
  227. AVG_WEIGHT 8, 8
  228. AVG_WEIGHT 16, 8
  229. %else ;!HIGH_BIT_DEPTH
  230. INIT_XMM sse2
  231. AVG_WEIGHT 8, 7
  232. AVG_WEIGHT 16, 7
  233. %define BIWEIGHT BIWEIGHT_SSSE3
  234. %define BIWEIGHT_START BIWEIGHT_START_SSSE3
  235. INIT_MMX ssse3
  236. AVG_WEIGHT 4
  237. INIT_XMM ssse3
  238. AVG_WEIGHT 8, 7
  239. AVG_WEIGHT 16, 7
  240. INIT_YMM avx2
  241. cglobal pixel_avg_weight_w16
  242. BIWEIGHT_START
  243. AVG_START 5
  244. .height_loop:
  245. movu xm0, [t2]
  246. movu xm1, [t4]
  247. vinserti128 m0, m0, [t2+t3], 1
  248. vinserti128 m1, m1, [t4+t5], 1
  249. SBUTTERFLY bw, 0, 1, 2
  250. pmaddubsw m0, m3
  251. pmaddubsw m1, m3
  252. pmulhrsw m0, m4
  253. pmulhrsw m1, m4
  254. packuswb m0, m1
  255. mova [t0], xm0
  256. vextracti128 [t0+t1], m0, 1
  257. AVG_END
  258. INIT_YMM avx512
  259. cglobal pixel_avg_weight_w8
  260. BIWEIGHT_START
  261. kxnorb k1, k1, k1
  262. kaddb k1, k1, k1
  263. AVG_START 5
  264. .height_loop:
  265. movq xm0, [t2]
  266. movq xm2, [t4]
  267. movq xm1, [t2+t3]
  268. movq xm5, [t4+t5]
  269. lea t2, [t2+t3*2]
  270. lea t4, [t4+t5*2]
  271. vpbroadcastq m0 {k1}, [t2]
  272. vpbroadcastq m2 {k1}, [t4]
  273. vpbroadcastq m1 {k1}, [t2+t3]
  274. vpbroadcastq m5 {k1}, [t4+t5]
  275. punpcklbw m0, m2
  276. punpcklbw m1, m5
  277. pmaddubsw m0, m3
  278. pmaddubsw m1, m3
  279. pmulhrsw m0, m4
  280. pmulhrsw m1, m4
  281. packuswb m0, m1
  282. vextracti128 xmm1, m0, 1
  283. movq [t0], xm0
  284. movhps [t0+t1], xm0
  285. lea t0, [t0+t1*2]
  286. movq [t0], xmm1
  287. movhps [t0+t1], xmm1
  288. AVG_END 4
  289. INIT_ZMM avx512
  290. cglobal pixel_avg_weight_w16
  291. BIWEIGHT_START
  292. AVG_START 5
  293. .height_loop:
  294. movu xm0, [t2]
  295. movu xm1, [t4]
  296. vinserti128 ym0, [t2+t3], 1
  297. vinserti128 ym1, [t4+t5], 1
  298. lea t2, [t2+t3*2]
  299. lea t4, [t4+t5*2]
  300. vinserti32x4 m0, [t2], 2
  301. vinserti32x4 m1, [t4], 2
  302. vinserti32x4 m0, [t2+t3], 3
  303. vinserti32x4 m1, [t4+t5], 3
  304. SBUTTERFLY bw, 0, 1, 2
  305. pmaddubsw m0, m3
  306. pmaddubsw m1, m3
  307. pmulhrsw m0, m4
  308. pmulhrsw m1, m4
  309. packuswb m0, m1
  310. mova [t0], xm0
  311. vextracti128 [t0+t1], ym0, 1
  312. lea t0, [t0+t1*2]
  313. vextracti32x4 [t0], m0, 2
  314. vextracti32x4 [t0+t1], m0, 3
  315. AVG_END 4
  316. %endif ;HIGH_BIT_DEPTH
  317. ;=============================================================================
  318. ; P frame explicit weighted prediction
  319. ;=============================================================================
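; (Reference semantics, roughly:
;    dst[x] = clip( ((src[x]*scale + (1 << (denom-1))) >> denom) + offset )
;  with scale, offset and denom coming from the weight_t cache passed in r4.)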
  320. %if HIGH_BIT_DEPTH
  321. ; width
  322. %macro WEIGHT_START 1
  323. mova m0, [r4+ 0] ; 1<<denom
  324. mova m3, [r4+16]
  325. movd m2, [r4+32] ; denom
  326. mova m4, [pw_pixel_max]
  327. paddw m2, [sq_1] ; denom+1
  328. %endmacro
  329. ; src1, src2
  330. %macro WEIGHT 2
  331. movh m5, [%1]
  332. movh m6, [%2]
  333. punpcklwd m5, m0
  334. punpcklwd m6, m0
  335. pmaddwd m5, m3
  336. pmaddwd m6, m3
  337. psrad m5, m2
  338. psrad m6, m2
  339. packssdw m5, m6
  340. %endmacro
  341. ; src, dst, width
  342. %macro WEIGHT_TWO_ROW 4
  343. %assign x 0
  344. %rep (%3+mmsize/2-1)/(mmsize/2)
  345. %if %3-x/2 <= 4 && mmsize == 16
  346. WEIGHT %1+x, %1+r3+x
  347. CLIPW m5, [pb_0], m4
  348. movh [%2+x], m5
  349. movhps [%2+r1+x], m5
  350. %else
  351. WEIGHT %1+x, %1+x+mmsize/2
  352. SWAP 5, 7
  353. WEIGHT %1+r3+x, %1+r3+x+mmsize/2
  354. CLIPW m5, [pb_0], m4
  355. CLIPW m7, [pb_0], m4
  356. mova [%2+x], m7
  357. mova [%2+r1+x], m5
  358. %endif
  359. %assign x x+mmsize
  360. %endrep
  361. %endmacro
  362. %else ; !HIGH_BIT_DEPTH
  363. %macro WEIGHT_START 1
  364. %if cpuflag(avx2)
  365. vbroadcasti128 m3, [r4]
  366. vbroadcasti128 m4, [r4+16]
  367. %else
  368. mova m3, [r4]
  369. mova m4, [r4+16]
  370. %if notcpuflag(ssse3)
  371. movd m5, [r4+32]
  372. %endif
  373. %endif
  374. pxor m2, m2
  375. %endmacro
  376. ; src1, src2, dst1, dst2, fast
  377. %macro WEIGHT_ROWx2 5
  378. movh m0, [%1 ]
  379. movh m1, [%1+mmsize/2]
  380. movh m6, [%2 ]
  381. movh m7, [%2+mmsize/2]
  382. punpcklbw m0, m2
  383. punpcklbw m1, m2
  384. punpcklbw m6, m2
  385. punpcklbw m7, m2
  386. %if cpuflag(ssse3)
  387. %if %5==0
  388. psllw m0, 7
  389. psllw m1, 7
  390. psllw m6, 7
  391. psllw m7, 7
  392. %endif
  393. pmulhrsw m0, m3
  394. pmulhrsw m1, m3
  395. pmulhrsw m6, m3
  396. pmulhrsw m7, m3
  397. paddw m0, m4
  398. paddw m1, m4
  399. paddw m6, m4
  400. paddw m7, m4
  401. %else
  402. pmullw m0, m3
  403. pmullw m1, m3
  404. pmullw m6, m3
  405. pmullw m7, m3
  406. paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
  407. paddsw m1, m4
  408. paddsw m6, m4
  409. paddsw m7, m4
  410. psraw m0, m5
  411. psraw m1, m5
  412. psraw m6, m5
  413. psraw m7, m5
  414. %endif
  415. packuswb m0, m1
  416. packuswb m6, m7
  417. mova [%3], m0
  418. mova [%4], m6
  419. %endmacro
  420. ; src1, src2, dst1, dst2, width, fast
  421. %macro WEIGHT_COL 6
  422. %if cpuflag(avx2)
  423. %if %5==16
  424. movu xm0, [%1]
  425. vinserti128 m0, m0, [%2], 1
  426. punpckhbw m1, m0, m2
  427. punpcklbw m0, m0, m2
  428. %if %6==0
  429. psllw m0, 7
  430. psllw m1, 7
  431. %endif
  432. pmulhrsw m0, m3
  433. pmulhrsw m1, m3
  434. paddw m0, m4
  435. paddw m1, m4
  436. packuswb m0, m1
  437. mova [%3], xm0
  438. vextracti128 [%4], m0, 1
  439. %else
  440. movq xm0, [%1]
  441. vinserti128 m0, m0, [%2], 1
  442. punpcklbw m0, m2
  443. %if %6==0
  444. psllw m0, 7
  445. %endif
  446. pmulhrsw m0, m3
  447. paddw m0, m4
  448. packuswb m0, m0
  449. vextracti128 xm1, m0, 1
  450. %if %5 == 8
  451. movq [%3], xm0
  452. movq [%4], xm1
  453. %else
  454. movd [%3], xm0
  455. movd [%4], xm1
  456. %endif
  457. %endif
  458. %else
  459. movh m0, [%1]
  460. movh m1, [%2]
  461. punpcklbw m0, m2
  462. punpcklbw m1, m2
  463. %if cpuflag(ssse3)
  464. %if %6==0
  465. psllw m0, 7
  466. psllw m1, 7
  467. %endif
  468. pmulhrsw m0, m3
  469. pmulhrsw m1, m3
  470. paddw m0, m4
  471. paddw m1, m4
  472. %else
  473. pmullw m0, m3
  474. pmullw m1, m3
  475. paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
  476. paddsw m1, m4
  477. psraw m0, m5
  478. psraw m1, m5
  479. %endif
  480. %if %5 == 8
  481. packuswb m0, m1
  482. movh [%3], m0
  483. movhps [%4], m0
  484. %else
  485. packuswb m0, m0
  486. packuswb m1, m1
  487. movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
  488. movd [%4], m1
  489. %endif
  490. %endif
  491. %endmacro
  492. ; src, dst, width
  493. %macro WEIGHT_TWO_ROW 4
  494. %assign x 0
  495. %rep %3
  496. %if (%3-x) >= mmsize
  497. WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
  498. %assign x (x+mmsize)
  499. %else
  500. %assign w %3-x
  501. %if w == 20
  502. %assign w 16
  503. %endif
  504. WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
  505. %assign x (x+w)
  506. %endif
  507. %if x >= %3
  508. %exitrep
  509. %endif
  510. %endrep
  511. %endmacro
  512. %endif ; HIGH_BIT_DEPTH
  513. ;-----------------------------------------------------------------------------
  514. ;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
  515. ;-----------------------------------------------------------------------------
  516. %macro WEIGHTER 1
  517. cglobal mc_weight_w%1, 6,6,8
  518. FIX_STRIDES r1, r3
  519. WEIGHT_START %1
  520. %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
  521. ; we can merge the shift step into the scale factor
  522. ; if (m3<<7) doesn't overflow an int16_t
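; (pmulhrsw computes (a*b + 0x4000) >> 15, so multiplying by a scale pre-shifted
;  left by 7 is equivalent to a multiply plus a rounded >>8; the weight cache is
;  assumed to hold m3 pre-scaled so the >>denom folds in, with rounding and
;  offset carried by m4.)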
  523. cmp byte [r4+1], 0
  524. jz .fast
  525. %endif
  526. .loop:
  527. WEIGHT_TWO_ROW r2, r0, %1, 0
  528. lea r0, [r0+r1*2]
  529. lea r2, [r2+r3*2]
  530. sub r5d, 2
  531. jg .loop
  532. RET
  533. %if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
  534. .fast:
  535. psllw m3, 7
  536. .fastloop:
  537. WEIGHT_TWO_ROW r2, r0, %1, 1
  538. lea r0, [r0+r1*2]
  539. lea r2, [r2+r3*2]
  540. sub r5d, 2
  541. jg .fastloop
  542. RET
  543. %endif
  544. %endmacro
  545. INIT_MMX mmx2
  546. WEIGHTER 4
  547. WEIGHTER 8
  548. WEIGHTER 12
  549. WEIGHTER 16
  550. WEIGHTER 20
  551. INIT_XMM sse2
  552. WEIGHTER 8
  553. WEIGHTER 16
  554. WEIGHTER 20
  555. %if HIGH_BIT_DEPTH
  556. WEIGHTER 12
  557. %else
  558. INIT_MMX ssse3
  559. WEIGHTER 4
  560. INIT_XMM ssse3
  561. WEIGHTER 8
  562. WEIGHTER 16
  563. WEIGHTER 20
  564. INIT_YMM avx2
  565. WEIGHTER 8
  566. WEIGHTER 16
  567. WEIGHTER 20
  568. %endif
  569. %macro OFFSET_OP 7
  570. mov%6 m0, [%1]
  571. mov%6 m1, [%2]
  572. %if HIGH_BIT_DEPTH
  573. p%5usw m0, m2
  574. p%5usw m1, m2
  575. %ifidn %5,add
  576. pminsw m0, m3
  577. pminsw m1, m3
  578. %endif
  579. %else
  580. p%5usb m0, m2
  581. p%5usb m1, m2
  582. %endif
  583. mov%7 [%3], m0
  584. mov%7 [%4], m1
  585. %endmacro
  586. %macro OFFSET_TWO_ROW 4
  587. %assign x 0
  588. %rep %3
  589. %if (%3*SIZEOF_PIXEL-x) >= mmsize
  590. OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
  591. %assign x (x+mmsize)
  592. %else
  593. %if HIGH_BIT_DEPTH
  594. OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
  595. %else
  596. OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
  597. %endif
  598. %exitrep
  599. %endif
  600. %if x >= %3*SIZEOF_PIXEL
  601. %exitrep
  602. %endif
  603. %endrep
  604. %endmacro
  605. ;-----------------------------------------------------------------------------
  606. ;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
  607. ;-----------------------------------------------------------------------------
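; (These use saturating adds/subs, i.e. roughly dst[x] = clip( src[x] +/- offset ),
;  with the offset vector loaded from the weight_t cache at [r4].)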
  608. %macro OFFSET 2
  609. cglobal mc_offset%2_w%1, 6,6
  610. FIX_STRIDES r1, r3
  611. mova m2, [r4]
  612. %if HIGH_BIT_DEPTH
  613. %ifidn %2,add
  614. mova m3, [pw_pixel_max]
  615. %endif
  616. %endif
  617. .loop:
  618. OFFSET_TWO_ROW r2, r0, %1, %2
  619. lea r0, [r0+r1*2]
  620. lea r2, [r2+r3*2]
  621. sub r5d, 2
  622. jg .loop
  623. RET
  624. %endmacro
  625. %macro OFFSETPN 1
  626. OFFSET %1, add
  627. OFFSET %1, sub
  628. %endmacro
  629. INIT_MMX mmx2
  630. OFFSETPN 4
  631. OFFSETPN 8
  632. OFFSETPN 12
  633. OFFSETPN 16
  634. OFFSETPN 20
  635. INIT_XMM sse2
  636. OFFSETPN 12
  637. OFFSETPN 16
  638. OFFSETPN 20
  639. %if HIGH_BIT_DEPTH
  640. INIT_XMM sse2
  641. OFFSETPN 8
  642. %endif
  643. ;=============================================================================
  644. ; pixel avg
  645. ;=============================================================================
  646. ;-----------------------------------------------------------------------------
  647. ; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
  648. ; pixel *src2, intptr_t src2_stride, int weight );
  649. ;-----------------------------------------------------------------------------
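; A weight of 32 means an even 32/32 split, so AVGH routes those calls to the
; plain pavg-based pixel_avg_wN helpers below and everything else to
; pixel_avg_weight_wN above.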
  650. %macro AVGH 2
  651. cglobal pixel_avg_%1x%2
  652. mov eax, %2
  653. cmp dword r6m, 32
  654. jne pixel_avg_weight_w%1 %+ SUFFIX
  655. %if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
  656. jmp pixel_avg_w%1_avx2
  657. %else
  658. %if mmsize == 16 && %1 == 16
  659. test dword r4m, 15
  660. jz pixel_avg_w%1_sse2
  661. %endif
  662. jmp pixel_avg_w%1_mmx2
  663. %endif
  664. %endmacro
  665. ;-----------------------------------------------------------------------------
  666. ; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
  667. ; pixel *src2, intptr_t src2_stride, int height, int weight );
  668. ;-----------------------------------------------------------------------------
  669. %macro AVG_FUNC 3
  670. cglobal pixel_avg_w%1
  671. AVG_START
  672. .height_loop:
  673. %assign x 0
  674. %rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
  675. %2 m0, [t2+x]
  676. %2 m1, [t2+x+SIZEOF_PIXEL*t3]
  677. %if HIGH_BIT_DEPTH
  678. pavgw m0, [t4+x]
  679. pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
  680. %else ;!HIGH_BIT_DEPTH
  681. pavgb m0, [t4+x]
  682. pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
  683. %endif
  684. %3 [t0+x], m0
  685. %3 [t0+x+SIZEOF_PIXEL*t1], m1
  686. %assign x x+mmsize
  687. %endrep
  688. AVG_END
  689. %endmacro
  690. %if HIGH_BIT_DEPTH
  691. INIT_MMX mmx2
  692. AVG_FUNC 4, movq, movq
  693. AVGH 4, 16
  694. AVGH 4, 8
  695. AVGH 4, 4
  696. AVGH 4, 2
  697. AVG_FUNC 8, movq, movq
  698. AVGH 8, 16
  699. AVGH 8, 8
  700. AVGH 8, 4
  701. AVG_FUNC 16, movq, movq
  702. AVGH 16, 16
  703. AVGH 16, 8
  704. INIT_XMM sse2
  705. AVG_FUNC 4, movq, movq
  706. AVGH 4, 16
  707. AVGH 4, 8
  708. AVGH 4, 4
  709. AVGH 4, 2
  710. AVG_FUNC 8, movdqu, movdqa
  711. AVGH 8, 16
  712. AVGH 8, 8
  713. AVGH 8, 4
  714. AVG_FUNC 16, movdqu, movdqa
  715. AVGH 16, 16
  716. AVGH 16, 8
  717. %else ;!HIGH_BIT_DEPTH
  718. INIT_MMX mmx2
  719. AVG_FUNC 4, movd, movd
  720. AVGH 4, 16
  721. AVGH 4, 8
  722. AVGH 4, 4
  723. AVGH 4, 2
  724. AVG_FUNC 8, movq, movq
  725. AVGH 8, 16
  726. AVGH 8, 8
  727. AVGH 8, 4
  728. AVG_FUNC 16, movq, movq
  729. AVGH 16, 16
  730. AVGH 16, 8
  731. INIT_XMM sse2
  732. AVG_FUNC 16, movdqu, movdqa
  733. AVGH 16, 16
  734. AVGH 16, 8
  735. AVGH 8, 16
  736. AVGH 8, 8
  737. AVGH 8, 4
  738. INIT_XMM ssse3
  739. AVGH 16, 16
  740. AVGH 16, 8
  741. AVGH 8, 16
  742. AVGH 8, 8
  743. AVGH 8, 4
  744. INIT_MMX ssse3
  745. AVGH 4, 16
  746. AVGH 4, 8
  747. AVGH 4, 4
  748. AVGH 4, 2
  749. INIT_XMM avx2
  750. AVG_FUNC 16, movdqu, movdqa
  751. AVGH 16, 16
  752. AVGH 16, 8
  753. INIT_XMM avx512
  754. AVGH 16, 16
  755. AVGH 16, 8
  756. AVGH 8, 16
  757. AVGH 8, 8
  758. AVGH 8, 4
  759. %endif ;HIGH_BIT_DEPTH
  760. ;=============================================================================
  761. ; pixel avg2
  762. ;=============================================================================
  763. %if HIGH_BIT_DEPTH
  764. ;-----------------------------------------------------------------------------
  765. ; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
  766. ; uint16_t *src1, intptr_t src_stride,
  767. ; uint16_t *src2, int height );
  768. ;-----------------------------------------------------------------------------
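; AVG2_W_ONE/AVG2_W_TWO fold the second source into an offset up front (sub r4, r2),
; so [r2+r4] reads the matching src2 pixel while r2 alone walks src1.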
  769. %macro AVG2_W_ONE 1
  770. cglobal pixel_avg2_w%1, 6,7,4
  771. sub r4, r2
  772. lea r6, [r4+r3*2]
  773. .height_loop:
  774. movu m0, [r2]
  775. movu m1, [r2+r3*2]
  776. %if cpuflag(avx) || mmsize == 8
  777. pavgw m0, [r2+r4]
  778. pavgw m1, [r2+r6]
  779. %else
  780. movu m2, [r2+r4]
  781. movu m3, [r2+r6]
  782. pavgw m0, m2
  783. pavgw m1, m3
  784. %endif
  785. mova [r0], m0
  786. mova [r0+r1*2], m1
  787. lea r2, [r2+r3*4]
  788. lea r0, [r0+r1*4]
  789. sub r5d, 2
  790. jg .height_loop
  791. RET
  792. %endmacro
  793. %macro AVG2_W_TWO 3
  794. cglobal pixel_avg2_w%1, 6,7,8
  795. sub r4, r2
  796. lea r6, [r4+r3*2]
  797. .height_loop:
  798. movu m0, [r2]
  799. %2 m1, [r2+mmsize]
  800. movu m2, [r2+r3*2]
  801. %2 m3, [r2+r3*2+mmsize]
  802. %if mmsize == 8
  803. pavgw m0, [r2+r4]
  804. pavgw m1, [r2+r4+mmsize]
  805. pavgw m2, [r2+r6]
  806. pavgw m3, [r2+r6+mmsize]
  807. %else
  808. movu m4, [r2+r4]
  809. %2 m5, [r2+r4+mmsize]
  810. movu m6, [r2+r6]
  811. %2 m7, [r2+r6+mmsize]
  812. pavgw m0, m4
  813. pavgw m1, m5
  814. pavgw m2, m6
  815. pavgw m3, m7
  816. %endif
  817. mova [r0], m0
  818. %3 [r0+mmsize], m1
  819. mova [r0+r1*2], m2
  820. %3 [r0+r1*2+mmsize], m3
  821. lea r2, [r2+r3*4]
  822. lea r0, [r0+r1*4]
  823. sub r5d, 2
  824. jg .height_loop
  825. RET
  826. %endmacro
  827. INIT_MMX mmx2
  828. AVG2_W_ONE 4
  829. AVG2_W_TWO 8, movu, mova
  830. INIT_XMM sse2
  831. AVG2_W_ONE 8
  832. AVG2_W_TWO 10, movd, movd
  833. AVG2_W_TWO 16, movu, mova
  834. INIT_YMM avx2
  835. AVG2_W_ONE 16
  836. INIT_MMX
  837. cglobal pixel_avg2_w10_mmx2, 6,7
  838. sub r4, r2
  839. lea r6, [r4+r3*2]
  840. .height_loop:
  841. movu m0, [r2+ 0]
  842. movu m1, [r2+ 8]
  843. movh m2, [r2+16]
  844. movu m3, [r2+r3*2+ 0]
  845. movu m4, [r2+r3*2+ 8]
  846. movh m5, [r2+r3*2+16]
  847. pavgw m0, [r2+r4+ 0]
  848. pavgw m1, [r2+r4+ 8]
  849. pavgw m2, [r2+r4+16]
  850. pavgw m3, [r2+r6+ 0]
  851. pavgw m4, [r2+r6+ 8]
  852. pavgw m5, [r2+r6+16]
  853. mova [r0+ 0], m0
  854. mova [r0+ 8], m1
  855. movh [r0+16], m2
  856. mova [r0+r1*2+ 0], m3
  857. mova [r0+r1*2+ 8], m4
  858. movh [r0+r1*2+16], m5
  859. lea r2, [r2+r3*2*2]
  860. lea r0, [r0+r1*2*2]
  861. sub r5d, 2
  862. jg .height_loop
  863. RET
  864. cglobal pixel_avg2_w16_mmx2, 6,7
  865. sub r4, r2
  866. lea r6, [r4+r3*2]
  867. .height_loop:
  868. movu m0, [r2+ 0]
  869. movu m1, [r2+ 8]
  870. movu m2, [r2+16]
  871. movu m3, [r2+24]
  872. movu m4, [r2+r3*2+ 0]
  873. movu m5, [r2+r3*2+ 8]
  874. movu m6, [r2+r3*2+16]
  875. movu m7, [r2+r3*2+24]
  876. pavgw m0, [r2+r4+ 0]
  877. pavgw m1, [r2+r4+ 8]
  878. pavgw m2, [r2+r4+16]
  879. pavgw m3, [r2+r4+24]
  880. pavgw m4, [r2+r6+ 0]
  881. pavgw m5, [r2+r6+ 8]
  882. pavgw m6, [r2+r6+16]
  883. pavgw m7, [r2+r6+24]
  884. mova [r0+ 0], m0
  885. mova [r0+ 8], m1
  886. mova [r0+16], m2
  887. mova [r0+24], m3
  888. mova [r0+r1*2+ 0], m4
  889. mova [r0+r1*2+ 8], m5
  890. mova [r0+r1*2+16], m6
  891. mova [r0+r1*2+24], m7
  892. lea r2, [r2+r3*2*2]
  893. lea r0, [r0+r1*2*2]
  894. sub r5d, 2
  895. jg .height_loop
  896. RET
  897. cglobal pixel_avg2_w18_mmx2, 6,7
  898. sub r4, r2
  899. .height_loop:
  900. movu m0, [r2+ 0]
  901. movu m1, [r2+ 8]
  902. movu m2, [r2+16]
  903. movu m3, [r2+24]
  904. movh m4, [r2+32]
  905. pavgw m0, [r2+r4+ 0]
  906. pavgw m1, [r2+r4+ 8]
  907. pavgw m2, [r2+r4+16]
  908. pavgw m3, [r2+r4+24]
  909. pavgw m4, [r2+r4+32]
  910. mova [r0+ 0], m0
  911. mova [r0+ 8], m1
  912. mova [r0+16], m2
  913. mova [r0+24], m3
  914. movh [r0+32], m4
  915. lea r2, [r2+r3*2]
  916. lea r0, [r0+r1*2]
  917. dec r5d
  918. jg .height_loop
  919. RET
  920. %macro PIXEL_AVG_W18 0
  921. cglobal pixel_avg2_w18, 6,7
  922. sub r4, r2
  923. .height_loop:
  924. movu m0, [r2+ 0]
  925. movd xm2, [r2+32]
  926. %if mmsize == 32
  927. pavgw m0, [r2+r4+ 0]
  928. movd xm1, [r2+r4+32]
  929. pavgw xm2, xm1
  930. %else
  931. movu m1, [r2+16]
  932. movu m3, [r2+r4+ 0]
  933. movu m4, [r2+r4+16]
  934. movd m5, [r2+r4+32]
  935. pavgw m0, m3
  936. pavgw m1, m4
  937. pavgw m2, m5
  938. mova [r0+16], m1
  939. %endif
  940. mova [r0+ 0], m0
  941. movd [r0+32], xm2
  942. lea r2, [r2+r3*2]
  943. lea r0, [r0+r1*2]
  944. dec r5d
  945. jg .height_loop
  946. RET
  947. %endmacro
  948. INIT_XMM sse2
  949. PIXEL_AVG_W18
  950. INIT_YMM avx2
  951. PIXEL_AVG_W18
  952. %endif ; HIGH_BIT_DEPTH
  953. %if HIGH_BIT_DEPTH == 0
  954. ;-----------------------------------------------------------------------------
  955. ; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
  956. ; uint8_t *src1, intptr_t src_stride,
  957. ; uint8_t *src2, int height );
  958. ;-----------------------------------------------------------------------------
  959. %macro AVG2_W8 2
  960. cglobal pixel_avg2_w%1_mmx2, 6,7
  961. sub r4, r2
  962. lea r6, [r4+r3]
  963. .height_loop:
  964. %2 mm0, [r2]
  965. %2 mm1, [r2+r3]
  966. pavgb mm0, [r2+r4]
  967. pavgb mm1, [r2+r6]
  968. lea r2, [r2+r3*2]
  969. %2 [r0], mm0
  970. %2 [r0+r1], mm1
  971. lea r0, [r0+r1*2]
  972. sub r5d, 2
  973. jg .height_loop
  974. RET
  975. %endmacro
  976. INIT_MMX
  977. AVG2_W8 4, movd
  978. AVG2_W8 8, movq
  979. %macro AVG2_W16 2
  980. cglobal pixel_avg2_w%1_mmx2, 6,7
  981. sub r2, r4
  982. lea r6, [r2+r3]
  983. .height_loop:
  984. movq mm0, [r4]
  985. %2 mm1, [r4+8]
  986. movq mm2, [r4+r3]
  987. %2 mm3, [r4+r3+8]
  988. pavgb mm0, [r4+r2]
  989. pavgb mm1, [r4+r2+8]
  990. pavgb mm2, [r4+r6]
  991. pavgb mm3, [r4+r6+8]
  992. lea r4, [r4+r3*2]
  993. movq [r0], mm0
  994. %2 [r0+8], mm1
  995. movq [r0+r1], mm2
  996. %2 [r0+r1+8], mm3
  997. lea r0, [r0+r1*2]
  998. sub r5d, 2
  999. jg .height_loop
  1000. RET
  1001. %endmacro
  1002. AVG2_W16 12, movd
  1003. AVG2_W16 16, movq
  1004. cglobal pixel_avg2_w20_mmx2, 6,7
  1005. sub r2, r4
  1006. lea r6, [r2+r3]
  1007. .height_loop:
  1008. movq mm0, [r4]
  1009. movq mm1, [r4+8]
  1010. movd mm2, [r4+16]
  1011. movq mm3, [r4+r3]
  1012. movq mm4, [r4+r3+8]
  1013. movd mm5, [r4+r3+16]
  1014. pavgb mm0, [r4+r2]
  1015. pavgb mm1, [r4+r2+8]
  1016. pavgb mm2, [r4+r2+16]
  1017. pavgb mm3, [r4+r6]
  1018. pavgb mm4, [r4+r6+8]
  1019. pavgb mm5, [r4+r6+16]
  1020. lea r4, [r4+r3*2]
  1021. movq [r0], mm0
  1022. movq [r0+8], mm1
  1023. movd [r0+16], mm2
  1024. movq [r0+r1], mm3
  1025. movq [r0+r1+8], mm4
  1026. movd [r0+r1+16], mm5
  1027. lea r0, [r0+r1*2]
  1028. sub r5d, 2
  1029. jg .height_loop
  1030. RET
  1031. INIT_XMM
  1032. cglobal pixel_avg2_w16_sse2, 6,7
  1033. sub r4, r2
  1034. lea r6, [r4+r3]
  1035. .height_loop:
  1036. movu m0, [r2]
  1037. movu m2, [r2+r3]
  1038. movu m1, [r2+r4]
  1039. movu m3, [r2+r6]
  1040. lea r2, [r2+r3*2]
  1041. pavgb m0, m1
  1042. pavgb m2, m3
  1043. mova [r0], m0
  1044. mova [r0+r1], m2
  1045. lea r0, [r0+r1*2]
  1046. sub r5d, 2
  1047. jg .height_loop
  1048. RET
  1049. cglobal pixel_avg2_w20_sse2, 6,7
  1050. sub r2, r4
  1051. lea r6, [r2+r3]
  1052. .height_loop:
  1053. movu m0, [r4]
  1054. movu m2, [r4+r3]
  1055. movu m1, [r4+r2]
  1056. movu m3, [r4+r6]
  1057. movd mm4, [r4+16]
  1058. movd mm5, [r4+r3+16]
  1059. pavgb m0, m1
  1060. pavgb m2, m3
  1061. pavgb mm4, [r4+r2+16]
  1062. pavgb mm5, [r4+r6+16]
  1063. lea r4, [r4+r3*2]
  1064. mova [r0], m0
  1065. mova [r0+r1], m2
  1066. movd [r0+16], mm4
  1067. movd [r0+r1+16], mm5
  1068. lea r0, [r0+r1*2]
  1069. sub r5d, 2
  1070. jg .height_loop
  1071. RET
  1072. INIT_YMM avx2
  1073. cglobal pixel_avg2_w20, 6,7
  1074. sub r2, r4
  1075. lea r6, [r2+r3]
  1076. .height_loop:
  1077. movu m0, [r4]
  1078. movu m1, [r4+r3]
  1079. pavgb m0, [r4+r2]
  1080. pavgb m1, [r4+r6]
  1081. lea r4, [r4+r3*2]
  1082. mova [r0], m0
  1083. mova [r0+r1], m1
  1084. lea r0, [r0+r1*2]
  1085. sub r5d, 2
  1086. jg .height_loop
  1087. RET
  1088. ; Cacheline split code for processors with high latencies for loads
  1089. ; split over cache lines. See sad-a.asm for a more detailed explanation.
  1090. ; This particular instance is complicated by the fact that src1 and src2
  1091. ; can have different alignments. For simplicity and code size, only the
  1092. ; MMX cacheline workaround is used. As a result, in the case of SSE2
  1093. ; pixel_avg, the cacheline check functions call the SSE2 version if there
  1094. ; is no cacheline split, and the MMX workaround if there is.
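; The workaround itself (AVG_CACHELINE_LOOP) reads the aligned qwords on either
; side of the split and rebuilds the unaligned data with psrlq/psllq/por, using
; shift counts of 8*(addr&7) bits as computed by INIT_SHIFT.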
  1095. %macro INIT_SHIFT 2
  1096. and eax, 7
  1097. shl eax, 3
  1098. movd %1, [sw_64]
  1099. movd %2, eax
  1100. psubw %1, %2
  1101. %endmacro
  1102. %macro AVG_CACHELINE_START 0
  1103. %assign stack_offset 0
  1104. INIT_SHIFT mm6, mm7
  1105. mov eax, r4m
  1106. INIT_SHIFT mm4, mm5
  1107. PROLOGUE 6,6
  1108. and r2, ~7
  1109. and r4, ~7
  1110. sub r4, r2
  1111. .height_loop:
  1112. %endmacro
  1113. %macro AVG_CACHELINE_LOOP 2
  1114. movq mm1, [r2+%1]
  1115. movq mm0, [r2+8+%1]
  1116. movq mm3, [r2+r4+%1]
  1117. movq mm2, [r2+r4+8+%1]
  1118. psrlq mm1, mm7
  1119. psllq mm0, mm6
  1120. psrlq mm3, mm5
  1121. psllq mm2, mm4
  1122. por mm0, mm1
  1123. por mm2, mm3
  1124. pavgb mm2, mm0
  1125. %2 [r0+%1], mm2
  1126. %endmacro
  1127. %macro AVG_CACHELINE_FUNC 2
  1128. pixel_avg2_w%1_cache_mmx2:
  1129. AVG_CACHELINE_START
  1130. AVG_CACHELINE_LOOP 0, movq
  1131. %if %1>8
  1132. AVG_CACHELINE_LOOP 8, movq
  1133. %if %1>16
  1134. AVG_CACHELINE_LOOP 16, movd
  1135. %endif
  1136. %endif
  1137. add r2, r3
  1138. add r0, r1
  1139. dec r5d
  1140. jg .height_loop
  1141. RET
  1142. %endmacro
  1143. %macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
  1144. %if %1 == 12
  1145. ;w12 isn't needed because w16 is just as fast if there's no cacheline split
  1146. %define cachesplit pixel_avg2_w16_cache_mmx2
  1147. %else
  1148. %define cachesplit pixel_avg2_w%1_cache_mmx2
  1149. %endif
  1150. cglobal pixel_avg2_w%1_cache%2_%3
  1151. mov eax, r2m
  1152. and eax, %2-1
  1153. cmp eax, (%2-%1-(%1 % 8))
  1154. %if %1==12||%1==20
  1155. jbe pixel_avg2_w%1_%3
  1156. %else
  1157. jb pixel_avg2_w%1_%3
  1158. %endif
  1159. %if 0 ; or %1==8 - but the extra branch seems too expensive
  1160. ja cachesplit
  1161. %if ARCH_X86_64
  1162. test r4b, 1
  1163. %else
  1164. test byte r4m, 1
  1165. %endif
  1166. jz pixel_avg2_w%1_%3
  1167. %else
  1168. or eax, r4m
  1169. and eax, 7
  1170. jz pixel_avg2_w%1_%3
  1171. mov eax, r2m
  1172. %endif
  1173. %if mmsize==16 || (%1==8 && %2==64)
  1174. AVG_CACHELINE_FUNC %1, %2
  1175. %else
  1176. jmp cachesplit
  1177. %endif
  1178. %endmacro
  1179. INIT_MMX
  1180. AVG_CACHELINE_CHECK 8, 64, mmx2
  1181. AVG_CACHELINE_CHECK 12, 64, mmx2
  1182. %if ARCH_X86_64 == 0
  1183. AVG_CACHELINE_CHECK 16, 64, mmx2
  1184. AVG_CACHELINE_CHECK 20, 64, mmx2
  1185. AVG_CACHELINE_CHECK 8, 32, mmx2
  1186. AVG_CACHELINE_CHECK 12, 32, mmx2
  1187. AVG_CACHELINE_CHECK 16, 32, mmx2
  1188. AVG_CACHELINE_CHECK 20, 32, mmx2
  1189. %endif
  1190. INIT_XMM
  1191. AVG_CACHELINE_CHECK 16, 64, sse2
  1192. AVG_CACHELINE_CHECK 20, 64, sse2
  1193. ; computed jump assumes this loop is exactly 48 bytes
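; Each alignment variant generated below is padded to exactly 48 bytes, so the
; dispatcher in pixel_avg2_w16_cache64_ssse3 can compute its entry point as
; base + 48*index (the lea r6,[r6*3] / shl r6,4 sequence).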
  1194. %macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
  1195. ALIGN 16
  1196. avg_w16_align%1_%2_ssse3:
  1197. %if %1==0 && %2==0
  1198. movdqa xmm1, [r2]
  1199. pavgb xmm1, [r2+r4]
  1200. add r2, r3
  1201. %elif %1==0
  1202. movdqa xmm1, [r2+r4+16]
  1203. palignr xmm1, [r2+r4], %2
  1204. pavgb xmm1, [r2]
  1205. add r2, r3
  1206. %elif %2&15==0
  1207. movdqa xmm1, [r2+16]
  1208. palignr xmm1, [r2], %1
  1209. pavgb xmm1, [r2+r4]
  1210. add r2, r3
  1211. %else
  1212. movdqa xmm1, [r2+16]
  1213. movdqa xmm2, [r2+r4+16]
  1214. palignr xmm1, [r2], %1
  1215. palignr xmm2, [r2+r4], %2&15
  1216. add r2, r3
  1217. pavgb xmm1, xmm2
  1218. %endif
  1219. movdqa [r0], xmm1
  1220. add r0, r1
  1221. dec r5d
  1222. jg avg_w16_align%1_%2_ssse3
  1223. ret
  1224. %if %1==0
  1225. ; make sure the first ones don't end up short
  1226. ALIGN 16
  1227. times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
  1228. %endif
  1229. %endmacro
  1230. cglobal pixel_avg2_w16_cache64_ssse3
  1231. %if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
  1232. mov eax, r2m
  1233. and eax, 0x3f
  1234. cmp eax, 0x30
  1235. jb pixel_avg2_w16_sse2
  1236. or eax, r4m
  1237. and eax, 7
  1238. jz pixel_avg2_w16_sse2
  1239. %endif
  1240. PROLOGUE 6, 8
  1241. lea r6, [r4+r2]
  1242. and r4, ~0xf
  1243. and r6, 0x1f
  1244. and r2, ~0xf
  1245. lea r6, [r6*3] ;(offset + align*2)*3
  1246. sub r4, r2
  1247. shl r6, 4 ;jump = (offset + align*2)*48
  1248. %define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
  1249. %ifdef PIC
  1250. lea r7, [avg_w16_addr]
  1251. add r6, r7
  1252. %else
  1253. lea r6, [avg_w16_addr + r6]
  1254. %endif
  1255. TAIL_CALL r6, 1
  1256. %assign j 0
  1257. %assign k 1
  1258. %rep 16
  1259. AVG16_CACHELINE_LOOP_SSSE3 j, j
  1260. AVG16_CACHELINE_LOOP_SSSE3 j, k
  1261. %assign j j+1
  1262. %assign k k+1
  1263. %endrep
  1264. %endif ; !HIGH_BIT_DEPTH
  1265. ;=============================================================================
  1266. ; pixel copy
  1267. ;=============================================================================
  1268. %macro COPY1 2
  1269. movu m0, [r2]
  1270. movu m1, [r2+r3]
  1271. movu m2, [r2+r3*2]
  1272. movu m3, [r2+%2]
  1273. mova [r0], m0
  1274. mova [r0+r1], m1
  1275. mova [r0+r1*2], m2
  1276. mova [r0+%1], m3
  1277. %endmacro
  1278. %macro COPY2 2-4 0, 1
  1279. movu m0, [r2+%3*mmsize]
  1280. movu m1, [r2+%4*mmsize]
  1281. movu m2, [r2+r3+%3*mmsize]
  1282. movu m3, [r2+r3+%4*mmsize]
  1283. mova [r0+%3*mmsize], m0
  1284. mova [r0+%4*mmsize], m1
  1285. mova [r0+r1+%3*mmsize], m2
  1286. mova [r0+r1+%4*mmsize], m3
  1287. movu m0, [r2+r3*2+%3*mmsize]
  1288. movu m1, [r2+r3*2+%4*mmsize]
  1289. movu m2, [r2+%2+%3*mmsize]
  1290. movu m3, [r2+%2+%4*mmsize]
  1291. mova [r0+r1*2+%3*mmsize], m0
  1292. mova [r0+r1*2+%4*mmsize], m1
  1293. mova [r0+%1+%3*mmsize], m2
  1294. mova [r0+%1+%4*mmsize], m3
  1295. %endmacro
  1296. %macro COPY4 2
  1297. COPY2 %1, %2, 0, 1
  1298. COPY2 %1, %2, 2, 3
  1299. %endmacro
  1300. ;-----------------------------------------------------------------------------
  1301. ; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
  1302. ; uint8_t *src, intptr_t i_src_stride, int i_height )
  1303. ;-----------------------------------------------------------------------------
  1304. INIT_MMX
  1305. cglobal mc_copy_w4_mmx, 4,6
  1306. FIX_STRIDES r1, r3
  1307. cmp dword r4m, 4
  1308. lea r5, [r3*3]
  1309. lea r4, [r1*3]
  1310. je .end
  1311. %if HIGH_BIT_DEPTH == 0
  1312. %define mova movd
  1313. %define movu movd
  1314. %endif
  1315. COPY1 r4, r5
  1316. lea r2, [r2+r3*4]
  1317. lea r0, [r0+r1*4]
  1318. .end:
  1319. COPY1 r4, r5
  1320. RET
  1321. %macro MC_COPY 1
  1322. %assign %%w %1*SIZEOF_PIXEL/mmsize
  1323. %if %%w > 0
  1324. cglobal mc_copy_w%1, 5,7
  1325. FIX_STRIDES r1, r3
  1326. lea r6, [r3*3]
  1327. lea r5, [r1*3]
  1328. .height_loop:
  1329. COPY %+ %%w r5, r6
  1330. lea r2, [r2+r3*4]
  1331. lea r0, [r0+r1*4]
  1332. sub r4d, 4
  1333. jg .height_loop
  1334. RET
  1335. %endif
  1336. %endmacro
  1337. INIT_MMX mmx
  1338. MC_COPY 8
  1339. MC_COPY 16
  1340. INIT_XMM sse
  1341. MC_COPY 8
  1342. MC_COPY 16
  1343. INIT_XMM aligned, sse
  1344. MC_COPY 16
  1345. %if HIGH_BIT_DEPTH
  1346. INIT_YMM avx
  1347. MC_COPY 16
  1348. INIT_YMM aligned, avx
  1349. MC_COPY 16
  1350. %endif
  1351. ;=============================================================================
  1352. ; prefetch
  1353. ;=============================================================================
  1354. ; assumes 64 byte cachelines
  1355. ; FIXME doesn't cover all pixels in high depth and/or 4:4:4
  1356. ;-----------------------------------------------------------------------------
  1357. ; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
  1358. ; pixel *pix_uv, intptr_t stride_uv, int mb_x )
  1359. ;-----------------------------------------------------------------------------
  1360. %macro PREFETCH_FENC 1
  1361. %if ARCH_X86_64
  1362. cglobal prefetch_fenc_%1, 5,5
  1363. FIX_STRIDES r1, r3
  1364. and r4d, 3
  1365. mov eax, r4d
  1366. imul r4d, r1d
  1367. lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
  1368. prefetcht0 [r0]
  1369. prefetcht0 [r0+r1]
  1370. lea r0, [r0+r1*2]
  1371. prefetcht0 [r0]
  1372. prefetcht0 [r0+r1]
  1373. imul eax, r3d
  1374. lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
  1375. prefetcht0 [r2]
  1376. prefetcht0 [r2+r3]
  1377. %ifidn %1, 422
  1378. lea r2, [r2+r3*2]
  1379. prefetcht0 [r2]
  1380. prefetcht0 [r2+r3]
  1381. %endif
  1382. RET
  1383. %else
  1384. cglobal prefetch_fenc_%1, 0,3
  1385. mov r2, r4m
  1386. mov r1, r1m
  1387. mov r0, r0m
  1388. FIX_STRIDES r1
  1389. and r2, 3
  1390. imul r2, r1
  1391. lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
  1392. prefetcht0 [r0]
  1393. prefetcht0 [r0+r1]
  1394. lea r0, [r0+r1*2]
  1395. prefetcht0 [r0]
  1396. prefetcht0 [r0+r1]
  1397. mov r2, r4m
  1398. mov r1, r3m
  1399. mov r0, r2m
  1400. FIX_STRIDES r1
  1401. and r2, 3
  1402. imul r2, r1
  1403. lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
  1404. prefetcht0 [r0]
  1405. prefetcht0 [r0+r1]
  1406. %ifidn %1, 422
  1407. lea r0, [r0+r1*2]
  1408. prefetcht0 [r0]
  1409. prefetcht0 [r0+r1]
  1410. %endif
  1411. ret
  1412. %endif ; ARCH_X86_64
  1413. %endmacro
  1414. INIT_MMX mmx2
  1415. PREFETCH_FENC 420
  1416. PREFETCH_FENC 422
  1417. %if ARCH_X86_64
  1418. DECLARE_REG_TMP 4
  1419. %else
  1420. DECLARE_REG_TMP 2
  1421. %endif
  1422. cglobal prefetch_fenc_400, 2,3
  1423. movifnidn t0d, r4m
  1424. FIX_STRIDES r1
  1425. and t0d, 3
  1426. imul t0d, r1d
  1427. lea r0, [r0+t0*4+64*SIZEOF_PIXEL]
  1428. prefetcht0 [r0]
  1429. prefetcht0 [r0+r1]
  1430. lea r0, [r0+r1*2]
  1431. prefetcht0 [r0]
  1432. prefetcht0 [r0+r1]
  1433. RET
  1434. ;-----------------------------------------------------------------------------
  1435. ; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
  1436. ;-----------------------------------------------------------------------------
  1437. INIT_MMX mmx2
  1438. cglobal prefetch_ref, 3,3
  1439. FIX_STRIDES r1
  1440. dec r2d
  1441. and r2d, r1d
  1442. lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
  1443. lea r2, [r1*3]
  1444. prefetcht0 [r0]
  1445. prefetcht0 [r0+r1]
  1446. prefetcht0 [r0+r1*2]
  1447. prefetcht0 [r0+r2]
  1448. lea r0, [r0+r1*4]
  1449. prefetcht0 [r0]
  1450. prefetcht0 [r0+r1]
  1451. prefetcht0 [r0+r1*2]
  1452. prefetcht0 [r0+r2]
  1453. RET
  1454. ;=============================================================================
  1455. ; chroma MC
  1456. ;=============================================================================
  1457. %if ARCH_X86_64
  1458. DECLARE_REG_TMP 6,7,8
  1459. %else
  1460. DECLARE_REG_TMP 0,1,2
  1461. %endif
  1462. %macro MC_CHROMA_START 1
  1463. %if ARCH_X86_64
  1464. PROLOGUE 0,9,%1
  1465. %else
  1466. PROLOGUE 0,6,%1
  1467. %endif
  1468. movifnidn r3, r3mp
  1469. movifnidn r4d, r4m
  1470. movifnidn r5d, r5m
  1471. movifnidn t0d, r6m
  1472. mov t2d, t0d
  1473. mov t1d, r5d
  1474. sar t0d, 3
  1475. sar t1d, 3
  1476. imul t0d, r4d
  1477. lea t0d, [t0+t1*2]
  1478. FIX_STRIDES t0d
  1479. movsxdifnidn t0, t0d
  1480. add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
  1481. %endmacro
  1482. %if HIGH_BIT_DEPTH
  1483. %macro UNPACK_UNALIGNED 4
  1484. movu %1, [%4+0]
  1485. movu %2, [%4+4]
  1486. punpckhwd %3, %1, %2
  1487. punpcklwd %1, %2
  1488. %if mmsize == 8
  1489. mova %2, %1
  1490. punpcklwd %1, %3
  1491. punpckhwd %2, %3
  1492. %else
  1493. shufps %2, %1, %3, q3131
  1494. shufps %1, %3, q2020
  1495. %endif
  1496. %endmacro
  1497. %else ; !HIGH_BIT_DEPTH
  1498. %macro UNPACK_UNALIGNED 3
  1499. %if mmsize == 8
  1500. punpcklwd %1, %3
  1501. %else
  1502. movh %2, %3
  1503. punpcklwd %1, %2
  1504. %endif
  1505. %endmacro
  1506. %endif ; HIGH_BIT_DEPTH
  1507. ;-----------------------------------------------------------------------------
  1508. ; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
  1509. ; uint8_t *src, intptr_t src_stride,
  1510. ; int dx, int dy,
  1511. ; int width, int height )
  1512. ;-----------------------------------------------------------------------------
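; This is the usual 1/8-pel bilinear chroma filter; per output pixel, roughly
;   dst = ( (8-dx)*(8-dy)*A + dx*(8-dy)*B + (8-dx)*dy*C + dx*dy*D + 32 ) >> 6
; where A..D are the four neighbouring source samples and dx/dy are the low
; three bits of the fractional offsets.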
  1513. %macro MC_CHROMA 0
  1514. cglobal mc_chroma
  1515. MC_CHROMA_START 0
  1516. FIX_STRIDES r4
  1517. and r5d, 7
  1518. %if ARCH_X86_64
  1519. jz .mc1dy
  1520. %endif
  1521. and t2d, 7
  1522. %if ARCH_X86_64
  1523. jz .mc1dx
  1524. %endif
  1525. shl r5d, 16
  1526. add t2d, r5d
  1527. mov t0d, t2d
  1528. shl t2d, 8
  1529. sub t2d, t0d
  1530. add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
  1531. cmp dword r7m, 4
  1532. %if mmsize==8
  1533. .skip_prologue:
  1534. %else
  1535. jl mc_chroma_mmx2 %+ .skip_prologue
  1536. WIN64_SPILL_XMM 9
  1537. %endif
  1538. movd m5, t2d
  1539. movifnidn r0, r0mp
  1540. movifnidn r1, r1mp
  1541. movifnidn r2d, r2m
  1542. movifnidn r5d, r8m
  1543. pxor m6, m6
  1544. punpcklbw m5, m6
  1545. %if mmsize==8
  1546. pshufw m7, m5, q3232
  1547. pshufw m6, m5, q0000
  1548. pshufw m5, m5, q1111
  1549. jge .width4
  1550. %else
  1551. %if WIN64
  1552. cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
  1553. %endif
  1554. pshufd m7, m5, q1111
  1555. punpcklwd m5, m5
  1556. pshufd m6, m5, q0000
  1557. pshufd m5, m5, q1111
  1558. jg .width8
  1559. %endif
  1560. %if HIGH_BIT_DEPTH
  1561. add r2, r2
  1562. UNPACK_UNALIGNED m0, m1, m2, r3
  1563. %else
  1564. movu m0, [r3]
  1565. UNPACK_UNALIGNED m0, m1, [r3+2]
  1566. mova m1, m0
  1567. pand m0, [pw_00ff]
  1568. psrlw m1, 8
  1569. %endif ; HIGH_BIT_DEPTH
  1570. pmaddwd m0, m7
  1571. pmaddwd m1, m7
  1572. packssdw m0, m1
  1573. SWAP 3, 0
  1574. ALIGN 4
  1575. .loop2:
  1576. %if HIGH_BIT_DEPTH
  1577. UNPACK_UNALIGNED m0, m1, m2, r3+r4
  1578. pmullw m3, m6
  1579. %else ; !HIGH_BIT_DEPTH
  1580. movu m0, [r3+r4]
  1581. UNPACK_UNALIGNED m0, m1, [r3+r4+2]
  1582. pmullw m3, m6
  1583. mova m1, m0
  1584. pand m0, [pw_00ff]
  1585. psrlw m1, 8
  1586. %endif ; HIGH_BIT_DEPTH
  1587. pmaddwd m0, m7
  1588. pmaddwd m1, m7
  1589. mova m2, [pw_32]
  1590. packssdw m0, m1
  1591. paddw m2, m3
  1592. mova m3, m0
  1593. pmullw m0, m5
  1594. paddw m0, m2
  1595. psrlw m0, 6
  1596. %if HIGH_BIT_DEPTH
  1597. movh [r0], m0
  1598. %if mmsize == 8
  1599. psrlq m0, 32
  1600. movh [r1], m0
  1601. %else
  1602. movhps [r1], m0
  1603. %endif
  1604. %else ; !HIGH_BIT_DEPTH
  1605. packuswb m0, m0
  1606. movd [r0], m0
  1607. %if mmsize==8
  1608. psrlq m0, 16
  1609. %else
  1610. psrldq m0, 4
  1611. %endif
  1612. movd [r1], m0
  1613. %endif ; HIGH_BIT_DEPTH
  1614. add r3, r4
  1615. add r0, r2
  1616. add r1, r2
  1617. dec r5d
  1618. jg .loop2
  1619. RET
  1620. %if mmsize==8
  1621. .width4:
  1622. %if ARCH_X86_64
  1623. mov t0, r0
  1624. mov t1, r1
  1625. mov t2, r3
  1626. %if WIN64
  1627. %define multy0 r4m
  1628. %else
  1629. %define multy0 [rsp-8]
  1630. %endif
  1631. mova multy0, m5
  1632. %else
  1633. mov r3m, r3
  1634. %define multy0 r4m
  1635. mova multy0, m5
  1636. %endif
  1637. %else
  1638. .width8:
  1639. %if ARCH_X86_64
  1640. %define multy0 m8
  1641. SWAP 8, 5
  1642. %else
  1643. %define multy0 r0m
  1644. mova multy0, m5
  1645. %endif
  1646. %endif
  1647. FIX_STRIDES r2
  1648. .loopx:
  1649. %if HIGH_BIT_DEPTH
  1650. UNPACK_UNALIGNED m0, m2, m4, r3
  1651. UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
  1652. %else
  1653. movu m0, [r3]
  1654. movu m1, [r3+mmsize/2]
  1655. UNPACK_UNALIGNED m0, m2, [r3+2]
  1656. UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
  1657. psrlw m2, m0, 8
  1658. psrlw m3, m1, 8
  1659. pand m0, [pw_00ff]
  1660. pand m1, [pw_00ff]
  1661. %endif
  1662. pmaddwd m0, m7
  1663. pmaddwd m2, m7
  1664. pmaddwd m1, m7
  1665. pmaddwd m3, m7
  1666. packssdw m0, m2
  1667. packssdw m1, m3
  1668. SWAP 4, 0
  1669. SWAP 5, 1
  1670. add r3, r4
  1671. ALIGN 4
  1672. .loop4:
  1673. %if HIGH_BIT_DEPTH
  1674. UNPACK_UNALIGNED m0, m1, m2, r3
  1675. pmaddwd m0, m7
  1676. pmaddwd m1, m7
  1677. packssdw m0, m1
  1678. UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
  1679. pmaddwd m1, m7
  1680. pmaddwd m2, m7
  1681. packssdw m1, m2
  1682. %else ; !HIGH_BIT_DEPTH
  1683. movu m0, [r3]
  1684. movu m1, [r3+mmsize/2]
  1685. UNPACK_UNALIGNED m0, m2, [r3+2]
  1686. UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
  1687. psrlw m2, m0, 8
  1688. psrlw m3, m1, 8
  1689. pand m0, [pw_00ff]
  1690. pand m1, [pw_00ff]
  1691. pmaddwd m0, m7
  1692. pmaddwd m2, m7
  1693. pmaddwd m1, m7
  1694. pmaddwd m3, m7
  1695. packssdw m0, m2
  1696. packssdw m1, m3
  1697. %endif ; HIGH_BIT_DEPTH
  1698. pmullw m4, m6
  1699. pmullw m5, m6
  1700. mova m2, [pw_32]
  1701. paddw m3, m2, m5
  1702. paddw m2, m4
  1703. mova m4, m0
  1704. mova m5, m1
  1705. pmullw m0, multy0
  1706. pmullw m1, multy0
  1707. paddw m0, m2
  1708. paddw m1, m3
  1709. psrlw m0, 6
  1710. psrlw m1, 6
  1711. %if HIGH_BIT_DEPTH
  1712. movh [r0], m0
  1713. movh [r0+mmsize/2], m1
  1714. %if mmsize==8
  1715. psrlq m0, 32
  1716. psrlq m1, 32
  1717. movh [r1], m0
  1718. movh [r1+mmsize/2], m1
  1719. %else
  1720. movhps [r1], m0
  1721. movhps [r1+mmsize/2], m1
  1722. %endif
  1723. %else ; !HIGH_BIT_DEPTH
  1724. packuswb m0, m1
  1725. %if mmsize==8
  1726. pshufw m1, m0, q0020
  1727. pshufw m0, m0, q0031
  1728. movd [r0], m1
  1729. movd [r1], m0
  1730. %else
  1731. pshufd m0, m0, q3120
  1732. movq [r0], m0
  1733. movhps [r1], m0
  1734. %endif
  1735. %endif ; HIGH_BIT_DEPTH
  1736. add r3, r4
  1737. add r0, r2
  1738. add r1, r2
  1739. dec r5d
  1740. jg .loop4
  1741. %if mmsize!=8
  1742. RET
  1743. %else
  1744. sub dword r7m, 4
  1745. jg .width8
  1746. RET
  1747. .width8:
  1748. %if ARCH_X86_64
  1749. lea r3, [t2+8*SIZEOF_PIXEL]
  1750. lea r0, [t0+4*SIZEOF_PIXEL]
  1751. lea r1, [t1+4*SIZEOF_PIXEL]
  1752. %else
  1753. mov r3, r3m
  1754. mov r0, r0m
  1755. mov r1, r1m
  1756. add r3, 8*SIZEOF_PIXEL
  1757. add r0, 4*SIZEOF_PIXEL
  1758. add r1, 4*SIZEOF_PIXEL
  1759. %endif
  1760. mov r5d, r8m
  1761. jmp .loopx
  1762. %endif
  1763. %if ARCH_X86_64 ; too many regs for x86_32
  1764. RESET_MM_PERMUTATION
  1765. %if WIN64
  1766. %assign stack_offset stack_offset - stack_size_padded
  1767. %assign stack_size_padded 0
  1768. %assign xmm_regs_used 0
  1769. %endif
  1770. .mc1dy:
  1771. and t2d, 7
  1772. movd m5, t2d
  1773. mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
  1774. jmp .mc1d
  1775. .mc1dx:
  1776. movd m5, r5d
  1777. mov r6d, 2*SIZEOF_PIXEL
  1778. .mc1d:
  1779. %if HIGH_BIT_DEPTH && mmsize == 16
  1780. WIN64_SPILL_XMM 8
  1781. %endif
  1782. mova m4, [pw_8]
  1783. SPLATW m5, m5
  1784. psubw m4, m5
  1785. movifnidn r0, r0mp
  1786. movifnidn r1, r1mp
  1787. movifnidn r2d, r2m
  1788. FIX_STRIDES r2
  1789. movifnidn r5d, r8m
  1790. cmp dword r7m, 4
  1791. jg .mc1d_w8
  1792. mov r7, r2
  1793. mov r8, r4
  1794. %if mmsize!=8
  1795. shr r5d, 1
  1796. %endif
  1797. .loop1d_w4:
  1798. %if HIGH_BIT_DEPTH
  1799. %if mmsize == 8
  1800. movq m0, [r3+0]
  1801. movq m2, [r3+8]
  1802. movq m1, [r3+r6+0]
  1803. movq m3, [r3+r6+8]
  1804. %else
  1805. movu m0, [r3]
  1806. movu m1, [r3+r6]
  1807. add r3, r8
  1808. movu m2, [r3]
  1809. movu m3, [r3+r6]
  1810. %endif
  1811. SBUTTERFLY wd, 0, 2, 6
  1812. SBUTTERFLY wd, 1, 3, 7
  1813. SBUTTERFLY wd, 0, 2, 6
  1814. SBUTTERFLY wd, 1, 3, 7
  1815. %if mmsize == 16
  1816. SBUTTERFLY wd, 0, 2, 6
  1817. SBUTTERFLY wd, 1, 3, 7
  1818. %endif
  1819. %else ; !HIGH_BIT_DEPTH
  1820. movq m0, [r3]
  1821. movq m1, [r3+r6]
  1822. %if mmsize!=8
  1823. add r3, r8
  1824. movhps m0, [r3]
  1825. movhps m1, [r3+r6]
  1826. %endif
  1827. psrlw m2, m0, 8
  1828. psrlw m3, m1, 8
  1829. pand m0, [pw_00ff]
  1830. pand m1, [pw_00ff]
  1831. %endif ; HIGH_BIT_DEPTH
  1832. pmullw m0, m4
  1833. pmullw m1, m5
  1834. pmullw m2, m4
  1835. pmullw m3, m5
  1836. paddw m0, [pw_4]
  1837. paddw m2, [pw_4]
  1838. paddw m0, m1
  1839. paddw m2, m3
  1840. psrlw m0, 3
  1841. psrlw m2, 3
  1842. %if HIGH_BIT_DEPTH
  1843. %if mmsize == 8
  1844. xchg r4, r8
  1845. xchg r2, r7
  1846. %endif
  1847. movq [r0], m0
  1848. movq [r1], m2
  1849. %if mmsize == 16
  1850. add r0, r7
  1851. add r1, r7
  1852. movhps [r0], m0
  1853. movhps [r1], m2
  1854. %endif
  1855. %else ; !HIGH_BIT_DEPTH
  1856. packuswb m0, m2
  1857. %if mmsize==8
  1858. xchg r4, r8
  1859. xchg r2, r7
  1860. movd [r0], m0
  1861. psrlq m0, 32
  1862. movd [r1], m0
  1863. %else
  1864. movhlps m1, m0
  1865. movd [r0], m0
  1866. movd [r1], m1
  1867. add r0, r7
  1868. add r1, r7
  1869. psrldq m0, 4
  1870. psrldq m1, 4
  1871. movd [r0], m0
  1872. movd [r1], m1
  1873. %endif
  1874. %endif ; HIGH_BIT_DEPTH
  1875. add r3, r4
  1876. add r0, r2
  1877. add r1, r2
  1878. dec r5d
  1879. jg .loop1d_w4
  1880. RET
  1881. .mc1d_w8:
  1882. sub r2, 4*SIZEOF_PIXEL
  1883. sub r4, 8*SIZEOF_PIXEL
  1884. mov r7, 4*SIZEOF_PIXEL
  1885. mov r8, 8*SIZEOF_PIXEL
  1886. %if mmsize==8
  1887. shl r5d, 1
  1888. %endif
  1889. jmp .loop1d_w4
  1890. %endif ; ARCH_X86_64
  1891. %endmacro ; MC_CHROMA
  1892. %macro MC_CHROMA_SSSE3 0
  1893. cglobal mc_chroma
  1894. MC_CHROMA_START 10-cpuflag(avx2)
  1895. and r5d, 7
  1896. and t2d, 7
  1897. mov t0d, r5d
  1898. shl t0d, 8
  1899. sub t0d, r5d
  1900. mov r5d, 8
  1901. add t0d, 8
  1902. sub r5d, t2d
  1903. imul t2d, t0d ; (x*255+8)*y
  1904. imul r5d, t0d ; (x*255+8)*(8-y)
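; ((x*255+8)*y expands to (x*y)<<8 | (8-x)*y, i.e. both horizontal taps for one
;  row packed into a single word, so one pmaddubsw applies the dx and 8-dx
;  weights at once; pmulhrsw by pw_512 later performs the final (v+32)>>6.)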
  1905. movd xm6, t2d
  1906. movd xm7, r5d
  1907. %if cpuflag(cache64)
  1908. mov t0d, r3d
  1909. and t0d, 7
  1910. %ifdef PIC
  1911. lea t1, [ch_shuf_adj]
  1912. movddup xm5, [t1 + t0*4]
  1913. %else
  1914. movddup xm5, [ch_shuf_adj + t0*4]
  1915. %endif
  1916. paddb xm5, [ch_shuf]
  1917. and r3, ~7
  1918. %else
  1919. mova m5, [ch_shuf]
  1920. %endif
  1921. movifnidn r0, r0mp
  1922. movifnidn r1, r1mp
  1923. movifnidn r2d, r2m
  1924. movifnidn r5d, r8m
  1925. %if cpuflag(avx2)
  1926. vpbroadcastw m6, xm6
  1927. vpbroadcastw m7, xm7
  1928. %else
  1929. SPLATW m6, m6
  1930. SPLATW m7, m7
  1931. %endif
  1932. %if ARCH_X86_64
  1933. %define shiftround m8
  1934. mova m8, [pw_512]
  1935. %else
  1936. %define shiftround [pw_512]
  1937. %endif
  1938. cmp dword r7m, 4
  1939. jg .width8
  1940. %if cpuflag(avx2)
  1941. .loop4:
  1942. movu xm0, [r3]
  1943. movu xm1, [r3+r4]
  1944. vinserti128 m0, m0, [r3+r4], 1
  1945. vinserti128 m1, m1, [r3+r4*2], 1
  1946. pshufb m0, m5
  1947. pshufb m1, m5
  1948. pmaddubsw m0, m7
  1949. pmaddubsw m1, m6
  1950. paddw m0, m1
  1951. pmulhrsw m0, shiftround
  1952. packuswb m0, m0
  1953. vextracti128 xm1, m0, 1
  1954. movd [r0], xm0
  1955. movd [r0+r2], xm1
  1956. psrldq xm0, 4
  1957. psrldq xm1, 4
  1958. movd [r1], xm0
  1959. movd [r1+r2], xm1
  1960. lea r3, [r3+r4*2]
  1961. lea r0, [r0+r2*2]
  1962. lea r1, [r1+r2*2]
  1963. sub r5d, 2
  1964. jg .loop4
  1965. RET
  1966. .width8:
  1967. movu xm0, [r3]
  1968. vinserti128 m0, m0, [r3+8], 1
  1969. pshufb m0, m5
  1970. .loop8:
  1971. movu xm3, [r3+r4]
  1972. vinserti128 m3, m3, [r3+r4+8], 1
  1973. pshufb m3, m5
  1974. pmaddubsw m1, m0, m7
  1975. pmaddubsw m2, m3, m6
  1976. pmaddubsw m3, m3, m7
  1977. movu xm0, [r3+r4*2]
  1978. vinserti128 m0, m0, [r3+r4*2+8], 1
  1979. pshufb m0, m5
  1980. pmaddubsw m4, m0, m6
  1981. paddw m1, m2
  1982. paddw m3, m4
  1983. pmulhrsw m1, shiftround
  1984. pmulhrsw m3, shiftround
  1985. packuswb m1, m3
  1986. mova m2, [deinterleave_shufd]
  1987. vpermd m1, m2, m1
  1988. vextracti128 xm2, m1, 1
  1989. movq [r0], xm1
  1990. movhps [r1], xm1
  1991. movq [r0+r2], xm2
  1992. movhps [r1+r2], xm2
  1993. %else
  1994. movu m0, [r3]
  1995. pshufb m0, m5
  1996. .loop4:
  1997. movu m1, [r3+r4]
  1998. pshufb m1, m5
  1999. movu m3, [r3+r4*2]
  2000. pshufb m3, m5
  2001. mova m4, m3
  2002. pmaddubsw m0, m7
  2003. pmaddubsw m2, m1, m7
  2004. pmaddubsw m1, m6
  2005. pmaddubsw m3, m6
  2006. paddw m1, m0
  2007. paddw m3, m2
  2008. pmulhrsw m1, shiftround
  2009. pmulhrsw m3, shiftround
  2010. mova m0, m4
  2011. packuswb m1, m3
  2012. movd [r0], m1
  2013. %if cpuflag(sse4)
  2014. pextrd [r1], m1, 1
  2015. pextrd [r0+r2], m1, 2
  2016. pextrd [r1+r2], m1, 3
  2017. %else
  2018. movhlps m3, m1
  2019. movd [r0+r2], m3
  2020. psrldq m1, 4
  2021. psrldq m3, 4
  2022. movd [r1], m1
  2023. movd [r1+r2], m3
  2024. %endif
  2025. lea r3, [r3+r4*2]
  2026. lea r0, [r0+r2*2]
  2027. lea r1, [r1+r2*2]
  2028. sub r5d, 2
  2029. jg .loop4
  2030. RET
  2031. .width8:
  2032. movu m0, [r3]
  2033. pshufb m0, m5
  2034. movu m1, [r3+8]
  2035. pshufb m1, m5
  2036. %if ARCH_X86_64
  2037. SWAP 9, 6
  2038. %define mult1 m9
  2039. %else
  2040. mova r0m, m6
  2041. %define mult1 r0m
  2042. %endif
  2043. .loop8:
  2044. movu m2, [r3+r4]
  2045. pshufb m2, m5
  2046. movu m3, [r3+r4+8]
  2047. pshufb m3, m5
  2048. mova m4, m2
  2049. mova m6, m3
  2050. pmaddubsw m0, m7
  2051. pmaddubsw m1, m7
  2052. pmaddubsw m2, mult1
  2053. pmaddubsw m3, mult1
  2054. paddw m0, m2
  2055. paddw m1, m3
  2056. pmulhrsw m0, shiftround ; x + 32 >> 6
  2057. pmulhrsw m1, shiftround
  2058. packuswb m0, m1
  2059. pshufd m0, m0, q3120
  2060. movq [r0], m0
  2061. movhps [r1], m0
  2062. movu m2, [r3+r4*2]
  2063. pshufb m2, m5
  2064. movu m3, [r3+r4*2+8]
  2065. pshufb m3, m5
  2066. mova m0, m2
  2067. mova m1, m3
  2068. pmaddubsw m4, m7
  2069. pmaddubsw m6, m7
  2070. pmaddubsw m2, mult1
  2071. pmaddubsw m3, mult1
  2072. paddw m2, m4
  2073. paddw m3, m6
  2074. pmulhrsw m2, shiftround
  2075. pmulhrsw m3, shiftround
  2076. packuswb m2, m3
  2077. pshufd m2, m2, q3120
  2078. movq [r0+r2], m2
  2079. movhps [r1+r2], m2
  2080. %endif
  2081. lea r3, [r3+r4*2]
  2082. lea r0, [r0+r2*2]
  2083. lea r1, [r1+r2*2]
  2084. sub r5d, 2
  2085. jg .loop8
  2086. RET
  2087. %endmacro
  2088. %if HIGH_BIT_DEPTH
  2089. INIT_MMX mmx2
  2090. MC_CHROMA
  2091. INIT_XMM sse2
  2092. MC_CHROMA
  2093. INIT_XMM avx
  2094. MC_CHROMA
  2095. %else ; !HIGH_BIT_DEPTH
  2096. INIT_MMX mmx2
  2097. MC_CHROMA
  2098. INIT_XMM sse2
  2099. MC_CHROMA
  2100. INIT_XMM ssse3
  2101. MC_CHROMA_SSSE3
  2102. INIT_XMM cache64, ssse3
  2103. MC_CHROMA_SSSE3
  2104. INIT_XMM avx
  2105. MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
  2106. INIT_YMM avx2
  2107. MC_CHROMA_SSSE3
  2108. %endif ; HIGH_BIT_DEPTH