/*****************************************************************************
 * pixel.c: ppc pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2018 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "pixel.h"

#if !HIGH_BIT_DEPTH
/***********************************************************************
 * SAD routines
 **********************************************************************/

#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) \
static int name( uint8_t *pix1, intptr_t i_pix1, \
                 uint8_t *pix2, intptr_t i_pix2 ) \
{ \
    ALIGNED_16( int sum ); \
 \
    LOAD_ZERO; \
    vec_u8_t pix1v, pix2v; \
    vec_s32_t sumv = zero_s32v; \
    for( int y = 0; y < ly; y++ ) \
    { \
        pix1v = vec_vsx_ld( 0, pix1 ); \
        pix2v = vec_vsx_ld( 0, pix2 ); \
        sumv = (vec_s32_t) vec_sum4s( \
                   vec_absd( pix1v, pix2v ), \
                   (vec_u32_t) sumv ); \
        pix1 += i_pix1; \
        pix2 += i_pix2; \
    } \
    sumv = vec_sum##a( sumv, zero_s32v ); \
    sumv = vec_splat( sumv, b ); \
    vec_ste( sumv, 0, &sum ); \
    return sum; \
}
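/* Editor's note (illustration, not part of the original source): the 'a' and
 * 'b' macro arguments select the final horizontal reduction for the row width.
 * 16-wide rows fill all four 32-bit partial sums, so they use vec_sums and
 * read element 3; 8-wide rows only produce two meaningful partial sums per
 * load, so they use vec_sum2s and read element 1. */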
PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )
/***********************************************************************
 * SATD routines
 **********************************************************************/

/***********************************************************************
 * VEC_HADAMAR
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )
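/* Editor's note (illustration, not part of the original source): VEC_HADAMAR
 * applies the 4-point Hadamard butterfly independently in every vector lane.
 * A scalar sketch of the same computation, assuming plain ints:
 *
 *     s01 = a0 + a1;  s23 = a2 + a3;    // pairwise sums
 *     d01 = a0 - a1;  d23 = a2 - a3;    // pairwise differences
 *     b0 = s01 + s23;  b1 = s01 - s23;
 *     b2 = d01 - d23;  b3 = d01 + d23;
 */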
/***********************************************************************
 * VEC_ABS
 ***********************************************************************
 * a: s16v
 *
 * a = abs(a)
 *
 * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
 * actually also calls vec_splat(0), but we already have a null vector.
 **********************************************************************/
#define VEC_ABS(a) \
    a = vec_max( a, vec_sub( zero_s16v, a ) );

#define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) )
/***********************************************************************
 * VEC_ADD_ABS
 ***********************************************************************
 * a: s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a ); \
    c = vec_sum4s( a, b )
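/* Editor's note (illustration, not part of the original source): the SATD
 * functions below accumulate the absolute values of the 2-D Hadamard-
 * transformed differences and return half of that total (the final ">> 1"),
 * which appears to match the normalization of x264's scalar SATD reference. */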
/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );

    /* Hadamard H */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    /* Hadamard V */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 4x8
 **********************************************************************/
static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 8x4
 **********************************************************************/
static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* This causes warnings because temp4v...temp7v haven't been set,
       but we don't care */
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 8x8
 **********************************************************************/
static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 8x16
 **********************************************************************/
static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                    uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 16x8
 **********************************************************************/
static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                    uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    LOAD_ZERO;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * SATD 16x16
 **********************************************************************/
static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                     uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    LOAD_ZERO;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_ADD_ABS( temp0v, satdv, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );
    VEC_ADD_ABS( temp4v, satdv, satdv );
    VEC_ADD_ABS( temp5v, satdv, satdv );
    VEC_ADD_ABS( temp6v, satdv, satdv );
    VEC_ADD_ABS( temp7v, satdv, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
/***********************************************************************
 * Interleaved SAD routines
 **********************************************************************/
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
                                        uint8_t *pix0, uint8_t *pix1,
                                        uint8_t *pix2, uint8_t *pix3,
                                        intptr_t i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;
        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;
        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );

        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;
        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;
        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        intptr_t i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;
        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;
        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
                                       uint8_t *pix3, intptr_t i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    for( int y = 0; y < 4; y++ )
    {
        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;
        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;
        fencv = vec_ld( 0, fenc );
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;
        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );

        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;
        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;
        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       intptr_t i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    for( int y = 0; y < 4; y++ )
    {
        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
                                       uint8_t *pix0, uint8_t *pix1,
                                       uint8_t *pix2, uint8_t *pix3,
                                       intptr_t i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;
        pix3v = vec_vsx_ld(0, pix3);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );

        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;
        pix3v = vec_vsx_ld(0, pix3);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       intptr_t i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
                                      uint8_t *pix0, uint8_t *pix1,
                                      uint8_t *pix2, uint8_t *pix3,
                                      intptr_t i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    for( int y = 0; y < 4; y++ )
    {
        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;
        pix3v = vec_vsx_ld(0, pix3);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );

        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;
        pix3v = vec_vsx_ld(0, pix3);
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );
    sum3v = vec_sum2s( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );
    sum3v = vec_splat( sum3v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);
    vec_ste( sum3v, 0, &sum3);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}
static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                      uint8_t *pix1, uint8_t *pix2,
                                      intptr_t i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    for( int y = 0; y < 4; y++ )
    {
        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;
        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;
        fencv = vec_vsx_ld(0, fenc);
        fenc += FENC_STRIDE;
        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sum2s( sum0v, zero_s32v );
    sum1v = vec_sum2s( sum1v, zero_s32v );
    sum2v = vec_sum2s( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 1 );
    sum1v = vec_splat( sum1v, 1 );
    sum2v = vec_splat( sum2v, 1 );

    vec_ste( sum0v, 0, &sum0);
    vec_ste( sum1v, 0, &sum1);
    vec_ste( sum2v, 0, &sum2);

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}
/***********************************************************************
 * SSD routines
 **********************************************************************/
static int pixel_ssd_16x16_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
                                    uint8_t *pix2, intptr_t i_stride_pix2 )
{
    ALIGNED_16( int sum );

    LOAD_ZERO;
    vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
    vec_u32_t sumv;
    vec_u8_t diffA, diffB;

    sumv = vec_splat_u32(0);

    pix2vA = vec_vsx_ld(0, pix2);
    pix1vA = vec_ld(0, pix1);

    for( int y = 0; y < 7; y++ )
    {
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        pix2vB = vec_vsx_ld(0, pix2);
        pix1vB = vec_ld(0, pix1);

        diffA = vec_absd(pix1vA, pix2vA);
        sumv = vec_msum(diffA, diffA, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        pix2vA = vec_vsx_ld(0, pix2);
        pix1vA = vec_ld(0, pix1);

        diffB = vec_absd(pix1vB, pix2vB);
        sumv = vec_msum(diffB, diffB, sumv);
    }

    pix1 += i_stride_pix1;
    pix2 += i_stride_pix2;

    pix2vB = vec_vsx_ld(0, pix2);
    pix1vB = vec_ld(0, pix1);

    diffA = vec_absd(pix1vA, pix2vA);
    sumv = vec_msum(diffA, diffA, sumv);

    diffB = vec_absd(pix1vB, pix2vB);
    sumv = vec_msum(diffB, diffB, sumv);

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
                                  uint8_t *pix2, intptr_t i_stride_pix2 )
{
    ALIGNED_16( int sum );

    LOAD_ZERO;
    vec_u8_t pix1v, pix2v;
    vec_u32_t sumv;
    vec_u8_t diffv;

    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);

    sumv = vec_splat_u32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix1v = vec_vsx_ld(0, pix1);
        pix2v = vec_vsx_ld(0, pix2);

        diffv = vec_absd( pix1v, pix2v );
        sumv = vec_msum(diffv, diffv, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }

    /* Only the first 8 bytes of each 16-byte load belong to the block,
     * so keep just the two partial sums they produced. */
    sumv = vec_sel( zero_u32v, sumv, sel );

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}
/****************************************************************************
 * variance
 ****************************************************************************/
static uint64_t pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride )
{
    ALIGNED_16(uint32_t sum_tab[4]);
    ALIGNED_16(uint32_t sqr_tab[4]);

    LOAD_ZERO;
    vec_u32_t sqr_v = zero_u32v;
    vec_u32_t sum_v = zero_u32v;

    for( int y = 0; y < 16; y++ )
    {
        vec_u8_t pix0_v = vec_ld(0, pix);
        sum_v = vec_sum4s(pix0_v, sum_v);
        sqr_v = vec_msum(pix0_v, pix0_v, sqr_v);
        pix += i_stride;
    }

    sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
    sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
    vec_ste(sum_v, 12, sum_tab);
    vec_ste(sqr_v, 12, sqr_tab);

    uint32_t sum = sum_tab[3];
    uint32_t sqr = sqr_tab[3];
    return sum + ((uint64_t)sqr<<32);
}

static uint64_t pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride )
{
    ALIGNED_16(uint32_t sum_tab[4]);
    ALIGNED_16(uint32_t sqr_tab[4]);

    LOAD_ZERO;
    vec_u32_t sqr_v = zero_u32v;
    vec_u32_t sum_v = zero_u32v;

    static const vec_u8_t perm_tab[] =
    {
        CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* pix=mod16, i_stride=mod16 */
           0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17),
        CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,   /* pix=mod8, i_stride=mod16  */
           0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F),
    };
    vec_u8_t perm = perm_tab[ ((uintptr_t)pix & 8) >> 3 ];

    for( int y = 0; y < 4; y++ )
    {
        vec_u8_t pix0_v = vec_ld(0, pix);
        vec_u8_t pix1_v = vec_ld(i_stride, pix);
        vec_u8_t pix_v = vec_perm(pix0_v, pix1_v, perm);
        sum_v = vec_sum4s(pix_v, sum_v);
        sqr_v = vec_msum(pix_v, pix_v, sqr_v);
        pix += i_stride<<1;
    }

    sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
    sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
    vec_ste(sum_v, 12, sum_tab);
    vec_ste(sqr_v, 12, sqr_tab);

    uint32_t sum = sum_tab[3];
    uint32_t sqr = sqr_tab[3];
    return sum + ((uint64_t)sqr<<32);
}
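/* Editor's note (illustration, not part of the original source): both var
 * functions pack the pixel sum into the low 32 bits and the sum of squares
 * into the high 32 bits of the return value; the caller derives the (block-
 * scaled) variance from the pair. A rough sketch for an 8x8 block (64 pixels):
 *
 *     uint64_t v   = pixel_var_8x8_altivec( pix, stride );
 *     uint32_t sum = (uint32_t)v, ssd = (uint32_t)(v >> 32);
 *     uint32_t var = ssd - (uint32_t)(((uint64_t)sum * sum) >> 6);
 */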
/**********************************************************************
 * SA8D routines: sum of 8x8 Hadamard transformed differences
 **********************************************************************/

/* SA8D_1D unrolled by 8 in Altivec */
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, \
                         sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
{ \
    /* int    a0  =        SRC(0) + SRC(4) */ \
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
    /* int    a4  =        SRC(0) - SRC(4) */ \
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
    /* int    a1  =        SRC(1) + SRC(5) */ \
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
    /* int    a5  =        SRC(1) - SRC(5) */ \
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
    /* int    a2  =        SRC(2) + SRC(6) */ \
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
    /* int    a6  =        SRC(2) - SRC(6) */ \
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
    /* int    a3  =        SRC(3) + SRC(7) */ \
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
    /* int    a7  =        SRC(3) - SRC(7) */ \
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
 \
    /* int    b0  =         a0 + a2  */ \
    vec_s16_t b0v = vec_add(a0v, a2v); \
    /* int    b2  =         a0 - a2; */ \
    vec_s16_t b2v = vec_sub(a0v, a2v); \
    /* int    b1  =         a1 + a3; */ \
    vec_s16_t b1v = vec_add(a1v, a3v); \
    /* int    b3  =         a1 - a3; */ \
    vec_s16_t b3v = vec_sub(a1v, a3v); \
    /* int    b4  =         a4 + a6; */ \
    vec_s16_t b4v = vec_add(a4v, a6v); \
    /* int    b6  =         a4 - a6; */ \
    vec_s16_t b6v = vec_sub(a4v, a6v); \
    /* int    b5  =         a5 + a7; */ \
    vec_s16_t b5v = vec_add(a5v, a7v); \
    /* int    b7  =         a5 - a7; */ \
    vec_s16_t b7v = vec_sub(a5v, a7v); \
 \
    /* DST(0, b0 + b1) */ \
    sa8d0v = vec_add(b0v, b1v); \
    /* DST(1, b0 - b1) */ \
    sa8d1v = vec_sub(b0v, b1v); \
    /* DST(2, b2 + b3) */ \
    sa8d2v = vec_add(b2v, b3v); \
    /* DST(3, b2 - b3) */ \
    sa8d3v = vec_sub(b2v, b3v); \
    /* DST(4, b4 + b5) */ \
    sa8d4v = vec_add(b4v, b5v); \
    /* DST(5, b4 - b5) */ \
    sa8d5v = vec_sub(b4v, b5v); \
    /* DST(6, b6 + b7) */ \
    sa8d6v = vec_add(b6v, b7v); \
    /* DST(7, b6 - b7) */ \
    sa8d7v = vec_sub(b6v, b7v); \
}
static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd = 0;

    PREP_DIFF;

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    /* first 1D pass, operating across the eight row vectors */
    SA8D_1D_ALTIVEC( diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    VEC_TRANSPOSE_8( diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v,
                     sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* after the transpose, the second pass covers the other direction */
    SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulate the absolute values of all elements of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );
    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );
    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}
static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd;
    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 ) + 2) >> 2;
    return i_satd;
}
static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                     uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd;
    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) + 2) >> 2;
    return i_satd;
}
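
/* The wrappers above normalize the raw core sum with a single rounded shift:
 * one 8x8 core for PIXEL_8x8, four cores for PIXEL_16x16.  As a sketch of the
 * same pattern, a 16x8 variant (hypothetical -- this file does not register
 * one) could be assembled from two core calls over horizontally adjacent 8x8
 * blocks: */
#if 0
static int pixel_sa8d_16x8_sketch( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t sum = pixel_sa8d_8x8_core_altivec( pix1,   i_pix1, pix2,   i_pix2 )
                + pixel_sa8d_8x8_core_altivec( pix1+8, i_pix1, pix2+8, i_pix2 );
    /* round and divide by 4 once, after summing the raw core results */
    return ( sum + 2 ) >> 2;
}
#endif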
#define HADAMARD4_ALTIVEC( d0, d1, d2, d3, s0, s1, s2, s3 ) \
{ \
    vec_s16_t t0 = vec_add(s0, s1); \
    vec_s16_t t1 = vec_sub(s0, s1); \
    vec_s16_t t2 = vec_add(s2, s3); \
    vec_s16_t t3 = vec_sub(s2, s3); \
    d0 = vec_add(t0, t2); \
    d2 = vec_sub(t0, t2); \
    d1 = vec_add(t1, t3); \
    d3 = vec_sub(t1, t3); \
}

#ifdef WORDS_BIGENDIAN
#define vec_perm_extend_s16(val, perm) (vec_s16_t)vec_perm(val, zero_u8v, perm)
#else
#define vec_perm_extend_s16(val, perm) (vec_s16_t)vec_perm(zero_u8v, val, perm)
#endif

#define VEC_LOAD_HIGH( p, num ) \
    vec_u8_t pix8_##num = vec_ld( stride*num, p ); \
    vec_s16_t pix16_s##num = vec_perm_extend_s16( pix8_##num, perm ); \
    vec_s16_t pix16_d##num;
static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm )
{
    ALIGNED_16( int32_t sum4_tab[4] );
    ALIGNED_16( int32_t sum8_tab[4] );
    LOAD_ZERO;

    VEC_LOAD_HIGH( pix, 0 );
    VEC_LOAD_HIGH( pix, 1 );
    VEC_LOAD_HIGH( pix, 2 );
    VEC_LOAD_HIGH( pix, 3 );
    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    VEC_LOAD_HIGH( pix, 4 );
    VEC_LOAD_HIGH( pix, 5 );
    VEC_LOAD_HIGH( pix, 6 );
    VEC_LOAD_HIGH( pix, 7 );
    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7,
                    pix16_s0, pix16_s1, pix16_s2, pix16_s3,
                    pix16_s4, pix16_s5, pix16_s6, pix16_s7);

    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);
    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    /* sum of absolute values of the 4x4 Hadamard coefficients */
    vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) );
    vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) );
    vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) );
    vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) );

    vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67));
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum4_v, zero_s32v), zero_s32v), 12, sum4_tab);

    /* extend the transform to 8x8 */
    vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4);
    vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4);
    vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5);
    vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5);
    vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6);
    vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6);
    vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7);
    vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7);

    int sum4 = sum4_tab[3];

    VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
                    tmpi4, tmpi5, tmpi6, tmpi7,
                    pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7);

    /* sum of absolute values of the 8x8 Hadamard coefficients */
    vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) );
    vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) );
    vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) );
    vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) );

    vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) );
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum8_v, zero_s32v), zero_s32v), 12, sum8_tab);

    int sum8 = sum8_tab[3];

    /* subtract the DC term from both sums */
    ALIGNED_16( int16_t tmp0_4_tab[8] );
    vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);

    sum4 -= tmp0_4_tab[0];
    sum8 -= tmp0_4_tab[0];

    /* pack: 8x8 sum in the high 32 bits, 4x4 sum in the low 32 bits */
    return ((uint64_t)sum8 << 32) + sum4;
}
/* Permutations used by vec_perm_extend_s16 to zero-extend eight packed u8
 * pixels into a vector of eight s16 values.  vec_ld rounds the address down
 * to a 16-byte boundary, so the wanted pixels sit in either the low or the
 * high half of the load depending on whether pix is 16- or only 8-byte
 * aligned, hence the two variants. */
static const vec_u8_t hadamard_permtab[] =
{
    CV(0x10,0x00,0x11,0x01, 0x12,0x02,0x13,0x03,   /* pix = mod16 */
       0x14,0x04,0x15,0x05, 0x16,0x06,0x17,0x07 ),
    CV(0x18,0x08,0x19,0x09, 0x1A,0x0A,0x1B,0x0B,   /* pix = mod8 */
       0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F )
};
static uint64_t pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride )
{
    int idx = ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8*stride+8, stride, perml );
    /* normalize: the 8x8 sums (high half) by >>2, the 4x4 sums (low half) by >>1 */
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
static uint64_t pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride )
{
    int idx = ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
static uint64_t pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
static uint64_t pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
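
/* Each wrapper above packs two results into one 64-bit value: the 8x8 sum in
 * the high 32 bits (normalized by >>2) and the 4x4 sum in the low 32 bits
 * (normalized by >>1).  A sketch of how a caller could unpack it; the helper
 * name below is illustrative, not an x264 API: */
#if 0
static void hadamard_ac_unpack_sketch( uint8_t *pix, intptr_t stride,
                                       uint32_t *sum8, uint32_t *sum4 )
{
    uint64_t ac = pixel_hadamard_ac_8x8_altivec( pix, stride );
    *sum8 = (uint32_t)(ac >> 32); /* 8x8 Hadamard sum, already >>2 */
    *sum4 = (uint32_t)ac;         /* 4x4 Hadamard sum, already >>1 */
}
#endif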
/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1,
                                     const uint8_t *pix2, intptr_t stride2,
                                     int sums[2][4] )
{
    ALIGNED_16( int temp[4] );

    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;
    LOAD_ZERO;

    s1v = s2v = ssv = s12v = zero_u32v;

    for( int y = 0; y < 4; y++ )
    {
        pix1v = vec_vsx_ld( y*stride1, pix1 );
        pix2v = vec_vsx_ld( y*stride2, pix2 );

        s1v  = vec_sum4s( pix1v, s1v );
        s2v  = vec_sum4s( pix2v, s2v );
        ssv  = vec_msum( pix1v, pix1v, ssv );
        ssv  = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    /* elements 0 and 1 hold the totals for the two adjacent 4x4 blocks */
    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}
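
/* Equivalent scalar sketch of the computation above: for each of the two
 * horizontally adjacent 4x4 blocks, accumulate the pixel sums of both
 * sources, the combined sum of squares, and the cross products.  Compiled
 * out; the function name is illustrative only. */
#if 0
static void ssim_4x4x2_core_scalar_sketch( const uint8_t *pix1, intptr_t stride1,
                                           const uint8_t *pix2, intptr_t stride2,
                                           int sums[2][4] )
{
    for( int z = 0; z < 2; z++, pix1 += 4, pix2 += 4 )
    {
        int s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
            {
                int a = pix1[y*stride1 + x];
                int b = pix2[y*stride2 + x];
                s1  += a;
                s2  += b;
                ss  += a*a + b*b;
                s12 += a*b;
            }
        sums[z][0] = s1;  /* sum of pix1 */
        sums[z][1] = s2;  /* sum of pix2 */
        sums[z][2] = ss;  /* sum of squares of both sources */
        sums[z][3] = s12; /* sum of cross products */
    }
}
#endif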
#define SATD_X( size ) \
static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \
                                            intptr_t i_stride, int scores[3] ) \
{ \
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride ); \
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride ); \
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride ); \
} \
static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, \
                                            uint8_t *pix3, intptr_t i_stride, int scores[4] ) \
{ \
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride ); \
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride ); \
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride ); \
    scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride ); \
}

SATD_X( 16x16 )
SATD_X( 16x8 )
SATD_X( 8x16 )
SATD_X( 8x8 )
SATD_X( 8x4 )
SATD_X( 4x8 )
SATD_X( 4x4 )
#define INTRA_MBCMP_8x8( mbcmp ) \
static void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] ) \
{ \
    ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] ); \
    x264_predict_8x8_v_c( pix, edge ); \
    res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE ); \
    x264_predict_8x8_h_c( pix, edge ); \
    res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE ); \
    x264_predict_8x8_dc_c( pix, edge ); \
    res[2] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE ); \
}

INTRA_MBCMP_8x8(sad)
INTRA_MBCMP_8x8(sa8d)
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma ) \
static void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] ) \
{ \
    x264_predict_##size##x##size##chroma##_##pred1##_c( fdec ); \
    res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE ); \
    x264_predict_##size##x##size##chroma##_##pred2##_c( fdec ); \
    res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE ); \
    x264_predict_##size##x##size##chroma##_##pred3##_c( fdec ); \
    res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE ); \
}

INTRA_MBCMP(satd, 4, v, h, dc, )
INTRA_MBCMP(sad, 8, dc, h, v, c )
INTRA_MBCMP(satd, 8, dc, h, v, c )
INTRA_MBCMP(sad, 16, v, h, dc, )
INTRA_MBCMP(satd, 16, v, h, dc, )
#endif // !HIGH_BIT_DEPTH
/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init_altivec( x264_pixel_function_t *pixf )
{
#if !HIGH_BIT_DEPTH
    pixf->sad[PIXEL_16x16] = pixel_sad_16x16_altivec;
    pixf->sad[PIXEL_8x16]  = pixel_sad_8x16_altivec;
    pixf->sad[PIXEL_16x8]  = pixel_sad_16x8_altivec;
    pixf->sad[PIXEL_8x8]   = pixel_sad_8x8_altivec;

    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;

    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;

    pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec;
    pixf->satd_x3[PIXEL_8x16]  = pixel_satd_x3_8x16_altivec;
    pixf->satd_x3[PIXEL_16x8]  = pixel_satd_x3_16x8_altivec;
    pixf->satd_x3[PIXEL_8x8]   = pixel_satd_x3_8x8_altivec;
    pixf->satd_x3[PIXEL_8x4]   = pixel_satd_x3_8x4_altivec;
    pixf->satd_x3[PIXEL_4x8]   = pixel_satd_x3_4x8_altivec;
    pixf->satd_x3[PIXEL_4x4]   = pixel_satd_x3_4x4_altivec;

    pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec;
    pixf->satd_x4[PIXEL_8x16]  = pixel_satd_x4_8x16_altivec;
    pixf->satd_x4[PIXEL_16x8]  = pixel_satd_x4_16x8_altivec;
    pixf->satd_x4[PIXEL_8x8]   = pixel_satd_x4_8x8_altivec;
    pixf->satd_x4[PIXEL_8x4]   = pixel_satd_x4_8x4_altivec;
    pixf->satd_x4[PIXEL_4x8]   = pixel_satd_x4_4x8_altivec;
    pixf->satd_x4[PIXEL_4x4]   = pixel_satd_x4_4x4_altivec;

    pixf->intra_sad_x3_8x8    = intra_sad_x3_8x8_altivec;
    pixf->intra_sad_x3_8x8c   = intra_sad_x3_8x8c_altivec;
    pixf->intra_sad_x3_16x16  = intra_sad_x3_16x16_altivec;

    pixf->intra_satd_x3_4x4   = intra_satd_x3_4x4_altivec;
    pixf->intra_satd_x3_8x8c  = intra_satd_x3_8x8c_altivec;
    pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec;

    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;

    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;

    pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_altivec;

    pixf->var[PIXEL_16x16] = pixel_var_16x16_altivec;
    pixf->var[PIXEL_8x8]   = pixel_var_8x8_altivec;

    pixf->hadamard_ac[PIXEL_16x16] = pixel_hadamard_ac_16x16_altivec;
    pixf->hadamard_ac[PIXEL_16x8]  = pixel_hadamard_ac_16x8_altivec;
    pixf->hadamard_ac[PIXEL_8x16]  = pixel_hadamard_ac_8x16_altivec;
    pixf->hadamard_ac[PIXEL_8x8]   = pixel_hadamard_ac_8x8_altivec;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
#endif // !HIGH_BIT_DEPTH
}
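
/* Typical use (a sketch): the generic pixel init fills pixf with the C
 * implementations first, then calls this function to override them when the
 * CPU reports AltiVec support.  The snippet below assumes a cpu flag word and
 * an already-initialized pixf at the caller, as in x264's common pixel setup. */
#if 0
if( cpu & X264_CPU_ALTIVEC )
    x264_pixel_init_altivec( pixf );
#endif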