/*****************************************************************************
 * pixel-c.c: msa pixel metrics
 *****************************************************************************
 * Copyright (C) 2015-2018 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "pixel.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH
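
/* Accumulate the sum of squared differences between two vectors of 16 pixels
 * into 'var' (v4i32): interleave src/ref bytes, take the horizontal signed
 * differences, then dot-product-accumulate the differences with themselves.
 * CALC_MSE_AVG_B additionally accumulates the signed differences into 'sub',
 * which the var2 helpers below need. */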
#define CALC_MSE_B( src, ref, var ) \
{ \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
    \
    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \
    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \
    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \
}

#define CALC_MSE_AVG_B( src, ref, var, sub ) \
{ \
    v16u8 src_l0_m, src_l1_m; \
    v8i16 res_l0_m, res_l1_m; \
    \
    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \
    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \
    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \
    \
    sub += res_l0_m + res_l1_m; \
}
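
/* Scaled variance: subtract the squared sum, divided by the pixel count
 * (shift = log2 of the pixel count), from the sum of squares. */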
#define VARIANCE_WxH( sse, diff, shift ) \
    ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) )
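
/* SAD kernels for 4-, 8- and 16-pixel-wide blocks. Each loop iteration
 * processes four rows; per-byte absolute differences are accumulated into
 * unsigned halfword lanes and reduced at the end. */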
static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v16u8 diff;
    v8u16 sad = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
        p_src += ( 4 * i_src_stride );
        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
        p_ref += ( 4 * i_ref_stride );
        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );
        diff = __msa_asub_u_b( src, ref );
        sad += __msa_hadd_u_h( diff, diff );
    }

    return ( HADD_UH_U32( sad ) );
}

static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );
        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    return ( HADD_UH_U32( sad ) );
}

static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t i_height )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB2( p_src, i_src_stride, src0, src1 );
        p_src += ( 2 * i_src_stride );
        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
        p_ref += ( 2 * i_ref_stride );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB2( p_src, i_src_stride, src0, src1 );
        p_src += ( 2 * i_src_stride );
        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
        p_ref += ( 2 * i_ref_stride );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    return ( HADD_UH_U32( sad ) );
}
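
/* Multi-reference SAD: compute the SADs of one source block against three
 * (x3d) or four (x4d) reference blocks in a single pass and store them in
 * pu_sad_array. */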
static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref0, uint8_t *p_ref1,
                                uint8_t *p_ref2, int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src = { 0 };
    uint32_t src0, src1, src2, src3, load0, load1, load2, load3;
    v16u8 ref0 = { 0 };
    v16u8 ref1 = { 0 };
    v16u8 ref2 = { 0 };
    v16u8 diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
        INSERT_W4_UB( src0, src1, src2, src3, src );
        p_src += ( 4 * i_src_stride );

        LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref0 );
        p_ref0 += ( 4 * i_ref_stride );

        LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref1 );
        p_ref1 += ( 4 * i_ref_stride );

        LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref2 );
        p_ref2 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref0, uint8_t *p_ref1,
                                uint8_t *p_ref2, int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref0 += ( 4 * i_ref_stride );
        PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                     src0, src1, ref0, ref1 );
        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref1 += ( 4 * i_ref_stride );
        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref2 += ( 4 * i_ref_stride );
        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_16width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref0, uint8_t *p_ref1,
                                 uint8_t *p_ref2, int32_t i_ref_stride,
                                 int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src, ref;
    v16u8 diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };

    for( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );
        ref = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );
        ref = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );
        ref = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );
        ref = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_aref[], int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    int32_t i_ht_cnt;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v16u8 diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };
    v8u16 sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
        INSERT_W4_UB( src0, src1, src2, src3, src );
        p_src += ( 4 * i_src_stride );

        LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref0 += ( 4 * i_ref_stride );
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref1 += ( 4 * i_ref_stride );
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref2 += ( 4 * i_ref_stride );
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref3 += ( 4 * i_ref_stride );
        diff = __msa_asub_u_b( src, ref );
        sad3 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_aref[], int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };
    v8u16 sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref0 += ( 4 * i_ref_stride );
        LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 );
        p_ref1 += ( 4 * i_ref_stride );
        LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 );
        p_ref2 += ( 4 * i_ref_stride );
        LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 );
        p_ref3 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 );
        PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 );
        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 );
        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 );
        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 );
        sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_aref[], int32_t i_ref_stride,
                                 int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    v16u8 src, ref0, ref1, ref2, ref3, diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };
    v8u16 sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref0 = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        ref1 = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        ref2 = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        ref3 = LD_UB( p_ref3 );
        p_ref3 += i_ref_stride;

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref3 );
        sad3 += __msa_hadd_u_h( diff, diff );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref0 = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        ref1 = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        ref2 = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        ref3 = LD_UB( p_ref3 );
        p_ref3 += i_ref_stride;

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref3 );
        sad3 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}
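
/* Sum and sum of squares for pixel variance: the low 32 bits of the return
 * value hold the pixel sum, the high 32 bits the sum of squared pixels. */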
static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride,
                                          uint8_t i_height )
{
    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
    v16i8 pix, zero = { 0 };
    v8u16 add, pix_r, pix_l;
    v4u32 sqr = { 0 };

    for( u_cnt = i_height; u_cnt--; )
    {
        pix = LD_SB( p_pix );
        p_pix += i_stride;
        add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix );
        u_sum += HADD_UH_U32( add );
        ILVRL_B2_UH( zero, pix, pix_r, pix_l );
        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
        sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l );
    }

    u_sqr_out = HADD_SW_S32( sqr );

    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
}

static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride,
                                         uint8_t i_height )
{
    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
    v16i8 pix, zero = { 0 };
    v8u16 add, pix_r;
    v4u32 sqr = { 0 };

    for( u_cnt = i_height; u_cnt--; )
    {
        pix = LD_SB( p_pix );
        p_pix += i_stride;
        pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix );
        add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r );
        u_sum += HADD_UH_U32( add );
        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
    }

    u_sqr_out = HADD_SW_S32( sqr );

    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
}
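
/* Sum of squared differences between two 8-pixel-wide blocks; the signed sum
 * of the differences is written to *p_diff for the var2 wrappers below. */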
static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                     uint8_t *p_ref, int32_t i_ref_stride,
                                     int32_t i_height, int32_t *p_diff )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v8i16 avg = { 0 };
    v4i32 vec, var = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );
        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        CALC_MSE_AVG_B( src0, ref0, var, avg );
        CALC_MSE_AVG_B( src1, ref1, var, avg );
    }

    vec = __msa_hadd_s_w( avg, avg );
    *p_diff = HADD_SW_S32( vec );
    u_sse = HADD_SW_S32( var );

    return u_sse;
}

static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    uint32_t u_src0, u_src1, u_src2, u_src3;
    uint32_t u_ref0, u_ref1, u_ref2, u_ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v4i32 var = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
        p_src += ( 4 * i_src_stride );
        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
        p_ref += ( 4 * i_ref_stride );
        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );
        CALC_MSE_B( src, ref, var );
    }

    u_sse = HADD_SW_S32( var );

    return u_sse;
}

static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );
        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        CALC_MSE_B( src0, ref0, var );
        CALC_MSE_B( src1, ref1, var );
    }

    u_sse = HADD_SW_S32( var );

    return u_sse;
}

static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );
    }

    u_sse = HADD_SW_S32( var );

    return u_sse;
}
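
/* SSIM 4x4x2 core: for two horizontally adjacent 4x4 blocks, store
 * [0] the sum of src pixels, [1] the sum of ref pixels, [2] the sum of
 * src^2 + ref^2 and [3] the sum of src*ref into pi_sum_array[block][]. */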
static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride,
                                 const uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t pi_sum_array[2][4] )
{
    v16i8 zero = { 0 };
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v4u32 tmp0;
    v4i32 tmp2, tmp3;

    LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
    p_src += ( 4 * i_src_stride );
    LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
    p_ref += ( 4 * i_ref_stride );

    ILVR_D2_UB( src1, src0, src3, src2, src0, src2 );
    ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 );

    HADD_UB2_UH( src0, src2, temp0, temp1 );
    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 );
    pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 );

    HADD_UB2_UH( ref0, ref2, temp0, temp1 );
    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 );
    pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 );

    ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2,
                vec4, vec6 );
    ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3,
                vec5, vec7 );

    tmp0 = __msa_dotp_u_w( vec0, vec0 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 );
    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );
    pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 );
    pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 );

    tmp0 = __msa_dotp_u_w( vec4, vec0 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 );
    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );
    pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 );
    pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 );
}
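
/* SATD: 4x4 Hadamard transform of the src/ref residual; the return value is
 * the sum of absolute transform coefficients divided by 2. */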
static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                      uint8_t *p_ref, int32_t i_ref_stride,
                                      uint8_t i_height )
{
    int32_t cnt;
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3;
    v16i8 ref0, ref1, ref2, ref3;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    for( cnt = i_height >> 2; cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;
        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += 4 * i_ref_stride;

        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                    diff0, diff1, diff2, diff3 );
        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );

        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff1, diff2, diff3 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff1, diff2, diff3 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );

        diff0 = __msa_add_a_h( diff0, zero );
        diff1 = __msa_add_a_h( diff1, zero );
        diff2 = __msa_add_a_h( diff2, zero );
        diff3 = __msa_add_a_h( diff3, zero );
        diff0 = ( diff0 + diff1 + diff2 + diff3 );
        diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 );
        diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 );
        u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 );
    }

    return ( u_sum >> 1 );
}

static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                      uint8_t *p_ref, int32_t i_ref_stride,
                                      uint8_t i_height )
{
    int32_t cnt;
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3;
    v16i8 ref0, ref1, ref2, ref3;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 temp0, temp1, temp2, temp3;

    for( cnt = i_height >> 2; cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;
        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += 4 * i_ref_stride;

        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                    diff0, diff1, diff2, diff3 );
        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );

        TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff2, diff4, diff6 );
        diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 );
        diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 );
        diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 );
        diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 );

        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
        BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

        TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6,
                            diff7, diff0, diff1, diff2, diff3, diff4, diff5,
                            diff6, diff7 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );

        diff0 = __msa_add_a_h( diff0, zero );
        diff1 = __msa_add_a_h( diff1, zero );
        diff2 = __msa_add_a_h( diff2, zero );
        diff3 = __msa_add_a_h( diff3, zero );
        diff0 = ( diff0 + diff1 + diff2 + diff3 );
        u_sum += HADD_UH_U32( diff0 );
    }

    return ( u_sum >> 1 );
}
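
/* SA8D: 8x8 Hadamard transform of the src/ref residual; the caller adds the
 * rounding term and normalizes the returned coefficient sum. */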
static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_ref, int32_t i_ref_stride )
{
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
    v8i16 temp0, temp1, temp2, temp3;

    LD_SB8( p_src, i_src_stride,
            src0, src1, src2, src3, src4, src5, src6, src7 );
    LD_SB8( p_ref, i_ref_stride,
            ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 );

    ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1,
                sub2, sub3 );
    ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5,
                sub6, sub7 );
    HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 );
    HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 );

    TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );
    BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 );
    BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    temp0 = diff0 + diff4;
    temp1 = diff1 + diff5;
    temp2 = diff2 + diff6;
    temp3 = diff3 + diff7;
    temp0 = __msa_add_a_h( temp0, zero );
    temp1 = __msa_add_a_h( temp1, zero );
    temp2 = __msa_add_a_h( temp2, zero );
    temp3 = __msa_add_a_h( temp3, zero );
    diff0 = temp0 + __msa_asub_s_h( diff0, diff4 );
    diff1 = temp1 + __msa_asub_s_h( diff1, diff5 );
    diff2 = temp2 + __msa_asub_s_h( diff2, diff6 );
    diff3 = temp3 + __msa_asub_s_h( diff3, diff7 );
    diff0 = ( diff0 + diff1 + diff2 + diff3 );

    u_sum = HADD_UH_U32( diff0 );

    return u_sum;
}
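
/* Hadamard AC: returns the 8x8 coefficient sum in the high 32 bits and the
 * 4x4 coefficient sum in the low 32 bits, with the four 4x4 DC terms
 * subtracted from both. */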
static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride )
{
    int16_t tmp0, tmp1, tmp2, tmp3;
    uint32_t u_sum4 = 0, u_sum8 = 0, u_dc;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
    v8i16 temp0, temp1, temp2, temp3;

    LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 );
    ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1,
                diff2, diff3 );
    ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5,
                diff6, diff7 );

    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    tmp0 = diff0[0];
    tmp1 = diff0[4];
    tmp2 = diff4[0];
    tmp3 = diff4[4];

    sub0 = __msa_add_a_h( diff0, zero );
    sub1 = __msa_add_a_h( diff1, zero );
    sub2 = __msa_add_a_h( diff2, zero );
    sub3 = __msa_add_a_h( diff3, zero );
    sub4 = __msa_add_a_h( diff4, zero );
    sub5 = __msa_add_a_h( diff5, zero );
    sub6 = __msa_add_a_h( diff6, zero );
    sub7 = __msa_add_a_h( diff7, zero );
    sub0 = ( sub0 + sub1 + sub2 + sub3 );
    sub1 = ( sub4 + sub5 + sub6 + sub7 );
    sub0 += sub1;
    u_sum4 += HADD_UH_U32( sub0 );

    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );
    ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 );
    ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 );
    diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 );
    diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 );
    diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    sub0 = __msa_add_a_h( diff0, zero );
    sub1 = __msa_add_a_h( diff1, zero );
    sub2 = __msa_add_a_h( diff2, zero );
    sub3 = __msa_add_a_h( diff3, zero );
    sub4 = __msa_add_a_h( diff4, zero );
    sub5 = __msa_add_a_h( diff5, zero );
    sub6 = __msa_add_a_h( diff6, zero );
    sub7 = __msa_add_a_h( diff7, zero );
    sub0 = ( sub0 + sub1 + sub2 + sub3 );
    sub1 = ( sub4 + sub5 + sub6 + sub7 );
    sub0 += sub1;
    u_sum8 += HADD_UH_U32( sub0 );

    u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 );
    u_sum4 = u_sum4 - u_dc;
    u_sum8 = u_sum8 - u_dc;

    return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4;
}
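
/* Exported x264 pixel functions: thin wrappers that map each block size onto
 * the width-specific kernels above. */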
int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
                         ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                         ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] )
{
    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                         i_ref_stride, 16, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] )
{
    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                         i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 16, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
}

int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_4x4_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_intra_predict_hor_4x4_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_intra_predict_dc_4x4_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_16x16_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
    x264_intra_predict_hor_16x16_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
    x264_intra_predict_dc_16x16_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_intra_predict_v_8x8_msa( pix, p_edge );
    p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_intra_predict_h_8x8_msa( pix, p_edge );
    p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_intra_predict_dc_8x8_msa( pix, p_edge );
    p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_intra_predict_hor_8x8_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
    x264_intra_predict_vert_8x8_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
                               const uint8_t *p_pix2, intptr_t i_stride2,
                               int32_t i_sums[2][4] )
{
    ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums );
}
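
/* pixel_hadamard_ac_8x8_msa packs two sums into one 64-bit value; normalize
 * the 8x8 sum by 4 (>> 34 keeps the high word divided by 4) and the 4x4 sum
 * by 2. */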
uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
}

int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
}

int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
}

int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
}

int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
}

int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
}

int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    uint32_t u32Sum = 0;

    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
                                     p_pix2 + 8, i_stride2, 8 );

    return u32Sum;
}

int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 )
{
    uint32_t u32Sum = 0;

    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
                                     p_pix2 + 8, i_stride2, 16 );

    return u32Sum;
}

int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 );

    return ( i32Sum + 2 ) >> 2;
}

int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 )
{
    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8, i_stride,
                                   p_pix2 + 8, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride,
                                   p_pix2 + 8 * i_stride2, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride,
                                   p_pix2 + 8 + 8 * i_stride2, i_stride2 );

    return ( i32Sum + 2 ) >> 2;
}

void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_4x4_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_intra_predict_hor_4x4_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_intra_predict_dc_4x4_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_16x16_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );
    x264_intra_predict_hor_16x16_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );
    x264_intra_predict_dc_16x16_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );
}

void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_intra_predict_v_8x8_msa( pix, p_edge );
    p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_intra_predict_h_8x8_msa( pix, p_edge );
    p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_intra_predict_dc_8x8_msa( pix, p_edge );
    p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_intra_predict_hor_8x8_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
    x264_intra_predict_vert_8x8_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var16width_msa( p_pix, i_stride, 16 );
}

uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var8width_msa( p_pix, i_stride, 16 );
}

uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var8width_msa( p_pix, i_stride, 8 );
}
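
/* var2: SSE minus the squared difference sum divided by the pixel count
 * (shift 7 for 8x16 = 128 pixels, shift 6 for 8x8 = 64 pixels). */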
int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                  uint8_t *p_pix2, intptr_t i_stride2,
                                  int32_t *p_ssd )
{
    int32_t i_var = 0, i_diff = 0, i_sqr = 0;

    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16,
                                 &i_diff );
    i_var = VARIANCE_WxH( i_sqr, i_diff, 7 );
    *p_ssd = i_sqr;

    return i_var;
}

int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                 uint8_t *p_pix2, intptr_t i_stride2,
                                 int32_t *p_ssd )
{
    int32_t i_var = 0, i_diff = 0, i_sqr = 0;

    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1,
                                 p_pix2, i_stride2, 8, &i_diff );
    i_var = VARIANCE_WxH( i_sqr, i_diff, 6 );
    *p_ssd = i_sqr;

    return i_var;
}

#endif