/*****************************************************************************
 * predict-c.c: msa intra prediction
 *****************************************************************************
 * Copyright (C) 2015-2018 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH
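
/* Vertical intra prediction: the row of neighbours directly above the block
 * (pointed to by p_src) is replicated into every row of the destination.
 * The 4x4/8x8 variants keep the row in a general-purpose register (LW/LD)
 * and store it with the SW4/SD4 macros; the 16x16 variant keeps it in a
 * single MSA vector register. */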
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint32_t u_src_data;
    u_src_data = LW( p_src );
    SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride );
}

static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint64_t u_out;
    u_out = LD( p_src );
    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
                                          int32_t i_dst_stride )
{
    v16u8 src0 = LD_UB( p_src );
    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
}
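
/* Horizontal intra prediction: each output row is the left-neighbour sample
 * of that row replicated across the row, either by multiplying the byte by
 * 0x01010101 / 0x0101...01ull in a scalar register (4x4, 8x8) or by
 * broadcasting it with __msa_fill_b (16x16). */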
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;
    u_out0 = p_src[0 * i_src_stride] * 0x01010101;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101;
    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
    u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull;
    u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull;
    u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull;
    u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull;
    u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull;
    u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull;
    u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull;
    u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull;
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
}

static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    uint32_t u_row;
    uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
    v16u8 src0, src1, src2, src3;
    for( u_row = 4; u_row--; )
    {
        u_inp0 = p_src[0];
        p_src += i_src_stride;
        u_inp1 = p_src[0];
        p_src += i_src_stride;
        u_inp2 = p_src[0];
        p_src += i_src_stride;
        u_inp3 = p_src[0];
        p_src += i_src_stride;
        src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
        src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
        src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
        src3 = ( v16u8 ) __msa_fill_b( u_inp3 );
        ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}
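
/* DC intra prediction for 4x4 (and, further below, 16x16) blocks.  Depending
 * on the is_above/is_left flags, the available top and/or left neighbours are
 * summed (top sums via the __msa_hadd_u_* horizontal-add intrinsics), rounded
 * and shifted to form the DC value, which is then broadcast over the block;
 * with no neighbours available the block is filled with 128. */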
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      int32_t i_src_stride_left,
                                      uint8_t *p_dst, int32_t i_dst_stride,
                                      uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_out, u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum;
    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 4 ) >> 3;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 2 ) >> 2;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }
    u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );
    SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}
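
/* DC prediction for the luma 8x8 block: unlike the 4x4/16x16 helpers, this
 * variant has no availability flags and always averages the eight top and
 * eight left neighbours, which the 8x8 wrappers at the bottom of this file
 * pass in contiguously from the pu_xyz edge buffer rather than strided out
 * of the reconstructed frame. */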
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_val0, u_val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;
    u_val0 = LD( p_src_top );
    u_val1 = LD( p_src_left );
    INSERT_D2_UB( u_val0, u_val1, src );
    sum_h = __msa_hadd_u_h( src, src );
    sum_w = __msa_hadd_u_w( sum_h, sum_h );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
    store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
    u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
}

static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                        int32_t i_src_stride_left,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;
    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 16 ) >> 5;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }
        u_addition = ( u_addition + 8 ) >> 4;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );
        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
}
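
/* Plane intra prediction.  The horizontal gradient is computed from the top
 * neighbours with a byte shuffle plus weighted horizontal adds, the vertical
 * gradient from the left neighbours with scalar arithmetic; both are scaled
 * as in the H.264 plane predictor (x17, +16, >>5 for 8x8; x5, +32, >>6 for
 * 16x16) and the block is filled with the clipped linear ramp.  Note that
 * these two helpers write the prediction in place at p_src instead of taking
 * a separate destination pointer. */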
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res, i_res0, i_res1, i_res2, i_res3;
    uint64_t u_out0, u_out1;
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;
    p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );
    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    sum = __msa_hadd_s_d( vec8, vec8 );
    i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );
    i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
             3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
             4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );
    i_res0 *= 17;
    i_res1 *= 17;
    i_res0 = ( i_res0 + 16 ) >> 5;
    i_res1 = ( i_res1 + 16 ) >> 5;
    i_res3 = 3 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
    i_res = i_res2 - i_res3;
    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res );
    vec2 = __msa_fill_w( i_res1 );
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;
    for( u_lpcnt = 4; u_lpcnt--; )
    {
        vec0 = vec5;
        vec0 += vec4;
        vec1 = vec0 + vec3;
        vec6 = vec5;
        vec4 += vec2;
        vec6 += vec4;
        vec7 = vec6 + vec3;
        SRA_4V( vec0, vec1, vec6, vec7, 5 );
        PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
        CLIP_SH2_0_255( vec10, vec11 );
        PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );
        u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
        u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
        SD( u_out0, p_src );
        p_src += i_stride;
        SD( u_out1, p_src );
        p_src += i_stride;
        vec4 += vec2;
    }
}

static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res0, i_res1, i_res2, i_res3;
    uint64_t u_load0, u_load1;
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top = { 0 };
    v8i16 vec9, vec10;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
    u_load0 = LD( p_src - ( i_stride + 1 ) );
    u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );
    INSERT_D2_UB( u_load0, u_load1, p_src_top );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );
    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );
    i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );
    i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
             2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
             3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
             4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
             5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
             7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
             8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );
    i_res0 *= 5;
    i_res1 *= 5;
    i_res0 = ( i_res0 + 32 ) >> 6;
    i_res1 = ( i_res1 + 32 ) >> 6;
    i_res3 = 7 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
    i_res2 -= i_res3;
    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res2 );
    vec5 = __msa_fill_w( i_res1 );
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;
    for( u_lpcnt = 16; u_lpcnt--; )
    {
        vec0 = vec7;
        vec0 += vec4;
        vec1 = vec0 + vec6;
        vec2 = vec1 + vec6;
        vec3 = vec2 + vec6;
        SRA_4V( vec0, vec1, vec2, vec3, 5 );
        PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
        CLIP_SH2_0_255( vec9, vec10 );
        PCKEV_ST_SB( vec9, vec10, p_src );
        p_src += i_stride;
        vec4 += vec5;
    }
}
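
/* Per-quadrant DC prediction for an 8x8 block (four 4x4 DC values): the
 * top-left quadrant averages the left half of the top row and the upper half
 * of the left column, the top-right and bottom-left quadrants use only their
 * adjacent top/left halves, and the bottom-right quadrant reuses those two
 * partial sums.  The result is written in place at p_src. */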
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lp_cnt;
    uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 p_src_top;
    v8u16 add;
    v4u32 sum;
    p_src_top = LD_UB( p_src - i_stride );
    add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
    sum = __msa_hadd_u_w( add, add );
    u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
    u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );
    for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
    {
        u_src0 += p_src[u_lp_cnt * i_stride - 1];
        u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
    }
    u_src0 = ( u_src0 + 4 ) >> 3;
    u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
    u_src1 = ( u_src1 + 2 ) >> 2;
    u_src2 = ( u_src2 + 2 ) >> 2;
    u_out0 = u_src0 * 0x01010101;
    u_out1 = u_src1 * 0x01010101;
    u_out2 = u_src2 * 0x01010101;
    u_out3 = u_src3 * 0x01010101;
    for( u_lp_cnt = 4; u_lp_cnt--; )
    {
        SW( u_out0, p_src );
        SW( u_out1, ( p_src + 4 ) );
        SW( u_out2, ( p_src + 4 * i_stride ) );
        SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
        p_src += i_stride;
    }
}
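
/* 8x8 diagonal-down-left prediction from 16 top/top-right neighbours: the
 * neighbours are lowpass filtered with a (1,2,1)/4 kernel (interleave,
 * horizontal add, rounding shift), and each successive output row is the
 * filtered vector shifted along by one more byte (__msa_sldi_b). */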
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                       int32_t i_dst_stride )
{
    uint8_t u_src_val = p_src[15];
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src, vec4, vec5, res0;
    v8u16 vec0, vec1, vec2, vec3;
    v2i64 res1, res2, res3;
    src = LD_UB( p_src );
    vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
    vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
    vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
    ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
    ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
    HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );
    vec0 += vec1;
    vec2 += vec3;
    vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
    vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );
    res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}
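
/* Fill a 16x16 block with the constant 128, i.e. DC prediction when neither
 * the top nor the left neighbours are available. */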
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );
    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
}
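
/* Thin wrappers matching x264's predict function-pointer signatures.  p_src
 * points at the current block inside the reconstructed frame with stride
 * FDEC_STRIDE, so the top neighbours sit at p_src - FDEC_STRIDE and the left
 * neighbours at p_src - 1.  The 8x8 luma wrappers instead take the 36-byte
 * filtered edge array (pu_xyz): as used here, pu_xyz + 16 is the first top
 * neighbour and pu_xyz + 7..14 hold the left column, read downwards from
 * pu_xyz + 14 with a stride of -1. */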
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
}

void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
}

void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
{
    intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
{
    intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
                                   p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
{
    intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
{
    intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
{
    intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
{
    intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
{
    intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
{
    intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
                              p_src, FDEC_STRIDE );
}

void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
}

void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
{
    intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                              FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
{
    intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
{
    intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}
#endif