/*****************************************************************************
 * dct-c.c: msa transform and zigzag
 *****************************************************************************
 * Copyright (C) 2015-2018 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "dct.h"

#if !HIGH_BIT_DEPTH
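
/* One pass of the AVC 4x4 inverse integer transform: butterflies four
 * row (or column) vectors of coefficients into four output vectors. */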
#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )         \
{                                                                          \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    tmp0_m = in0 + in2;                                                    \
    tmp1_m = in0 - in2;                                                    \
    tmp2_m = in1 >> 1;                                                     \
    tmp2_m = tmp2_m - in3;                                                 \
    tmp3_m = in3 >> 1;                                                     \
    tmp3_m = in1 + tmp3_m;                                                 \
                                                                           \
    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 ); \
}
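
/* Forward 4x4 Hadamard transform of the luma DC coefficients:
 * horizontal butterflies, transpose, vertical butterflies, then a
 * rounding shift by 1 before packing the results back to 16 bits. */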
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
{
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}
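
/* Subtract a 4x4 predicted block from the source block and apply the
 * forward 4x4 integer transform to the residual: row pass, transpose,
 * then column pass. */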
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_dst_stride,
                                int16_t *p_dst )
{
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
    v16i8 src = { 0 };
    v16i8 ref = { 0 };
    v16u8 inp0, inp1;
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
    ILVRL_B2_UB( src, ref, inp0, inp1 );
    HSUB_UB2_SH( inp0, inp1, diff0, diff2 );
    diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
    BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
    diff0 = temp0 + temp1;
    diff1 = ( temp3 << 1 ) + temp2;
    diff2 = temp0 - temp1;
    diff3 = temp3 - ( temp2 << 1 );
    TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                        temp0, temp1, temp2, temp3 );
    BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
    temp0 = diff0 + diff1;
    temp1 = ( diff3 << 1 ) + diff2;
    temp2 = diff0 - diff1;
    temp3 = diff3 - ( diff2 << 1 );
    ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
    ST_UB2( inp0, inp1, p_dst, 8 );
}
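
/* Frame (progressive) zigzag scan of a 4x4 coefficient block: a pair of
 * halfword shuffles reorders the 16 coefficients; the shuffled results
 * land in mask0/mask1, which are then stored to pi_level. */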
static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
                                           int16_t pi_level[16] )
{
    v8i16 src0, src1;
    v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
    v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };

    LD_SH2( pi_dct, 8, src0, src1 );
    VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
    ST_SH2( mask0, mask1, pi_level, 8 );
}
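
/* 4x4 inverse transform and reconstruction: horizontal pass, transpose,
 * vertical pass, rounding shift by 6, add to the prediction in p_dst,
 * then clear the coefficient block in p_src. */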
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                    int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v8i16 zeros = { 0 };

    LD4x4_SH( p_src, src0, src1, src2, src3 );
    AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
    TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
    SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
    ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
    ST_SH2( zeros, zeros, p_src, 8 );
}
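
/* DC-only inverse transform: round the single DC coefficient, broadcast
 * it, add it to each pixel of the 4x4 prediction and clip to 0..255. */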
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
                                       int32_t i_dst_stride )
{
    int16_t i_dc;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    v16u8 pred = { 0 };
    v16i8 out;
    v8i16 input_dc, pred_r, pred_l;

    i_dc = ( p_src[0] + 32 ) >> 6;
    input_dc = __msa_fill_h( i_dc );
    p_src[0] = 0;

    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
    UNPCK_UB_SH( pred, pred_r, pred_l );
    pred_r += input_dc;
    pred_l += input_dc;
    CLIP_SH2_0_255( pred_r, pred_l );
    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
}
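
/* 8x8 inverse transform and reconstruction: even/odd butterflies on the
 * rows in 16-bit precision, a transpose, the same butterflies on the
 * columns in 32-bit precision, a shift by 6, then add to the 8x8
 * prediction and clip to 0..255. */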
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    p_src[0] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}
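
/* Inverse 4x4 Hadamard transform of the DC coefficients; unlike the
 * forward version above, no rounding shift is applied here. */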
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                               int16_t *p_dst, int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}
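
/* Sum of the differences (src - pred) over one 4x4 block; used by the
 * sub8x8/sub8x16 DCT-DC helpers below. */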
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                    uint8_t *pred_ptr, int32_t i_pred_stride )
{
    int16_t i_sum;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
    v16i8 src = { 0 };
    v16i8 pred = { 0 };
    v16u8 src_l0, src_l1;
    v8i16 diff0, diff1;

    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
    ILVRL_B2_UB( src, pred, src_l0, src_l1 );
    HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
    i_sum = HADD_UH_U32( diff0 + diff1 );

    return i_sum;
}
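
/* Thin wrappers exposing the MSA kernels with x264's fixed encode/decode
 * buffer strides (FENC_STRIDE / FDEC_STRIDE). */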
void x264_dct4x4dc_msa( int16_t d[16] )
{
    avc_dct4x4dc_msa( d, d, 4 );
}

void x264_idct4x4dc_msa( int16_t d[16] )
{
    avc_idct4x4dc_msa( d, 4, d, 4 );
}

void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
}

void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
{
    avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
    avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE );
    avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0],
                            &pi_dct[2][0], FDEC_STRIDE );
    avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4],
                            &pi_dct[3][0], FDEC_STRIDE );
}

void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
{
    x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] );
    x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] );
    x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] );
    x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] );
}

void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
{
    avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
}

void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
{
    avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
    avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE );
    avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0],
                          &pi_dct[2][0], FDEC_STRIDE );
    avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8],
                          &pi_dct[3][0], FDEC_STRIDE );
}

void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
{
    avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE );
    avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE );
    avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0],
                               &pi_dct[2], FDEC_STRIDE );
    avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4],
                               &pi_dct[3], FDEC_STRIDE );
}

void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE )
    {
        avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE );
        avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE );
        avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE );
        avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE );
    }
}

void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
                          uint8_t *p_ref )
{
    avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst );
}

void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
                          uint8_t *p_ref )
{
    avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE,
                        &p_ref[0], FDEC_STRIDE, p_dst[0] );
    avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE,
                        &p_ref[4], FDEC_STRIDE, p_dst[1] );
    avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0], FENC_STRIDE,
                        &p_ref[4 * FDEC_STRIDE + 0], FDEC_STRIDE, p_dst[2] );
    avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4], FENC_STRIDE,
                        &p_ref[4 * FDEC_STRIDE + 4], FDEC_STRIDE, p_dst[3] );
}

void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
                            uint8_t *p_src, uint8_t *p_ref )
{
    x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] );
    x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] );
    x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0],
                         &p_ref[8 * FDEC_STRIDE + 0] );
    x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8],
                         &p_ref[8 * FDEC_STRIDE + 8] );
}

void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
                             uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t d0, d1, d2, d3;

    pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
                                     &p_pix2[0], FDEC_STRIDE );
    pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
                                     &p_pix2[4], FDEC_STRIDE );
    pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 0],
                                     FDEC_STRIDE );
    pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 4],
                                     FDEC_STRIDE );
    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
}

void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
                              uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32_t b0, b1, b2, b3, b4, b5, b6, b7;

    a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 b0, b1, b2, b3, b7, b6, b5, b4 );
    BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
                 a0, a1, a2, a3, a7, a6, a5, a4 );
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
                 pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
}

void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level );
}
#endif