/*****************************************************************************
 * quant-c.c: msa quantization and level-run
 *****************************************************************************
 * Copyright (C) 2015-2018 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common/common.h"
#include "macros.h"
#include "quant.h"

#if !HIGH_BIT_DEPTH
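/* Dequantize a 4x4 block of coefficients in place.  Each coefficient is
 * multiplied by the dequant table entry for i_qp%6; the product is shifted
 * left by i_qp/6 - 4 when that shift is non-negative, otherwise it is
 * rounded and shifted right in 32-bit precision. */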
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;
    v8i16 dct0, dct1;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );

    if( q_bits >= 0 )
    {
        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_mf_h0, dequant_mf_h1 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct0 <<= q_bits_vec;
        dct1 <<= q_bits_vec;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}
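
/* Dequantize an 8x8 block of coefficients in place, using the per-position
 * dequant table for i_qp%6 and a shift of i_qp/6 - 6. */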
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );

    if( q_bits >= 0 )
    {
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
                     dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w4 *= dequant_m_f4;
        dct_signed_w5 *= dequant_m_f5;
        dct_signed_w6 *= dequant_m_f6;
        dct_signed_w7 *= dequant_m_f7;
        dct_signed_w8 *= dequant_m_f8;
        dct_signed_w9 *= dequant_m_f9;
        dct_signed_w10 *= dequant_m_f10;
        dct_signed_w11 *= dequant_m_f11;
        dct_signed_w12 *= dequant_m_f12;
        dct_signed_w13 *= dequant_m_f13;
        dct_signed_w14 *= dequant_m_f14;
        dct_signed_w15 *= dequant_m_f15;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;
        dct_signed_w4 += q_bits_vec_add;
        dct_signed_w5 += q_bits_vec_add;
        dct_signed_w6 += q_bits_vec_add;
        dct_signed_w7 += q_bits_vec_add;
        dct_signed_w8 += q_bits_vec_add;
        dct_signed_w9 += q_bits_vec_add;
        dct_signed_w10 += q_bits_vec_add;
        dct_signed_w11 += q_bits_vec_add;
        dct_signed_w12 += q_bits_vec_add;
        dct_signed_w13 += q_bits_vec_add;
        dct_signed_w14 += q_bits_vec_add;
        dct_signed_w15 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
                q_bits_vec );
        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
}
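
/* Dequantize a 4x4 DC block in place: the single dequant factor
 * pi_dequant_mf[i_qp%6][0] is applied to all 16 coefficients with a shift
 * of i_qp/6 - 6. */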
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
    v8i16 dct0, dct1, dequant_mf_h;

    LD_SH2( p_dct, 8, dct0, dct1 );

    if( q_bits >= 0 )
    {
        i_dmf <<= q_bits;

        dequant_mf_h = __msa_fill_h( i_dmf );
        dct0 = dct0 * dequant_mf_h;
        dct1 = dct1 * dequant_mf_h;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );
        dequant_m_f = __msa_fill_w( i_dmf );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f;
        dct_signed_w1 *= dequant_m_f;
        dct_signed_w2 *= dequant_m_f;
        dct_signed_w3 *= dequant_m_f;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}
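
/* Quantize a 4x4 block in place: |coef| is biased, multiplied by the
 * per-position quantization factor and shifted right by 16, then the sign
 * of the original coefficient is restored.  Returns 1 if any quantized
 * coefficient is nonzero. */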
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}
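
/* Quantize an 8x8 block in place, processing the 64 coefficients in two
 * halves of 32.  Returns 1 if any quantized coefficient is nonzero. */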
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct2, dct3;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;

    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
                 dct_h0, dct_h1, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );
    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );

    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );
    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );

    return !!non_zero;
}
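
/* Quantize a 4x4 DC block in place using a single quantization factor and
 * bias for all 16 coefficients.  Returns 1 if any quantized coefficient is
 * nonzero. */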
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
                                     int32_t i_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct0_mask, dct1_mask;
    v8i16 zero = { 0 };
    v8i16 dct_h0, dct_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec, bias_vec;

    LD_SH2( p_dct, 8, dct0, dct1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

    bias_vec = __msa_fill_w( i_bias );
    mf_vec = __msa_fill_w( i_mf );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );

    dct_w0 *= mf_vec;
    dct_w1 *= mf_vec;
    dct_w2 *= mf_vec;
    dct_w3 *= mf_vec;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}
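
/* Return the index of the last nonzero coefficient in a 64-coefficient
 * block: a per-coefficient zero flag is packed into a 64-bit bitmap, and
 * the run of zero coefficients at the end of the block is measured with a
 * leading-ones count. */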
static int32_t avc_coeff_last64_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 vec0, vec1, vec2, vec3;
    v4i32 out0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    tmp_h0 = __msa_ceqi_h( src0, 0 );
    tmp_h1 = __msa_ceqi_h( src1, 0 );
    tmp_h2 = __msa_ceqi_h( src2, 0 );
    tmp_h3 = __msa_ceqi_h( src3, 0 );
    tmp_h4 = __msa_ceqi_h( src4, 0 );
    tmp_h5 = __msa_ceqi_h( src5, 0 );
    tmp_h6 = __msa_ceqi_h( src6, 0 );
    tmp_h7 = __msa_ceqi_h( src7, 0 );

    PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
                 tmp0, tmp1, tmp2, tmp3 );

    tmp0 = tmp0 & mask;
    tmp1 = tmp1 & mask;
    tmp2 = tmp2 & mask;
    tmp3 = tmp3 & mask;

    HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
    PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
    HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
    vec0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
    out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
    u_res = __msa_copy_u_w( out0, 0 );

    return ( 63 - u_res );
}
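
/* Return the index of the last nonzero coefficient in a 16-coefficient
 * block, using the same zero-bitmap and leading-ones-count approach as
 * avc_coeff_last64_msa. */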
static int32_t avc_coeff_last16_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1;
    v8u16 tmp_h0;
    v16u8 tmp0;
    v8i16 out0, out1;
    v16i8 res0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH2( p_src, 8, src0, src1 );

    out0 = __msa_ceqi_h( src0, 0 );
    out1 = __msa_ceqi_h( src1, 0 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
    tmp0 = tmp0 & mask;
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    out0 = __msa_nloc_h( ( v8i16 ) res0 );
    u_res = __msa_copy_u_h( out0, 0 );

    return ( 15 - u_res );
}
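
/* Exported wrappers with the prototypes expected by x264's quantization
 * function tables.  x264_quant_4x4x4_msa additionally quantizes four 4x4
 * blocks and returns a 4-bit nonzero mask, one bit per block. */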
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                           int32_t i_qp )
{
    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                           int32_t i_qp )
{
    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                              int32_t i_qp )
{
    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
}

int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_4x4_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
{
    int32_t i_non_zero, i_non_zero_acc = 0;

    for( int32_t j = 0; j < 4; j++ )
    {
        i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias );
        i_non_zero_acc |= ( !!i_non_zero ) << j;
    }

    return i_non_zero_acc;
}

int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_8x8_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
{
    return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );
}

int32_t x264_coeff_last64_msa( int16_t *p_src )
{
    return avc_coeff_last64_msa( p_src );
}

int32_t x264_coeff_last16_msa( int16_t *p_src )
{
    return avc_coeff_last16_msa( p_src );
}
#endif