/* quant.c */
  1. /*****************************************************************************
  2. * quant.c: ppc quantization
  3. *****************************************************************************
  4. * Copyright (C) 2007-2018 x264 project
  5. *
  6. * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License
  19. * along with this program; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  21. *
  22. * This program is also available under a commercial proprietary license.
  23. * For more information, contact us at licensing@x264.com.
  24. *****************************************************************************/
  25. #include "common/common.h"
  26. #include "ppccommon.h"
  27. #include "quant.h"
  28. #if !HIGH_BIT_DEPTH
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
// For the two 8-coefficient vectors at byte offsets idx0/idx1 of dct:
//   dct[i] = sign(dct[i]) * ( (abs(dct[i]) + bias[i]) * mf[i] >> i_qbits )
// (every caller splats i_qbitsv to 16).  Results are also ORed into `nz`
// so the caller can report whether any coefficient survived quantization.
// All vector temporaries referenced below must be declared by the caller.
#define QUANT_16_U( idx0, idx1 )                                        \
{                                                                       \
    /* load coefficients, multipliers and rounding biases */            \
    temp1v = vec_ld((idx0), dct);                                       \
    temp2v = vec_ld((idx1), dct);                                       \
    mfvA = vec_ld((idx0), mf);                                          \
    mfvB = vec_ld((idx1), mf);                                          \
    biasvA = vec_ld((idx0), bias);                                      \
    biasvB = vec_ld((idx1), bias);                                      \
    /* remember signs, then work on absolute values */                  \
    mskA = vec_cmplt(temp1v, zero_s16v);                                \
    mskB = vec_cmplt(temp2v, zero_s16v);                                \
    coefvA = (vec_u16_t)vec_abs( temp1v );                              \
    coefvB = (vec_u16_t)vec_abs( temp2v );                              \
    coefvA = vec_adds(coefvA, biasvA);                                  \
    coefvB = vec_adds(coefvB, biasvB);                                  \
    /* 16x16 -> 32-bit multiplies of even and odd lanes, then shift */  \
    multEvenvA = vec_mule(coefvA, mfvA);                                \
    multOddvA = vec_mulo(coefvA, mfvA);                                 \
    multEvenvB = vec_mule(coefvB, mfvB);                                \
    multOddvB = vec_mulo(coefvB, mfvB);                                 \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
    /* pack back to 16 bits and re-interleave the even/odd products */  \
    /* into original coefficient order */                               \
    temp1v = (vec_s16_t) vec_packs( multEvenvA, multOddvA );            \
    tmpv = xxpermdi( temp1v, temp1v, 2 );                               \
    temp1v = vec_mergeh( temp1v, tmpv );                                \
    temp2v = (vec_s16_t) vec_packs( multEvenvB, multOddvB );            \
    tmpv = xxpermdi( temp2v, temp2v, 2 );                               \
    temp2v = vec_mergeh( temp2v, tmpv );                                \
    /* restore signs: x ^ mask, +1 where the input was negative */      \
    temp1v = vec_xor(temp1v, mskA);                                     \
    temp2v = vec_xor(temp2v, mskB);                                     \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                      \
    vec_st(temp1v, (idx0), dct);                                        \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                      \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                            \
    vec_st(temp2v, (idx1), dct);                                        \
}
  66. int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
  67. {
  68. LOAD_ZERO;
  69. vector bool short mskA;
  70. vec_u32_t i_qbitsv;
  71. vec_u16_t coefvA;
  72. vec_u32_t multEvenvA, multOddvA;
  73. vec_u16_t mfvA;
  74. vec_u16_t biasvA;
  75. vec_s16_t one = vec_splat_s16(1);
  76. vec_s16_t nz = zero_s16v;
  77. vector bool short mskB;
  78. vec_u16_t coefvB;
  79. vec_u32_t multEvenvB, multOddvB;
  80. vec_u16_t mfvB;
  81. vec_u16_t biasvB;
  82. vec_s16_t temp1v, temp2v, tmpv;
  83. vec_u32_u qbits_u;
  84. qbits_u.s[0]=16;
  85. i_qbitsv = vec_splat(qbits_u.v, 0);
  86. QUANT_16_U( 0, 16 );
  87. return vec_any_ne(nz, zero_s16v);
  88. }
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
// Same scheme as QUANT_16_U, but a single splatted multiplier/bias pair
// (mfv, biasv) is shared by all coefficients, since a DC block uses one
// quantization scale.  Caller declares all the vector temporaries.
#define QUANT_16_U_DC( idx0, idx1 )                                     \
{                                                                       \
    temp1v = vec_ld((idx0), dct);                                       \
    temp2v = vec_ld((idx1), dct);                                       \
    /* sign masks, then absolute values via max(-x, x) */               \
    mskA = vec_cmplt(temp1v, zero_s16v);                                \
    mskB = vec_cmplt(temp2v, zero_s16v);                                \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);    \
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);    \
    coefvA = vec_add(coefvA, biasv);                                    \
    coefvB = vec_add(coefvB, biasv);                                    \
    /* 16x16 -> 32-bit multiplies of even and odd lanes, then shift */  \
    multEvenvA = vec_mule(coefvA, mfv);                                 \
    multOddvA = vec_mulo(coefvA, mfv);                                  \
    multEvenvB = vec_mule(coefvB, mfv);                                 \
    multOddvB = vec_mulo(coefvB, mfv);                                  \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
    /* merge even/odd products back into lane order, pack to s16 */     \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    /* restore signs and accumulate the nonzero flag */                 \
    temp1v = vec_xor(temp1v, mskA);                                     \
    temp2v = vec_xor(temp2v, mskB);                                     \
    temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
    vec_st(temp1v, (idx0), dct);                                        \
    temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                            \
    vec_st(temp2v, (idx1), dct);                                        \
}
  118. int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
  119. {
  120. LOAD_ZERO;
  121. vector bool short mskA;
  122. vec_u32_t i_qbitsv;
  123. vec_u16_t coefvA;
  124. vec_u32_t multEvenvA, multOddvA;
  125. vec_s16_t one = vec_splat_s16(1);
  126. vec_s16_t nz = zero_s16v;
  127. vector bool short mskB;
  128. vec_u16_t coefvB;
  129. vec_u32_t multEvenvB, multOddvB;
  130. vec_s16_t temp1v, temp2v;
  131. vec_u16_t mfv;
  132. vec_u16_t biasv;
  133. mfv = vec_splats( (uint16_t)mf );
  134. i_qbitsv = vec_splats( (uint32_t) 16 );
  135. biasv = vec_splats( (uint16_t)bias );
  136. QUANT_16_U_DC( 0, 16 );
  137. return vec_any_ne(nz, zero_s16v);
  138. }
// DC quant of a whole 2x2 block
// Only the low four 16-bit lanes of the loaded vector belong to the 2x2
// block; `sel` keeps the remaining lanes untouched when storing back.
// Note: nz accumulates the whole blended vector; the caller masks the
// unused lanes off again before testing (see x264_quant_2x2_dc_altivec).
#define QUANT_4_U_DC( idx0 )                                            \
{                                                                       \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);          \
    temp1v = vec_ld((idx0), dct);                                       \
    /* sign mask, absolute value via max(-x, x), add bias */            \
    mskA = vec_cmplt(temp1v, zero_s16v);                                \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);    \
    coefvA = vec_add(coefvA, biasv);                                    \
    /* multiply, shift, and repack in lane order */                     \
    multEvenvA = vec_mule(coefvA, mfv);                                 \
    multOddvA = vec_mulo(coefvA, mfv);                                  \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    /* restore signs */                                                 \
    temp2v = vec_xor(temp2v, mskA);                                     \
    temp2v = vec_add(temp2v, vec_and(mskA, one));                       \
    /* blend: quantized low half, original high half */                 \
    temp1v = vec_sel(temp1v, temp2v, sel);                              \
    nz = vec_or(nz, temp1v);                                            \
    vec_st(temp1v, (idx0), dct);                                        \
}
  158. int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
  159. {
  160. LOAD_ZERO;
  161. vector bool short mskA;
  162. vec_u32_t i_qbitsv;
  163. vec_u16_t coefvA;
  164. vec_u32_t multEvenvA, multOddvA;
  165. vec_s16_t one = vec_splat_s16(1);
  166. vec_s16_t nz = zero_s16v;
  167. vec_s16_t temp1v, temp2v;
  168. vec_u16_t mfv;
  169. vec_u16_t biasv;
  170. mfv = vec_splats( (uint16_t)mf );
  171. i_qbitsv = vec_splats( (uint32_t) 16 );
  172. biasv = vec_splats( (uint16_t)bias );
  173. static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);
  174. QUANT_4_U_DC(0);
  175. return vec_any_ne(vec_and(nz, mask2), zero_s16v);
  176. }
  177. int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
  178. {
  179. LOAD_ZERO;
  180. vector bool short mskA;
  181. vec_u32_t i_qbitsv;
  182. vec_u16_t coefvA;
  183. vec_u32_t multEvenvA, multOddvA;
  184. vec_u16_t mfvA;
  185. vec_u16_t biasvA;
  186. vec_s16_t one = vec_splat_s16(1);
  187. vec_s16_t nz = zero_s16v;
  188. vector bool short mskB;
  189. vec_u16_t coefvB;
  190. vec_u32_t multEvenvB, multOddvB;
  191. vec_u16_t mfvB;
  192. vec_u16_t biasvB;
  193. vec_s16_t temp1v, temp2v, tmpv;
  194. vec_u32_u qbits_u;
  195. qbits_u.s[0]=16;
  196. i_qbitsv = vec_splat(qbits_u.v, 0);
  197. for( int i = 0; i < 4; i++ )
  198. QUANT_16_U( i*2*16, i*2*16+16 );
  199. return vec_any_ne(nz, zero_s16v);
  200. }
  201. #define DEQUANT_SHL() \
  202. { \
  203. dctv = vec_ld(8*y, dct); \
  204. mf1v = vec_ld(16*y, dequant_mf[i_mf]); \
  205. mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \
  206. mfv = vec_packs(mf1v, mf2v); \
  207. \
  208. multEvenvA = vec_mule(dctv, mfv); \
  209. multOddvA = vec_mulo(dctv, mfv); \
  210. dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), \
  211. vec_mergel(multEvenvA, multOddvA)); \
  212. dctv = (vec_s16_t) vec_packs( multEvenvA, multOddvA ); \
  213. tmpv = xxpermdi( dctv, dctv, 2 ); \
  214. dctv = vec_mergeh( dctv, tmpv ); \
  215. dctv = vec_sl(dctv, i_qbitsv); \
  216. vec_st(dctv, 8*y, dct); \
  217. }
// vec_mule/vec_mulo select "even"/"odd" elements by big-endian lane
// numbering; on little-endian targets the roles are swapped.  Alias the
// names so DEQUANT_SHR below stays endian-agnostic.
#ifdef WORDS_BIGENDIAN
#define VEC_MULE vec_mule
#define VEC_MULO vec_mulo
#else
#define VEC_MULE vec_mulo
#define VEC_MULO vec_mule
#endif
// Dequant, rounded right-shift path: used when i_qbits < 0.
//   dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits)
// Each 16-bit coefficient is duplicated (mergeh/mergel) so it lines up
// with both 16-bit halves of the 32-bit mf entry; the full 32-bit
// product is then rebuilt as (coef*hi << 16) + coef*lo via the
// endian-corrected VEC_MULE/VEC_MULO aliases defined above.
#define DEQUANT_SHR()                                                   \
{                                                                       \
    dctv = vec_ld(8*y, dct);                                            \
    /* duplicate each coefficient to pair with a 32-bit mf entry */     \
    dct1v = vec_mergeh(dctv, dctv);                                     \
    dct2v = vec_mergel(dctv, dctv);                                     \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                              \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                           \
                                                                        \
    /* low half: multiply, recombine, round, arithmetic shift right */  \
    multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v);                      \
    multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v);                       \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);          \
    temp1v = vec_add(temp1v, fv);                                       \
    temp1v = vec_sra(temp1v, i_qbitsv);                                 \
                                                                        \
    /* high half: same sequence */                                      \
    multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v);                      \
    multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v);                       \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);          \
    temp2v = vec_add(temp2v, fv);                                       \
    temp2v = vec_sra(temp2v, i_qbitsv);                                 \
                                                                        \
    /* saturating pack of the two 32-bit halves back to s16 */          \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);                        \
    vec_st(dctv, y*8, dct);                                             \
}
  248. void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
  249. {
  250. int i_mf = i_qp%6;
  251. int i_qbits = i_qp/6 - 4;
  252. vec_s16_t dctv, tmpv;
  253. vec_s16_t dct1v, dct2v;
  254. vec_s32_t mf1v, mf2v;
  255. vec_s16_t mfv;
  256. vec_s32_t multEvenvA, multOddvA;
  257. vec_s32_t temp1v, temp2v;
  258. if( i_qbits >= 0 )
  259. {
  260. vec_u16_t i_qbitsv;
  261. i_qbitsv = vec_splats( (uint16_t) i_qbits );
  262. for( int y = 0; y < 4; y+=2 )
  263. DEQUANT_SHL();
  264. }
  265. else
  266. {
  267. const int f = 1 << (-i_qbits-1);
  268. vec_s32_t fv;
  269. fv = vec_splats( f );
  270. vec_u32_t i_qbitsv;
  271. i_qbitsv = vec_splats( (uint32_t)-i_qbits );
  272. vec_u32_t sixteenv;
  273. sixteenv = vec_splats( (uint32_t)16 );
  274. for( int y = 0; y < 4; y+=2 )
  275. DEQUANT_SHR();
  276. }
  277. }
  278. void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
  279. {
  280. int i_mf = i_qp%6;
  281. int i_qbits = i_qp/6 - 6;
  282. vec_s16_t dctv, tmpv;
  283. vec_s16_t dct1v, dct2v;
  284. vec_s32_t mf1v, mf2v;
  285. vec_s16_t mfv;
  286. vec_s32_t multEvenvA, multOddvA;
  287. vec_s32_t temp1v, temp2v;
  288. if( i_qbits >= 0 )
  289. {
  290. vec_u16_t i_qbitsv;
  291. i_qbitsv = vec_splats((uint16_t)i_qbits );
  292. for( int y = 0; y < 16; y+=2 )
  293. DEQUANT_SHL();
  294. }
  295. else
  296. {
  297. const int f = 1 << (-i_qbits-1);
  298. vec_s32_t fv;
  299. fv = vec_splats( f );
  300. vec_u32_t i_qbitsv;
  301. i_qbitsv = vec_splats( (uint32_t)-i_qbits );
  302. vec_u32_t sixteenv;
  303. sixteenv = vec_splats( (uint32_t)16 );
  304. for( int y = 0; y < 16; y+=2 )
  305. DEQUANT_SHR();
  306. }
  307. }
  308. #endif // !HIGH_BIT_DEPTH