quant.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818
  1. /*****************************************************************************
  2. * quant.c: quantization and level-run
  3. *****************************************************************************
  4. * Copyright (C) 2005-2018 x264 project
  5. *
  6. * Authors: Loren Merritt <lorenm@u.washington.edu>
  7. * Fiona Glaser <fiona@x264.com>
  8. * Christian Heine <sennindemokrit@gmx.net>
  9. * Henrik Gramner <henrik@gramner.com>
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation; either version 2 of the License, or
  14. * (at your option) any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  24. *
  25. * This program is also available under a commercial proprietary license.
  26. * For more information, contact us at licensing@x264.com.
  27. *****************************************************************************/
  28. #include "common.h"
  29. #if HAVE_MMX
  30. #include "x86/quant.h"
  31. #endif
  32. #if ARCH_PPC
  33. # include "ppc/quant.h"
  34. #endif
  35. #if ARCH_ARM
  36. # include "arm/quant.h"
  37. #endif
  38. #if ARCH_AARCH64
  39. # include "aarch64/quant.h"
  40. #endif
  41. #if ARCH_MIPS
  42. # include "mips/quant.h"
  43. #endif
  /* Quantize one coefficient in place.
   *
   * The bias is added to the coefficient's magnitude, the sum is scaled by the
   * multiplier, the low 16 bits are dropped and the sign is restored.  The
   * result is also OR-ed into a variable named `nz`, which must be declared at
   * the expansion site, so the caller can tell whether anything is nonzero.
   *
   *   coef  modifiable lvalue holding the coefficient (evaluated several times,
   *         so it must not have side effects)
   *   mf    quantization multiplier
   *   f     rounding bias
   *
   * Every argument is parenthesized and the body is wrapped in do/while(0) so
   * the macro behaves as one statement (safe in an unbraced if/else). */
  #define QUANT_ONE( coef, mf, f ) \
  do \
  { \
      if( (coef) > 0 ) \
          (coef) = ((f) + (coef)) * (mf) >> 16; \
      else \
          (coef) = - (((f) - (coef)) * (mf) >> 16); \
      nz |= (coef); \
  } while( 0 )
  52. static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
  53. {
  54. int nz = 0;
  55. for( int i = 0; i < 64; i++ )
  56. QUANT_ONE( dct[i], mf[i], bias[i] );
  57. return !!nz;
  58. }
  59. static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
  60. {
  61. int nz = 0;
  62. for( int i = 0; i < 16; i++ )
  63. QUANT_ONE( dct[i], mf[i], bias[i] );
  64. return !!nz;
  65. }
  66. static int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] )
  67. {
  68. int nza = 0;
  69. for( int j = 0; j < 4; j++ )
  70. {
  71. int nz = 0;
  72. for( int i = 0; i < 16; i++ )
  73. QUANT_ONE( dct[j][i], mf[i], bias[i] );
  74. nza |= (!!nz)<<j;
  75. }
  76. return nza;
  77. }
  78. static int quant_4x4_dc( dctcoef dct[16], int mf, int bias )
  79. {
  80. int nz = 0;
  81. for( int i = 0; i < 16; i++ )
  82. QUANT_ONE( dct[i], mf, bias );
  83. return !!nz;
  84. }
  85. static int quant_2x2_dc( dctcoef dct[4], int mf, int bias )
  86. {
  87. int nz = 0;
  88. QUANT_ONE( dct[0], mf, bias );
  89. QUANT_ONE( dct[1], mf, bias );
  90. QUANT_ONE( dct[2], mf, bias );
  91. QUANT_ONE( dct[3], mf, bias );
  92. return !!nz;
  93. }
  /* Dequantization helpers for a single coefficient index x.
   * Both expand in a scope that provides dct, dequant_mf, i_mf and i_qbits;
   * DEQUANT_SHR additionally needs the rounding term f. */

  /* Scale up by 2^i_qbits (i_qbits >= 0).  The scaling is written as a
   * multiplication because left-shifting a negative product is undefined
   * behaviour in C. */
  #define DEQUANT_SHL( x ) \
      dct[(x)] = ( dct[(x)] * dequant_mf[i_mf][(x)] ) * ( 1 << i_qbits )

  /* Scale down by 2^(-i_qbits) (i_qbits < 0) after adding the rounding term. */
  #define DEQUANT_SHR( x ) \
      dct[(x)] = ( dct[(x)] * dequant_mf[i_mf][(x)] + f ) >> (-i_qbits)
  98. static void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
  99. {
  100. const int i_mf = i_qp%6;
  101. const int i_qbits = i_qp/6 - 4;
  102. if( i_qbits >= 0 )
  103. {
  104. for( int i = 0; i < 16; i++ )
  105. DEQUANT_SHL( i );
  106. }
  107. else
  108. {
  109. const int f = 1 << (-i_qbits-1);
  110. for( int i = 0; i < 16; i++ )
  111. DEQUANT_SHR( i );
  112. }
  113. }
  114. static void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp )
  115. {
  116. const int i_mf = i_qp%6;
  117. const int i_qbits = i_qp/6 - 6;
  118. if( i_qbits >= 0 )
  119. {
  120. for( int i = 0; i < 64; i++ )
  121. DEQUANT_SHL( i );
  122. }
  123. else
  124. {
  125. const int f = 1 << (-i_qbits-1);
  126. for( int i = 0; i < 64; i++ )
  127. DEQUANT_SHR( i );
  128. }
  129. }
  130. static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp )
  131. {
  132. const int i_qbits = i_qp/6 - 6;
  133. if( i_qbits >= 0 )
  134. {
  135. const int i_dmf = dequant_mf[i_qp%6][0] << i_qbits;
  136. for( int i = 0; i < 16; i++ )
  137. dct[i] *= i_dmf;
  138. }
  139. else
  140. {
  141. const int i_dmf = dequant_mf[i_qp%6][0];
  142. const int f = 1 << (-i_qbits-1);
  143. for( int i = 0; i < 16; i++ )
  144. dct[i] = ( dct[i] * i_dmf + f ) >> (-i_qbits);
  145. }
  146. }
  /* Shared prologue for the 2x4 helpers: reads dct[0..7] from the enclosing
   * scope and declares two butterfly stages.
   *   a0..a3 = sums of adjacent pairs,        a4..a7 = their differences
   *   b0..b3 = sums of adjacent a-pairs,      b4..b7 = their differences
   * All inputs are read before the caller writes any output, so the output
   * array may be dct itself. */
  #define IDCT_DEQUANT_2X4_START \
      int a0 = dct[0] + dct[1], a4 = dct[0] - dct[1]; \
      int a1 = dct[2] + dct[3], a5 = dct[2] - dct[3]; \
      int a2 = dct[4] + dct[5], a6 = dct[4] - dct[5]; \
      int a3 = dct[6] + dct[7], a7 = dct[6] - dct[7]; \
      int b0 = a0 + a1, b4 = a0 - a1; \
      int b1 = a2 + a3, b5 = a2 - a3; \
      int b2 = a4 + a5, b6 = a4 - a5; \
      int b3 = a6 + a7, b7 = a6 - a7;
  164. static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
  165. {
  166. IDCT_DEQUANT_2X4_START
  167. int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  168. dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
  169. dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
  170. dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
  171. dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
  172. dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
  173. dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
  174. dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
  175. dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
  176. }
  177. static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
  178. {
  179. IDCT_DEQUANT_2X4_START
  180. int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  181. dct[0] = ((b0 + b1) * dmf + 32) >> 6;
  182. dct[1] = ((b2 + b3) * dmf + 32) >> 6;
  183. dct[2] = ((b0 - b1) * dmf + 32) >> 6;
  184. dct[3] = ((b2 - b3) * dmf + 32) >> 6;
  185. dct[4] = ((b4 - b5) * dmf + 32) >> 6;
  186. dct[5] = ((b6 - b7) * dmf + 32) >> 6;
  187. dct[6] = ((b4 + b5) * dmf + 32) >> 6;
  188. dct[7] = ((b6 + b7) * dmf + 32) >> 6;
  189. }
  190. static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
  191. {
  192. IDCT_DEQUANT_2X4_START
  193. out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
  194. out[1] = ((b2 + b3) * dmf + 2080) >> 6;
  195. out[2] = ((b0 - b1) * dmf + 2080) >> 6;
  196. out[3] = ((b2 - b3) * dmf + 2080) >> 6;
  197. out[4] = ((b4 - b5) * dmf + 2080) >> 6;
  198. out[5] = ((b6 - b7) * dmf + 2080) >> 6;
  199. out[6] = ((b4 + b5) * dmf + 2080) >> 6;
  200. out[7] = ((b6 + b7) * dmf + 2080) >> 6;
  201. }
  202. #undef IDCT_DEQUANT_2X4_START
  203. static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
  204. {
  205. int d0 = dct[0] + dct[1];
  206. int d1 = dct[2] + dct[3];
  207. int d2 = dct[0] - dct[1];
  208. int d3 = dct[2] - dct[3];
  209. out[0] = ((d0 + d1) * dmf >> 5) + 32;
  210. out[1] = ((d0 - d1) * dmf >> 5) + 32;
  211. out[2] = ((d2 + d3) * dmf >> 5) + 32;
  212. out[3] = ((d2 - d3) * dmf >> 5) + 32;
  213. }
  214. static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
  215. {
  216. dctcoef out[8];
  217. if( chroma422 )
  218. optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
  219. else
  220. optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
  221. int sum = 0;
  222. for( int i = 0; i < (chroma422?8:4); i++ )
  223. sum |= ref[i] ^ out[i];
  224. return sum >> 6;
  225. }
  226. static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
  227. {
  228. /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
  229. dctcoef dct_orig[8];
  230. int coeff, nz;
  231. if( chroma422 )
  232. optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
  233. else
  234. optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
  235. /* If the DC coefficients already round to zero, terminate early. */
  236. int sum = 0;
  237. for( int i = 0; i < (chroma422?8:4); i++ )
  238. sum |= dct_orig[i];
  239. if( !(sum >> 6) )
  240. return 0;
  241. /* Start with the highest frequency coefficient... is this the best option? */
  242. for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
  243. {
  244. int level = dct[coeff];
  245. int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
  246. while( level )
  247. {
  248. dct[coeff] = level - sign;
  249. if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
  250. {
  251. nz = 1;
  252. dct[coeff] = level;
  253. break;
  254. }
  255. level -= sign;
  256. }
  257. }
  258. return nz;
  259. }
  260. static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
  261. {
  262. return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
  263. }
  264. static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
  265. {
  266. return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
  267. }
  268. static void denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
  269. {
  270. for( int i = 0; i < size; i++ )
  271. {
  272. int level = dct[i];
  273. int sign = level>>31;
  274. level = (level+sign)^sign;
  275. sum[i] += level;
  276. level -= offset[i];
  277. dct[i] = level<0 ? 0 : (level^sign)-sign;
  278. }
  279. }
  280. /* (ref: JVT-B118)
  281. * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs
  282. * to 0 (low score means set it to null)
  283. * Used in inter macroblock (luma and chroma)
  284. * luma: for a 8x8 block: if score < 4 -> null
  285. * for the complete mb: if score < 6 -> null
  286. * chroma: for the complete mb: if score < 7 -> null
  287. */
  288. static ALWAYS_INLINE int decimate_score_internal( dctcoef *dct, int i_max )
  289. {
  290. const uint8_t *ds_table = (i_max == 64) ? x264_decimate_table8 : x264_decimate_table4;
  291. int i_score = 0;
  292. int idx = i_max - 1;
  293. while( idx >= 0 && dct[idx] == 0 )
  294. idx--;
  295. while( idx >= 0 )
  296. {
  297. int i_run;
  298. if( (unsigned)(dct[idx--] + 1) > 2 )
  299. return 9;
  300. i_run = 0;
  301. while( idx >= 0 && dct[idx] == 0 )
  302. {
  303. idx--;
  304. i_run++;
  305. }
  306. i_score += ds_table[i_run];
  307. }
  308. return i_score;
  309. }
  310. static int decimate_score15( dctcoef *dct )
  311. {
  312. return decimate_score_internal( dct+1, 15 );
  313. }
  314. static int decimate_score16( dctcoef *dct )
  315. {
  316. return decimate_score_internal( dct, 16 );
  317. }
  318. static int decimate_score64( dctcoef *dct )
  319. {
  320. return decimate_score_internal( dct, 64 );
  321. }
  322. #define last(num)\
  323. static int coeff_last##num( dctcoef *l )\
  324. {\
  325. int i_last = num-1;\
  326. while( i_last >= 0 && l[i_last] == 0 )\
  327. i_last--;\
  328. return i_last;\
  329. }
  330. last(4)
  331. last(8)
  332. last(15)
  333. last(16)
  334. last(64)
  335. #define level_run(num)\
  336. static int coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
  337. {\
  338. int i_last = runlevel->last = coeff_last##num(dct);\
  339. int i_total = 0;\
  340. int mask = 0;\
  341. do\
  342. {\
  343. runlevel->level[i_total++] = dct[i_last];\
  344. mask |= 1 << (i_last);\
  345. while( --i_last >= 0 && dct[i_last] == 0 );\
  346. } while( i_last >= 0 );\
  347. runlevel->mask = mask;\
  348. return i_total;\
  349. }
  350. level_run(4)
  351. level_run(8)
  352. level_run(15)
  353. level_run(16)
  /* INIT_TRELLIS(cpu): on x86-64 builds, point the six trellis entries of the
   * function table `pf` (taken from the expansion scope) at the
   * x264_trellis_cabac_*_<cpu> routines; on every other build it does nothing.
   * Both variants are wrapped in do/while(0) so that `INIT_TRELLIS( x );` is
   * exactly one statement wherever it is used. */
  #if ARCH_X86_64
  #define INIT_TRELLIS(cpu)\
  do\
  {\
      pf->trellis_cabac_4x4 = x264_trellis_cabac_4x4_##cpu;\
      pf->trellis_cabac_8x8 = x264_trellis_cabac_8x8_##cpu;\
      pf->trellis_cabac_4x4_psy = x264_trellis_cabac_4x4_psy_##cpu;\
      pf->trellis_cabac_8x8_psy = x264_trellis_cabac_8x8_psy_##cpu;\
      pf->trellis_cabac_dc = x264_trellis_cabac_dc_##cpu;\
      pf->trellis_cabac_chroma_422_dc = x264_trellis_cabac_chroma_422_dc_##cpu;\
  } while( 0 )
  #else
  #define INIT_TRELLIS(...) do {} while( 0 )
  #endif
  365. void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
  366. {
  367. pf->quant_8x8 = quant_8x8;
  368. pf->quant_4x4 = quant_4x4;
  369. pf->quant_4x4x4 = quant_4x4x4;
  370. pf->quant_4x4_dc = quant_4x4_dc;
  371. pf->quant_2x2_dc = quant_2x2_dc;
  372. pf->dequant_4x4 = dequant_4x4;
  373. pf->dequant_4x4_dc = dequant_4x4_dc;
  374. pf->dequant_8x8 = dequant_8x8;
  375. pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
  376. pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;
  377. pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
  378. pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;
  379. pf->denoise_dct = denoise_dct;
  380. pf->decimate_score15 = decimate_score15;
  381. pf->decimate_score16 = decimate_score16;
  382. pf->decimate_score64 = decimate_score64;
  383. pf->coeff_last4 = coeff_last4;
  384. pf->coeff_last8 = coeff_last8;
  385. pf->coeff_last[ DCT_LUMA_AC] = coeff_last15;
  386. pf->coeff_last[ DCT_LUMA_4x4] = coeff_last16;
  387. pf->coeff_last[ DCT_LUMA_8x8] = coeff_last64;
  388. pf->coeff_level_run4 = coeff_level_run4;
  389. pf->coeff_level_run8 = coeff_level_run8;
  390. pf->coeff_level_run[ DCT_LUMA_AC] = coeff_level_run15;
  391. pf->coeff_level_run[ DCT_LUMA_4x4] = coeff_level_run16;
  392. #if HIGH_BIT_DEPTH
  393. #if HAVE_MMX
  394. INIT_TRELLIS( sse2 );
  395. if( cpu&X264_CPU_MMX2 )
  396. {
  397. #if ARCH_X86
  398. pf->denoise_dct = x264_denoise_dct_mmx;
  399. pf->coeff_last8 = x264_coeff_last8_mmx2;
  400. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
  401. pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
  402. pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
  403. pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
  404. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
  405. pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
  406. #endif
  407. pf->coeff_last4 = x264_coeff_last4_mmx2;
  408. pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
  409. }
  410. if( cpu&X264_CPU_SSE2 )
  411. {
  412. pf->quant_4x4 = x264_quant_4x4_sse2;
  413. pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
  414. pf->quant_8x8 = x264_quant_8x8_sse2;
  415. pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
  416. pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
  417. pf->dequant_4x4 = x264_dequant_4x4_sse2;
  418. pf->dequant_8x8 = x264_dequant_8x8_sse2;
  419. pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
  420. pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
  421. pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
  422. pf->denoise_dct = x264_denoise_dct_sse2;
  423. pf->decimate_score15 = x264_decimate_score15_sse2;
  424. pf->decimate_score16 = x264_decimate_score16_sse2;
  425. pf->decimate_score64 = x264_decimate_score64_sse2;
  426. pf->coeff_last8 = x264_coeff_last8_sse2;
  427. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
  428. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
  429. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
  430. pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
  431. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
  432. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
  433. }
  434. if( cpu&X264_CPU_LZCNT )
  435. {
  436. pf->coeff_last4 = x264_coeff_last4_lzcnt;
  437. pf->coeff_last8 = x264_coeff_last8_lzcnt;
  438. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
  439. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
  440. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
  441. pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
  442. pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
  443. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
  444. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
  445. }
  446. if( cpu&X264_CPU_SSSE3 )
  447. {
  448. pf->quant_4x4 = x264_quant_4x4_ssse3;
  449. pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
  450. pf->quant_8x8 = x264_quant_8x8_ssse3;
  451. pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
  452. pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
  453. pf->denoise_dct = x264_denoise_dct_ssse3;
  454. pf->decimate_score15 = x264_decimate_score15_ssse3;
  455. pf->decimate_score16 = x264_decimate_score16_ssse3;
  456. pf->decimate_score64 = x264_decimate_score64_ssse3;
  457. INIT_TRELLIS( ssse3 );
  458. }
  459. if( cpu&X264_CPU_SSE4 )
  460. {
  461. pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
  462. pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
  463. pf->quant_4x4 = x264_quant_4x4_sse4;
  464. pf->quant_4x4x4 = x264_quant_4x4x4_sse4;
  465. pf->quant_8x8 = x264_quant_8x8_sse4;
  466. }
  467. if( cpu&X264_CPU_AVX )
  468. {
  469. pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
  470. pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
  471. pf->denoise_dct = x264_denoise_dct_avx;
  472. }
  473. if( cpu&X264_CPU_XOP )
  474. {
  475. pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
  476. if( h->param.i_cqm_preset != X264_CQM_FLAT )
  477. {
  478. pf->dequant_4x4 = x264_dequant_4x4_xop;
  479. pf->dequant_8x8 = x264_dequant_8x8_xop;
  480. }
  481. }
  482. if( cpu&X264_CPU_AVX2 )
  483. {
  484. pf->quant_4x4 = x264_quant_4x4_avx2;
  485. pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
  486. pf->quant_8x8 = x264_quant_8x8_avx2;
  487. pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
  488. pf->dequant_4x4 = x264_dequant_4x4_avx2;
  489. pf->dequant_8x8 = x264_dequant_8x8_avx2;
  490. pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
  491. pf->denoise_dct = x264_denoise_dct_avx2;
  492. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
  493. }
  494. if( cpu&X264_CPU_AVX512 )
  495. {
  496. pf->dequant_4x4 = x264_dequant_4x4_avx512;
  497. pf->dequant_8x8 = x264_dequant_8x8_avx512;
  498. pf->decimate_score15 = x264_decimate_score15_avx512;
  499. pf->decimate_score16 = x264_decimate_score16_avx512;
  500. pf->decimate_score64 = x264_decimate_score64_avx512;
  501. pf->coeff_last4 = x264_coeff_last4_avx512;
  502. pf->coeff_last8 = x264_coeff_last8_avx512;
  503. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
  504. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
  505. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
  506. }
  507. #endif // HAVE_MMX
  508. #else // !HIGH_BIT_DEPTH
  509. #if HAVE_MMX
  510. INIT_TRELLIS( sse2 );
  511. if( cpu&X264_CPU_MMX )
  512. {
  513. #if ARCH_X86
  514. pf->dequant_4x4 = x264_dequant_4x4_mmx;
  515. pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
  516. pf->dequant_8x8 = x264_dequant_8x8_mmx;
  517. if( h->param.i_cqm_preset == X264_CQM_FLAT )
  518. {
  519. pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
  520. pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
  521. }
  522. pf->denoise_dct = x264_denoise_dct_mmx;
  523. #endif
  524. }
  525. if( cpu&X264_CPU_MMX2 )
  526. {
  527. pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
  528. #if ARCH_X86
  529. pf->quant_4x4 = x264_quant_4x4_mmx2;
  530. pf->quant_8x8 = x264_quant_8x8_mmx2;
  531. pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
  532. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
  533. pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
  534. pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
  535. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
  536. pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
  537. #endif
  538. pf->coeff_last4 = x264_coeff_last4_mmx2;
  539. pf->coeff_last8 = x264_coeff_last8_mmx2;
  540. pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
  541. pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
  542. }
  543. if( cpu&X264_CPU_SSE2 )
  544. {
  545. pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
  546. pf->quant_4x4 = x264_quant_4x4_sse2;
  547. pf->quant_4x4x4 = x264_quant_4x4x4_sse2;
  548. pf->quant_8x8 = x264_quant_8x8_sse2;
  549. pf->dequant_4x4 = x264_dequant_4x4_sse2;
  550. pf->dequant_4x4_dc = x264_dequant_4x4dc_sse2;
  551. pf->dequant_8x8 = x264_dequant_8x8_sse2;
  552. if( h->param.i_cqm_preset == X264_CQM_FLAT )
  553. {
  554. pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
  555. pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
  556. }
  557. pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_sse2;
  558. pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_sse2;
  559. pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
  560. pf->denoise_dct = x264_denoise_dct_sse2;
  561. pf->decimate_score15 = x264_decimate_score15_sse2;
  562. pf->decimate_score16 = x264_decimate_score16_sse2;
  563. pf->decimate_score64 = x264_decimate_score64_sse2;
  564. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
  565. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
  566. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
  567. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
  568. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
  569. }
  570. if( cpu&X264_CPU_LZCNT )
  571. {
  572. pf->coeff_last4 = x264_coeff_last4_lzcnt;
  573. pf->coeff_last8 = x264_coeff_last8_lzcnt;
  574. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
  575. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
  576. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
  577. pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
  578. pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
  579. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
  580. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
  581. }
  582. if( cpu&X264_CPU_SSSE3 )
  583. {
  584. pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
  585. pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
  586. pf->quant_4x4 = x264_quant_4x4_ssse3;
  587. pf->quant_4x4x4 = x264_quant_4x4x4_ssse3;
  588. pf->quant_8x8 = x264_quant_8x8_ssse3;
  589. pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
  590. pf->denoise_dct = x264_denoise_dct_ssse3;
  591. pf->decimate_score15 = x264_decimate_score15_ssse3;
  592. pf->decimate_score16 = x264_decimate_score16_ssse3;
  593. pf->decimate_score64 = x264_decimate_score64_ssse3;
  594. INIT_TRELLIS( ssse3 );
  595. #if ARCH_X86 || !defined( __MACH__ )
  596. pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
  597. pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
  598. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
  599. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
  600. if( cpu&X264_CPU_LZCNT )
  601. {
  602. pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
  603. pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
  604. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
  605. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
  606. }
  607. #endif
  608. }
  609. if( cpu&X264_CPU_SSE4 )
  610. {
  611. pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
  612. pf->quant_4x4 = x264_quant_4x4_sse4;
  613. pf->quant_8x8 = x264_quant_8x8_sse4;
  614. pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
  615. }
  616. if( cpu&X264_CPU_AVX )
  617. {
  618. pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
  619. if( h->param.i_cqm_preset != X264_CQM_FLAT )
  620. {
  621. pf->dequant_4x4 = x264_dequant_4x4_avx;
  622. pf->dequant_8x8 = x264_dequant_8x8_avx;
  623. }
  624. pf->idct_dequant_2x4_dc = x264_idct_dequant_2x4_dc_avx;
  625. pf->idct_dequant_2x4_dconly = x264_idct_dequant_2x4_dconly_avx;
  626. pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
  627. pf->denoise_dct = x264_denoise_dct_avx;
  628. }
  629. if( cpu&X264_CPU_XOP )
  630. {
  631. if( h->param.i_cqm_preset != X264_CQM_FLAT )
  632. {
  633. pf->dequant_4x4 = x264_dequant_4x4_xop;
  634. pf->dequant_8x8 = x264_dequant_8x8_xop;
  635. }
  636. }
  637. if( cpu&X264_CPU_AVX2 )
  638. {
  639. pf->quant_4x4 = x264_quant_4x4_avx2;
  640. pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
  641. pf->quant_8x8 = x264_quant_8x8_avx2;
  642. pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
  643. pf->dequant_4x4 = x264_dequant_4x4_avx2;
  644. pf->dequant_8x8 = x264_dequant_8x8_avx2;
  645. pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
  646. if( h->param.i_cqm_preset == X264_CQM_FLAT )
  647. {
  648. pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
  649. pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2;
  650. }
  651. pf->decimate_score64 = x264_decimate_score64_avx2;
  652. pf->denoise_dct = x264_denoise_dct_avx2;
  653. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
  654. #if ARCH_X86 || !defined( __MACH__ )
  655. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
  656. pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
  657. #endif
  658. }
  659. if( cpu&X264_CPU_AVX512 )
  660. {
  661. if( h->param.i_cqm_preset == X264_CQM_FLAT )
  662. pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512;
  663. else
  664. {
  665. pf->dequant_4x4 = x264_dequant_4x4_avx512;
  666. pf->dequant_8x8 = x264_dequant_8x8_avx512;
  667. }
  668. pf->decimate_score15 = x264_decimate_score15_avx512;
  669. pf->decimate_score16 = x264_decimate_score16_avx512;
  670. pf->decimate_score64 = x264_decimate_score64_avx512;
  671. pf->coeff_last8 = x264_coeff_last8_avx512;
  672. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
  673. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
  674. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
  675. }
  676. #endif // HAVE_MMX
  677. #if HAVE_ALTIVEC
  678. if( cpu&X264_CPU_ALTIVEC )
  679. {
  680. pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
  681. pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
  682. pf->quant_4x4 = x264_quant_4x4_altivec;
  683. pf->quant_8x8 = x264_quant_8x8_altivec;
  684. pf->dequant_4x4 = x264_dequant_4x4_altivec;
  685. pf->dequant_8x8 = x264_dequant_8x8_altivec;
  686. }
  687. #endif
  688. #if HAVE_ARMV6
  689. if( cpu&X264_CPU_ARMV6 )
  690. {
  691. pf->coeff_last4 = x264_coeff_last4_arm;
  692. pf->coeff_last8 = x264_coeff_last8_arm;
  693. }
  694. #endif
  695. #if HAVE_ARMV6 || ARCH_AARCH64
  696. if( cpu&X264_CPU_NEON )
  697. {
  698. pf->quant_2x2_dc = x264_quant_2x2_dc_neon;
  699. pf->quant_4x4 = x264_quant_4x4_neon;
  700. pf->quant_4x4_dc = x264_quant_4x4_dc_neon;
  701. pf->quant_4x4x4 = x264_quant_4x4x4_neon;
  702. pf->quant_8x8 = x264_quant_8x8_neon;
  703. pf->dequant_4x4 = x264_dequant_4x4_neon;
  704. pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
  705. pf->dequant_8x8 = x264_dequant_8x8_neon;
  706. pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
  707. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
  708. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
  709. pf->denoise_dct = x264_denoise_dct_neon;
  710. pf->decimate_score15 = x264_decimate_score15_neon;
  711. pf->decimate_score16 = x264_decimate_score16_neon;
  712. pf->decimate_score64 = x264_decimate_score64_neon;
  713. }
  714. #endif
  715. #if ARCH_AARCH64
  716. if( cpu&X264_CPU_ARMV8 )
  717. {
  718. pf->coeff_last4 = x264_coeff_last4_aarch64;
  719. pf->coeff_last8 = x264_coeff_last8_aarch64;
  720. pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
  721. }
  722. if( cpu&X264_CPU_NEON )
  723. {
  724. pf->coeff_level_run8 = x264_coeff_level_run8_neon;
  725. pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
  726. pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
  727. }
  728. #endif
  729. #if HAVE_MSA
  730. if( cpu&X264_CPU_MSA )
  731. {
  732. pf->quant_4x4 = x264_quant_4x4_msa;
  733. pf->quant_4x4_dc = x264_quant_4x4_dc_msa;
  734. pf->quant_4x4x4 = x264_quant_4x4x4_msa;
  735. pf->quant_8x8 = x264_quant_8x8_msa;
  736. pf->dequant_4x4 = x264_dequant_4x4_msa;
  737. pf->dequant_4x4_dc = x264_dequant_4x4_dc_msa;
  738. pf->dequant_8x8 = x264_dequant_8x8_msa;
  739. pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_msa;
  740. pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa;
  741. }
  742. #endif
  743. #endif // HIGH_BIT_DEPTH
  744. pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] =
  745. pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
  746. pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[DCT_CHROMAU_AC] =
  747. pf->coeff_last[DCT_CHROMAV_AC] = pf->coeff_last[DCT_LUMA_AC];
  748. pf->coeff_last[DCT_CHROMAU_8x8] = pf->coeff_last[DCT_CHROMAV_8x8] = pf->coeff_last[DCT_LUMA_8x8];
  749. pf->coeff_level_run[DCT_LUMA_DC] = pf->coeff_level_run[DCT_CHROMAU_DC] = pf->coeff_level_run[DCT_CHROMAV_DC] =
  750. pf->coeff_level_run[DCT_CHROMAU_4x4] = pf->coeff_level_run[DCT_CHROMAV_4x4] = pf->coeff_level_run[DCT_LUMA_4x4];
  751. pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[DCT_CHROMAU_AC] =
  752. pf->coeff_level_run[DCT_CHROMAV_AC] = pf->coeff_level_run[DCT_LUMA_AC];
  753. }