/* macroblock.c */
/*****************************************************************************
 * macroblock.c: macroblock encoding
 *****************************************************************************
 * Copyright (C) 2003-2018 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *          Henrik Gramner <henrik@gramner.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common/common.h"
#include "macroblock.h"
  30. /* These chroma DC functions don't have assembly versions and are only used here. */
  31. #define ZIG(i,y,x) level[i] = dct[x*2+y];
  32. static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] )
  33. {
  34. ZIG(0,0,0)
  35. ZIG(1,0,1)
  36. ZIG(2,1,0)
  37. ZIG(3,1,1)
  38. }
  39. #undef ZIG
  40. static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
  41. {
  42. level[0] = dct[0];
  43. level[1] = dct[2];
  44. level[2] = dct[1];
  45. level[3] = dct[4];
  46. level[4] = dct[6];
  47. level[5] = dct[3];
  48. level[6] = dct[5];
  49. level[7] = dct[7];
  50. }
  51. #define IDCT_DEQUANT_2X2_START \
  52. int d0 = dct[0] + dct[1]; \
  53. int d1 = dct[2] + dct[3]; \
  54. int d2 = dct[0] - dct[1]; \
  55. int d3 = dct[2] - dct[3]; \
  56. int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  57. static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
  58. {
  59. IDCT_DEQUANT_2X2_START
  60. dct4x4[0][0] = (d0 + d1) * dmf >> 5;
  61. dct4x4[1][0] = (d0 - d1) * dmf >> 5;
  62. dct4x4[2][0] = (d2 + d3) * dmf >> 5;
  63. dct4x4[3][0] = (d2 - d3) * dmf >> 5;
  64. }
  65. static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
  66. {
  67. IDCT_DEQUANT_2X2_START
  68. dct[0] = (d0 + d1) * dmf >> 5;
  69. dct[1] = (d0 - d1) * dmf >> 5;
  70. dct[2] = (d2 + d3) * dmf >> 5;
  71. dct[3] = (d2 - d3) * dmf >> 5;
  72. }
  73. #undef IDCT_2X2_DEQUANT_START
  74. static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
  75. {
  76. int d0 = dct4x4[0][0] + dct4x4[1][0];
  77. int d1 = dct4x4[2][0] + dct4x4[3][0];
  78. int d2 = dct4x4[0][0] - dct4x4[1][0];
  79. int d3 = dct4x4[2][0] - dct4x4[3][0];
  80. d[0] = d0 + d1;
  81. d[2] = d2 + d3;
  82. d[1] = d0 - d1;
  83. d[3] = d2 - d3;
  84. dct4x4[0][0] = 0;
  85. dct4x4[1][0] = 0;
  86. dct4x4[2][0] = 0;
  87. dct4x4[3][0] = 0;
  88. }
  89. static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
  90. {
  91. if( WORD_SIZE == 8 )
  92. {
  93. for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
  94. if( M64( &v[i] ) )
  95. return 1;
  96. }
  97. else
  98. {
  99. for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
  100. if( M32( &v[i] ) )
  101. return 1;
  102. }
  103. return 0;
  104. }
/* All encoding functions must output the correct CBP and NNZ values.
 * The entropy coding functions will check CBP first, then NNZ, before
 * actually reading the DCT coefficients. NNZ still must be correct even
 * if CBP is zero because of the use of NNZ values for context selection.
 * "NNZ" need only be 0 or 1 rather than the exact coefficient count because
 * that is only needed in CAVLC, and will be calculated by CAVLC's residual
 * coding and stored as necessary. */
/* This means that decimation can be done merely by adjusting the CBP and NNZ
 * rather than memsetting the coefficients. */

/* Encode one plane of an I_16x16 macroblock: predict, transform, quantize
 * (optionally via trellis), decimate, and reconstruct into fdec.
 * p selects the plane (0 = luma; nonzero uses the chroma quant matrix). */
static void mb_encode_i16x16( x264_t *h, int p, int i_qp )
{
    pixel *p_src = h->mb.pic.p_fenc[p];
    pixel *p_dst = h->mb.pic.p_fdec[p];

    ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
    ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] );

    int nz, block_cbp = 0;
    /* Seed the score at 9 to disable decimation when it's turned off (threshold is 6). */
    int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
    int i_quant_cat = p ? CQM_4IC : CQM_4IY;
    int i_mode = h->mb.i_intra16x16_pred_mode;

    if( h->mb.b_lossless )
        x264_predict_lossless_16x16( h, p, i_mode );
    else
        h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );

    if( h->mb.b_lossless )
    {
        /* Lossless: zigzag the raw residual per 4x4 block; the DC terms are
         * collected separately into dct_dc4x4 and scanned as the DC block. */
        for( int i = 0; i < 16; i++ )
        {
            int oe = block_idx_xy_fenc[i];
            int od = block_idx_xy_fdec[i];
            nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16*p+i], p_src+oe, p_dst+od, &dct_dc4x4[block_idx_yx_1d[i]] );
            h->mb.cache.non_zero_count[x264_scan8[16*p+i]] = nz;
            block_cbp |= nz;
        }
        h->mb.i_cbp_luma |= block_cbp * 0xf;
        h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
        return;
    }

    CLEAR_16x16_NNZ( p );

    h->dctf.sub16x16_dct( dct4x4, p_src, p_dst );

    if( h->mb.b_noise_reduction )
        for( int idx = 0; idx < 16; idx++ )
            h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );

    /* Pull the DC coefficient out of each 4x4 block; DC is transformed and
     * coded separately in i16x16 mode. */
    for( int idx = 0; idx < 16; idx++ )
    {
        dct_dc4x4[block_idx_xy_1d[idx]] = dct4x4[idx][0];
        dct4x4[idx][0] = 0;
    }

    if( h->mb.b_trellis )
    {
        for( int idx = 0; idx < 16; idx++ )
            if( x264_quant_4x4_trellis( h, dct4x4[idx], i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_AC][p], 1, !!p, idx ) )
            {
                block_cbp = 0xf;
                h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
                h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
                if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
                h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
            }
    }
    else
    {
        for( int i8x8 = 0; i8x8 < 4; i8x8++ )
        {
            /* quant_4x4x4 quantizes four blocks at once and returns a bitmask
             * of which blocks have nonzero coefficients. */
            nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] );
            if( nz )
            {
                block_cbp = 0xf;
                FOREACH_BIT( idx, i8x8*4, nz )
                {
                    h->zigzagf.scan_4x4( h->dct.luma4x4[16*p+idx], dct4x4[idx] );
                    h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[i_quant_cat], i_qp );
                    if( decimate_score < 6 ) decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16*p+idx] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+idx]] = 1;
                }
            }
        }
    }

    /* Writing the 16 CBFs in an i16x16 block is quite costly, so decimation can save many bits. */
    /* More useful with CAVLC, but still useful with CABAC. */
    if( decimate_score < 6 )
    {
        CLEAR_16x16_NNZ( p );
        block_cbp = 0;
    }
    else
        h->mb.i_cbp_luma |= block_cbp;

    h->dctf.dct4x4dc( dct_dc4x4 );
    if( h->mb.b_trellis )
        nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
    else
        nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );

    h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = nz;
    if( nz )
    {
        h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );

        /* output samples to fdec */
        h->dctf.idct4x4dc( dct_dc4x4 );
        h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[i_quant_cat], i_qp ); /* XXX not inversed */
        if( block_cbp )
            /* Re-insert the reconstructed DCs so the full AC+DC inverse
             * transform below produces the complete reconstruction. */
            for( int i = 0; i < 16; i++ )
                dct4x4[i][0] = dct_dc4x4[block_idx_xy_1d[i]];
    }

    /* put pixels to fdec */
    if( block_cbp )
        h->dctf.add16x16_idct( p_dst, dct4x4 );
    else if( nz )
        h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 );
}
  214. /* Round down coefficients losslessly in DC-only chroma blocks.
  215. * Unlike luma blocks, this can't be done with a lookup table or
  216. * other shortcut technique because of the interdependencies
  217. * between the coefficients due to the chroma DC transform. */
  218. static ALWAYS_INLINE int mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
  219. {
  220. int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
  221. /* If the QP is too high, there's no benefit to rounding optimization. */
  222. if( dmf > 32*64 )
  223. return 1;
  224. if( chroma422 )
  225. return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
  226. else
  227. return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
  228. }
/* Encode both chroma planes of the current macroblock.
 * chroma422 selects 4:2:2 (8x16 blocks, 2x4 DC) vs 4:2:0 (8x8 blocks, 2x2 DC);
 * it is passed as a constant by the wrapper so each variant specializes.
 * b_inter selects the quant matrix (CQM_4IC vs CQM_4PC via CQM_4IC + b_inter)
 * and enables decimation. Outputs CBP/NNZ and reconstructs into fdec. */
static ALWAYS_INLINE void mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
{
    int nz, nz_dc;
    int b_decimate = b_inter && h->mb.b_dct_decimate;
    int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
    ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
    h->mb.i_cbp_chroma = 0;
    h->nr_count[2] += h->mb.b_noise_reduction * 4;

    /* Clear the chroma AC NNZ cache entries for both planes. */
    M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
    M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
    if( chroma422 )
    {
        /* 4:2:2 has twice the chroma blocks, so clear the extra rows too. */
        M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
        M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
    }

    /* Early termination: check variance of chroma residual before encoding.
     * Don't bother trying early termination at low QPs.
     * Values are experimentally derived. */
    if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
    {
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        ALIGNED_ARRAY_8( int, ssd,[2] );
        int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;

        /* var2 returns a combined variance measure and fills ssd[] per plane. */
        if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 )
        {
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;

            /* Residual is small enough that at most a DC-only encode is worthwhile. */
            for( int ch = 0; ch < 2; ch++ )
            {
                if( ssd[ch] > thresh )
                {
                    pixel *p_src = h->mb.pic.p_fenc[1+ch];
                    pixel *p_dst = h->mb.pic.p_fdec[1+ch];

                    if( chroma422 )
                        /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */
                        h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                    else
                        h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );

                    if( h->mb.b_trellis )
                        nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
                    else
                    {
                        nz_dc = 0;
                        for( int i = 0; i <= chroma422; i++ )
                            nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                             h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
                    }

                    if( nz_dc )
                    {
                        /* If rounding optimization zeroes the block, skip it entirely. */
                        if( !mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
                            continue;
                        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
                        if( chroma422 )
                        {
                            zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                            h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
                        }
                        else
                        {
                            zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                            idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
                        }

                        for( int i = 0; i <= chroma422; i++ )
                            h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
                        h->mb.i_cbp_chroma = 1;
                    }
                }
            }
            return;
        }
    }

    for( int ch = 0; ch < 2; ch++ )
    {
        pixel *p_src = h->mb.pic.p_fenc[1+ch];
        pixel *p_dst = h->mb.pic.p_fdec[1+ch];
        /* Seed at 7 to disable decimation when it's off (threshold is 7). */
        int i_decimate_score = b_decimate ? 0 : 7;
        int nz_ac = 0;

        ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );

        if( h->mb.b_lossless )
        {
            /* Reordering of the 2x4 DC coefficients for lossless 4:2:2. */
            static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };

            for( int i = 0; i < (chroma422?8:4); i++ )
            {
                int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
                int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
                nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
                                           &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
                h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
                h->mb.i_cbp_chroma |= nz;
            }
            h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
            continue;
        }

        for( int i = 0; i <= chroma422; i++ )
            h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

        if( h->mb.b_noise_reduction )
            for( int i = 0; i < (chroma422?8:4); i++ )
                h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );

        /* Extract and transform the DC terms; the blocks' DC slots are zeroed. */
        if( chroma422 )
            h->dctf.dct2x4dc( dct_dc, dct4x4 );
        else
            dct2x2dc( dct_dc, dct4x4 );

        /* calculate dct coeffs */
        for( int i8x8 = 0; i8x8 < (chroma422?2:1); i8x8++ )
        {
            if( h->mb.b_trellis )
            {
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                {
                    if( x264_quant_4x4_trellis( h, dct4x4[i8x8*4+i4x4], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ) )
                    {
                        int idx = 16+ch*16+i8x8*8+i4x4;
                        h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
                        if( i_decimate_score < 7 )
                            i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
                        h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
                        nz_ac = 1;
                    }
                }
            }
            else
            {
                nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4IC+b_inter][i_qp],
                                            h->quant4_bias[CQM_4IC+b_inter][i_qp] );
                nz_ac |= nz;

                FOREACH_BIT( i4x4, 0, nz )
                {
                    int idx = 16+ch*16+i8x8*8+i4x4;

                    h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[i8x8*4+i4x4] );
                    h->quantf.dequant_4x4( dct4x4[i8x8*4+i4x4], dequant_mf, i_qp );
                    if( i_decimate_score < 7 )
                        i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[idx] );
                    h->mb.cache.non_zero_count[x264_scan8[idx]] = 1;
                }
            }
        }

        if( h->mb.b_trellis )
            nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
        else
        {
            nz_dc = 0;
            for( int i = 0; i <= chroma422; i++ )
                nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
                                                 h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
        }

        h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;

        if( i_decimate_score < 7 || !nz_ac )
        {
            /* Decimate the block */
            M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
            M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
            if( chroma422 )
            {
                M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
                M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
            }

            if( !nz_dc ) /* Whole block is empty */
                continue;
            if( !mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
            {
                h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
                continue;
            }
            /* DC-only */
            if( chroma422 )
            {
                zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
            }
            else
            {
                zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
            }
            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
        }
        else
        {
            h->mb.i_cbp_chroma = 1;
            if( nz_dc )
            {
                if( chroma422 )
                {
                    zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
                    h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
                }
                else
                {
                    zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
                    idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
                }
            }
            for( int i = 0; i <= chroma422; i++ )
                h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
        }
    }

    /* 0 = none, 1 = DC only, 2 = DC+AC */
    h->mb.i_cbp_chroma += (h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] |
                           h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
}
  435. void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
  436. {
  437. if( CHROMA_FORMAT == CHROMA_420 )
  438. mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
  439. else
  440. mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
  441. }
/* Mark the current macroblock as fully skipped: zero every NNZ cache entry
 * (luma and both chroma planes) and clear all CBP state. No pixels are
 * written here; the caller is responsible for the skip reconstruction. */
static void macroblock_encode_skip( x264_t *h )
{
    /* Luma 4x4 NNZ rows. */
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;
    /* Chroma NNZ rows (both planes). */
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
    M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
    if( CHROMA_FORMAT >= CHROMA_422 )
    {
        /* Extra chroma rows present in 4:2:2 and above. */
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 8]] ) = 0;
        M32( &h->mb.cache.non_zero_count[x264_scan8[32+10]] ) = 0;
    }
    h->mb.i_cbp_luma = 0;
    h->mb.i_cbp_chroma = 0;
    h->mb.cbp[h->mb.i_mb_xy] = 0;
}
  463. /*****************************************************************************
  464. * Intra prediction for predictive lossless mode.
  465. *****************************************************************************/
  466. void x264_predict_lossless_chroma( x264_t *h, int i_mode )
  467. {
  468. int height = 16 >> CHROMA_V_SHIFT;
  469. if( i_mode == I_PRED_CHROMA_V )
  470. {
  471. h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
  472. h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
  473. memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
  474. memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
  475. }
  476. else if( i_mode == I_PRED_CHROMA_H )
  477. {
  478. h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
  479. h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
  480. x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
  481. x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
  482. if( CHROMA_FORMAT == CHROMA_422 )
  483. {
  484. x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
  485. x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
  486. }
  487. }
  488. else
  489. {
  490. h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
  491. h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
  492. }
  493. }
  494. void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode )
  495. {
  496. int stride = h->fenc->i_stride[p] << MB_INTERLACED;
  497. pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
  498. if( i_mode == I_PRED_4x4_V )
  499. {
  500. h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
  501. memcpy( p_dst, p_dst-FDEC_STRIDE, 4*sizeof(pixel) );
  502. }
  503. else if( i_mode == I_PRED_4x4_H )
  504. {
  505. h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
  506. for( int i = 0; i < 4; i++ )
  507. p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
  508. }
  509. else
  510. h->predict_4x4[i_mode]( p_dst );
  511. }
  512. void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
  513. {
  514. int stride = h->fenc->i_stride[p] << MB_INTERLACED;
  515. pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
  516. if( i_mode == I_PRED_8x8_V )
  517. {
  518. h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
  519. memcpy( p_dst, &edge[16], 8*sizeof(pixel) );
  520. }
  521. else if( i_mode == I_PRED_8x8_H )
  522. {
  523. h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
  524. for( int i = 0; i < 8; i++ )
  525. p_dst[i*FDEC_STRIDE] = edge[14-i];
  526. }
  527. else
  528. h->predict_8x8[i_mode]( p_dst, edge );
  529. }
  530. void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
  531. {
  532. int stride = h->fenc->i_stride[p] << MB_INTERLACED;
  533. pixel *p_dst = h->mb.pic.p_fdec[p];
  534. if( i_mode == I_PRED_16x16_V )
  535. {
  536. h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
  537. memcpy( p_dst, p_dst-FDEC_STRIDE, 16*sizeof(pixel) );
  538. }
  539. else if( i_mode == I_PRED_16x16_H )
  540. {
  541. h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
  542. for( int i = 0; i < 16; i++ )
  543. p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
  544. }
  545. else
  546. h->predict_16x16[i_mode]( p_dst );
  547. }
  548. /*****************************************************************************
  549. * x264_macroblock_encode:
  550. *****************************************************************************/
  551. static ALWAYS_INLINE void macroblock_encode_internal( x264_t *h, int plane_count, int chroma )
  552. {
  553. int i_qp = h->mb.i_qp;
  554. int b_decimate = h->mb.b_dct_decimate;
  555. int b_force_no_skip = 0;
  556. int nz;
  557. h->mb.i_cbp_luma = 0;
  558. for( int p = 0; p < plane_count; p++ )
  559. h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = 0;
  560. if( h->mb.i_type == I_PCM )
  561. {
  562. /* if PCM is chosen, we need to store reconstructed frame data */
  563. for( int p = 0; p < plane_count; p++ )
  564. h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
  565. if( chroma )
  566. {
  567. int height = 16 >> CHROMA_V_SHIFT;
  568. h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
  569. h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
  570. }
  571. return;
  572. }
  573. if( !h->mb.b_allow_skip )
  574. {
  575. b_force_no_skip = 1;
  576. if( IS_SKIP(h->mb.i_type) )
  577. {
  578. if( h->mb.i_type == P_SKIP )
  579. h->mb.i_type = P_L0;
  580. else if( h->mb.i_type == B_SKIP )
  581. h->mb.i_type = B_DIRECT;
  582. }
  583. }
  584. if( h->mb.i_type == P_SKIP )
  585. {
  586. /* don't do pskip motion compensation if it was already done in macroblock_analyse */
  587. if( !h->mb.b_skip_mc )
  588. {
  589. int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][0],
  590. h->mb.mv_min[0], h->mb.mv_max[0] );
  591. int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[0]][1],
  592. h->mb.mv_min[1], h->mb.mv_max[1] );
  593. for( int p = 0; p < plane_count; p++ )
  594. h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
  595. &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
  596. mvx, mvy, 16, 16, &h->sh.weight[0][p] );
  597. if( chroma )
  598. {
  599. int v_shift = CHROMA_V_SHIFT;
  600. int height = 16 >> v_shift;
  601. /* Special case for mv0, which is (of course) very common in P-skip mode. */
  602. if( mvx | mvy )
  603. h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
  604. h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
  605. mvx, 2*mvy>>v_shift, 8, height );
  606. else
  607. h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
  608. h->mb.pic.i_stride[1], height );
  609. if( h->sh.weight[0][1].weightfn )
  610. h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
  611. h->mb.pic.p_fdec[1], FDEC_STRIDE,
  612. &h->sh.weight[0][1], height );
  613. if( h->sh.weight[0][2].weightfn )
  614. h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
  615. h->mb.pic.p_fdec[2], FDEC_STRIDE,
  616. &h->sh.weight[0][2], height );
  617. }
  618. }
  619. macroblock_encode_skip( h );
  620. return;
  621. }
  622. if( h->mb.i_type == B_SKIP )
  623. {
  624. /* don't do bskip motion compensation if it was already done in macroblock_analyse */
  625. if( !h->mb.b_skip_mc )
  626. x264_mb_mc( h );
  627. macroblock_encode_skip( h );
  628. return;
  629. }
  630. if( h->mb.i_type == I_16x16 )
  631. {
  632. h->mb.b_transform_8x8 = 0;
  633. for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
  634. mb_encode_i16x16( h, p, i_qp );
  635. }
  636. else if( h->mb.i_type == I_8x8 )
  637. {
  638. h->mb.b_transform_8x8 = 1;
  639. /* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
  640. if( h->mb.i_skip_intra )
  641. {
  642. h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
  643. M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
  644. M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
  645. M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
  646. M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
  647. h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
  648. /* In RD mode, restore the now-overwritten DCT data. */
  649. if( h->mb.i_skip_intra == 2 )
  650. h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) );
  651. }
  652. for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
  653. {
  654. for( int i = (p == 0 && h->mb.i_skip_intra) ? 3 : 0; i < 4; i++ )
  655. {
  656. int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
  657. x264_mb_encode_i8x8( h, p, i, i_qp, i_mode, NULL, 1 );
  658. }
  659. }
  660. }
  661. else if( h->mb.i_type == I_4x4 )
  662. {
  663. h->mb.b_transform_8x8 = 0;
  664. /* If we already encoded 15 of the 16 i4x4 blocks, we don't have to do them again. */
  665. if( h->mb.i_skip_intra )
  666. {
  667. h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
  668. M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
  669. M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
  670. M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
  671. M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
  672. h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
  673. /* In RD mode, restore the now-overwritten DCT data. */
  674. if( h->mb.i_skip_intra == 2 )
  675. h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) );
  676. }
  677. for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
  678. {
  679. for( int i = (p == 0 && h->mb.i_skip_intra) ? 15 : 0; i < 16; i++ )
  680. {
  681. pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i]];
  682. int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
  683. if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
  684. /* emulate missing topright samples */
  685. MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
  686. x264_mb_encode_i4x4( h, p, i, i_qp, i_mode, 1 );
  687. }
  688. }
  689. }
  690. else /* Inter MB */
  691. {
  692. int i_decimate_mb = 0;
  693. /* Don't repeat motion compensation if it was already done in non-RD transform analysis */
  694. if( !h->mb.b_skip_mc )
  695. x264_mb_mc( h );
  696. if( h->mb.b_lossless )
  697. {
  698. if( h->mb.b_transform_8x8 )
  699. for( int p = 0; p < plane_count; p++ )
  700. for( int i8x8 = 0; i8x8 < 4; i8x8++ )
  701. {
  702. int x = i8x8&1;
  703. int y = i8x8>>1;
  704. nz = h->zigzagf.sub_8x8( h->dct.luma8x8[p*4+i8x8], h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE,
  705. h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE );
  706. STORE_8x8_NNZ( p, i8x8, nz );
  707. h->mb.i_cbp_luma |= nz << i8x8;
  708. }
  709. else
  710. for( int p = 0; p < plane_count; p++ )
  711. for( int i4x4 = 0; i4x4 < 16; i4x4++ )
  712. {
  713. nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4x4],
  714. h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4x4],
  715. h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4x4] );
  716. h->mb.cache.non_zero_count[x264_scan8[p*16+i4x4]] = nz;
  717. h->mb.i_cbp_luma |= nz << (i4x4>>2);
  718. }
  719. }
  720. else if( h->mb.b_transform_8x8 )
  721. {
  722. ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] );
  723. b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
  724. for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
  725. {
  726. int quant_cat = p ? CQM_8PC : CQM_8PY;
  727. CLEAR_16x16_NNZ( p );
  728. h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
  729. h->nr_count[1+!!p*2] += h->mb.b_noise_reduction * 4;
  730. int plane_cbp = 0;
  731. for( int idx = 0; idx < 4; idx++ )
  732. {
  733. nz = x264_quant_8x8( h, dct8x8[idx], i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, idx );
  734. if( nz )
  735. {
  736. h->zigzagf.scan_8x8( h->dct.luma8x8[p*4+idx], dct8x8[idx] );
  737. if( b_decimate )
  738. {
  739. int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[p*4+idx] );
  740. i_decimate_mb += i_decimate_8x8;
  741. if( i_decimate_8x8 >= 4 )
  742. plane_cbp |= 1<<idx;
  743. }
  744. else
  745. plane_cbp |= 1<<idx;
  746. }
  747. }
  748. if( i_decimate_mb >= 6 || !b_decimate )
  749. {
  750. h->mb.i_cbp_luma |= plane_cbp;
  751. FOREACH_BIT( idx, 0, plane_cbp )
  752. {
  753. h->quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[quant_cat], i_qp );
  754. h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[p][8*(idx&1) + 8*(idx>>1)*FDEC_STRIDE], dct8x8[idx] );
  755. STORE_8x8_NNZ( p, idx, 1 );
  756. }
  757. }
  758. }
  759. }
  760. else
  761. {
  762. ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
  763. for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
  764. {
  765. int quant_cat = p ? CQM_4PC : CQM_4PY;
  766. CLEAR_16x16_NNZ( p );
  767. h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[p], h->mb.pic.p_fdec[p] );
  768. if( h->mb.b_noise_reduction )
  769. {
  770. h->nr_count[0+!!p*2] += 16;
  771. for( int idx = 0; idx < 16; idx++ )
  772. h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
  773. }
  774. int plane_cbp = 0;
  775. for( int i8x8 = 0; i8x8 < 4; i8x8++ )
  776. {
  777. int i_decimate_8x8 = b_decimate ? 0 : 6;
  778. int nnz8x8 = 0;
  779. if( h->mb.b_trellis )
  780. {
  781. for( int i4x4 = 0; i4x4 < 4; i4x4++ )
  782. {
  783. int idx = i8x8*4+i4x4;
  784. if( x264_quant_4x4_trellis( h, dct4x4[idx], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, p*16+idx ) )
  785. {
  786. h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
  787. h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp );
  788. if( i_decimate_8x8 < 6 )
  789. i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
  790. h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
  791. nnz8x8 = 1;
  792. }
  793. }
  794. }
  795. else
  796. {
  797. nnz8x8 = nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
  798. if( nz )
  799. {
  800. FOREACH_BIT( idx, i8x8*4, nz )
  801. {
  802. h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+idx], dct4x4[idx] );
  803. h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[quant_cat], i_qp );
  804. if( i_decimate_8x8 < 6 )
  805. i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+idx] );
  806. h->mb.cache.non_zero_count[x264_scan8[p*16+idx]] = 1;
  807. }
  808. }
  809. }
  810. if( nnz8x8 )
  811. {
  812. i_decimate_mb += i_decimate_8x8;
  813. if( i_decimate_8x8 < 4 )
  814. STORE_8x8_NNZ( p, i8x8, 0 );
  815. else
  816. plane_cbp |= 1<<i8x8;
  817. }
  818. }
  819. if( i_decimate_mb < 6 )
  820. {
  821. plane_cbp = 0;
  822. CLEAR_16x16_NNZ( p );
  823. }
  824. else
  825. {
  826. h->mb.i_cbp_luma |= plane_cbp;
  827. FOREACH_BIT( i8x8, 0, plane_cbp )
  828. {
  829. h->dctf.add8x8_idct( &h->mb.pic.p_fdec[p][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] );
  830. }
  831. }
  832. }
  833. }
  834. }
  835. /* encode chroma */
  836. if( chroma )
  837. {
  838. if( IS_INTRA( h->mb.i_type ) )
  839. {
  840. int i_mode = h->mb.i_chroma_pred_mode;
  841. if( h->mb.b_lossless )
  842. x264_predict_lossless_chroma( h, i_mode );
  843. else
  844. {
  845. h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
  846. h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
  847. }
  848. }
  849. /* encode the 8x8 blocks */
  850. x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
  851. }
  852. else
  853. h->mb.i_cbp_chroma = 0;
  854. /* store cbp */
  855. int cbp = h->mb.i_cbp_chroma << 4 | h->mb.i_cbp_luma;
  856. if( h->param.b_cabac )
  857. cbp |= h->mb.cache.non_zero_count[x264_scan8[LUMA_DC ]] << 8
  858. | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] << 9
  859. | h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] << 10;
  860. h->mb.cbp[h->mb.i_mb_xy] = cbp;
  861. /* Check for P_SKIP
  862. * XXX: in the me perhaps we should take x264_mb_predict_mv_pskip into account
  863. * (if multiple mv give same result)*/
  864. if( !b_force_no_skip )
  865. {
  866. if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
  867. !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
  868. M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
  869. && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
  870. {
  871. h->mb.i_type = P_SKIP;
  872. }
  873. /* Check for B_SKIP */
  874. if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
  875. {
  876. h->mb.i_type = B_SKIP;
  877. }
  878. }
  879. }
  880. void x264_macroblock_encode( x264_t *h )
  881. {
  882. if( CHROMA444 )
  883. macroblock_encode_internal( h, 3, 0 );
  884. else if( CHROMA_FORMAT )
  885. macroblock_encode_internal( h, 1, 1 );
  886. else
  887. macroblock_encode_internal( h, 1, 0 );
  888. }
  889. /*****************************************************************************
  890. * x264_macroblock_probe_skip:
  891. * Check if the current MB could be encoded as a [PB]_SKIP
  892. *****************************************************************************/
/* Decide whether the current macroblock can be coded as a skip:
 * motion-compensate with the predicted skip MV (P case), transform and
 * quantize the residual, and return 0 as soon as the coefficient cost
 * (decimate score) shows the block is not cheap enough to skip.
 * Returns 1 (and marks MC as already done via b_skip_mc) if every plane
 * quantizes to a skippable residual. */
static ALWAYS_INLINE int macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
    ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
    ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
    ALIGNED_4( int16_t mvp[2] );
    int i_qp = h->mb.i_qp;

    /* Luma (and, for 4:4:4, all three planes); i_qp switches to the chroma QP
     * after the first iteration. */
    for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
    {
        int quant_cat = p ? CQM_4PC : CQM_4PY;
        if( !b_bidir )
        {
            /* Get the MV: the P-skip predictor, clipped to the legal MV range. */
            mvp[0] = x264_clip3( h->mb.cache.pskip_mv[0], h->mb.mv_min[0], h->mb.mv_max[0] );
            mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );

            /* Motion compensation */
            h->mc.mc_luma( h->mb.pic.p_fdec[p], FDEC_STRIDE,
                           &h->mb.pic.p_fref[0][0][p*4], h->mb.pic.i_stride[p],
                           mvp[0], mvp[1], 16, 16, &h->sh.weight[0][p] );
        }

        /* Transform/quantize each 8x8 quadrant; abort as soon as the running
         * decimate score says the residual is too expensive to drop. */
        for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < 4; i8x8++ )
        {
            int fenc_offset = (i8x8&1) * 8 + (i8x8>>1) * FENC_STRIDE * 8;
            int fdec_offset = (i8x8&1) * 8 + (i8x8>>1) * FDEC_STRIDE * 8;

            h->dctf.sub8x8_dct( dct4x4, h->mb.pic.p_fenc[p] + fenc_offset,
                                        h->mb.pic.p_fdec[p] + fdec_offset );

            if( h->mb.b_noise_reduction )
                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );

            /* nz is a bitmask of the 4x4 sub-blocks with nonzero coefficients. */
            int nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
            FOREACH_BIT( idx, 0, nz )
            {
                h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                i_decimate_mb += h->quantf.decimate_score16( dctscan );
                if( i_decimate_mb >= 6 )
                    return 0; /* residual too significant: cannot skip */
            }
        }
    }

    /* Subsampled chroma check (4:2:0 / 4:2:2 only; 4:4:4 was handled above). */
    if( chroma == CHROMA_420 || chroma == CHROMA_422 )
    {
        i_qp = h->mb.i_chroma_qp;
        int chroma422 = chroma == CHROMA_422;
        int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
        int ssd;
        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

        if( !b_bidir )
        {
            /* Special case for mv0, which is (of course) very common in P-skip mode. */
            if( M32( mvp ) )
                h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                 h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
            else
                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
        }

        for( int ch = 0; ch < 2; ch++ )
        {
            pixel *p_src = h->mb.pic.p_fenc[1+ch];
            pixel *p_dst = h->mb.pic.p_fdec[1+ch];

            if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

            /* there is almost never a termination during chroma, but we can't avoid the check entirely */
            /* so instead we check SSD and skip the actual check if the score is low enough. */
            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
            if( ssd < thresh )
                continue;

            /* The vast majority of chroma checks will terminate during the DC check or the higher
             * threshold check, so we can save time by doing a DC-only DCT. */
            if( h->mb.b_noise_reduction )
            {
                /* With noise reduction we need the full AC coefficients anyway,
                 * so compute the whole DCT and split out the DC terms. */
                for( int i = 0; i <= chroma422; i++ )
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );

                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                {
                    h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    dct_dc[i4x4] = dct4x4[i4x4][0];
                    dct4x4[i4x4][0] = 0;
                }
            }
            else
            {
                if( chroma422 )
                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
                else
                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
            }

            /* Quantize the 2x2 chroma DC; any surviving DC coefficient kills the skip.
             * The >>1 / <<1 compensate for the DC transform's scaling vs. the AC tables. */
            for( int i = 0; i <= chroma422; i++ )
                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
                    return 0;

            /* If there wasn't a termination in DC, we can check against a much higher threshold. */
            if( ssd < thresh*4 )
                continue;

            if( !h->mb.b_noise_reduction )
                for( int i = 0; i <= chroma422; i++ )
                {
                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
                    /* DC was already handled above; clear it so only AC is scored. */
                    dct4x4[i*4+0][0] = 0;
                    dct4x4[i*4+1][0] = 0;
                    dct4x4[i*4+2][0] = 0;
                    dct4x4[i*4+3][0] = 0;
                }

            /* calculate dct coeffs */
            for( int i8x8 = 0, i_decimate_mb = 0; i8x8 < (chroma422?2:1); i8x8++ )
            {
                int nz = h->quantf.quant_4x4x4( &dct4x4[i8x8*4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
                FOREACH_BIT( idx, i8x8*4, nz )
                {
                    h->zigzagf.scan_4x4( dctscan, dct4x4[idx] );
                    /* score15: AC-only decimate score (DC excluded above). */
                    i_decimate_mb += h->quantf.decimate_score15( dctscan );
                    if( i_decimate_mb >= 7 )
                        return 0;
                }
            }
        }
    }

    /* All planes are skippable; the MC done above is valid, so the encoder
     * need not repeat it. */
    h->mb.b_skip_mc = 1;
    return 1;
}
  1015. int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
  1016. {
  1017. if( CHROMA_FORMAT == CHROMA_420 )
  1018. return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
  1019. else if( CHROMA_FORMAT == CHROMA_422 )
  1020. return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
  1021. else if( CHROMA_FORMAT == CHROMA_444 )
  1022. return macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
  1023. else
  1024. return macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_400 );
  1025. }
  1026. /****************************************************************************
  1027. * DCT-domain noise reduction / adaptive deadzone
  1028. * from libavcodec
  1029. ****************************************************************************/
  1030. void x264_noise_reduction_update( x264_t *h )
  1031. {
  1032. h->nr_offset = h->nr_offset_denoise;
  1033. h->nr_residual_sum = h->nr_residual_sum_buf[0];
  1034. h->nr_count = h->nr_count_buf[0];
  1035. for( int cat = 0; cat < 3 + CHROMA444; cat++ )
  1036. {
  1037. int dct8x8 = cat&1;
  1038. int size = dct8x8 ? 64 : 16;
  1039. const uint32_t *weight = dct8x8 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
  1040. if( h->nr_count[cat] > (dct8x8 ? (1<<16) : (1<<18)) )
  1041. {
  1042. for( int i = 0; i < size; i++ )
  1043. h->nr_residual_sum[cat][i] >>= 1;
  1044. h->nr_count[cat] >>= 1;
  1045. }
  1046. for( int i = 0; i < size; i++ )
  1047. h->nr_offset[cat][i] =
  1048. ((uint64_t)h->param.analyse.i_noise_reduction * h->nr_count[cat]
  1049. + h->nr_residual_sum[cat][i]/2)
  1050. / ((uint64_t)h->nr_residual_sum[cat][i] * weight[i]/256 + 1);
  1051. /* Don't denoise DC coefficients */
  1052. h->nr_offset[cat][0] = 0;
  1053. }
  1054. }
  1055. /*****************************************************************************
  1056. * RD only; 4 calls to this do not make up for one macroblock_encode.
  1057. * doesn't transform chroma dc.
  1058. *****************************************************************************/
/* Encode a single 8x8 partition (luma quadrant i8 plus the co-located chroma
 * samples).  Used only from RD refinement: four calls to this are costlier
 * than one full macroblock_encode, and chroma DC is NOT transformed here
 * (see the header comment above). */
static ALWAYS_INLINE void macroblock_encode_p8x8_internal( x264_t *h, int i8, int plane_count, int chroma )
{
    int b_decimate = h->mb.b_dct_decimate;
    int i_qp = h->mb.i_qp;
    int x = i8&1;   /* quadrant column (0/1) */
    int y = i8>>1;  /* quadrant row (0/1) */
    int nz;
    int chroma422 = chroma == CHROMA_422;

    /* Reset the CBP bits this call will recompute. */
    h->mb.i_cbp_chroma = 0;
    h->mb.i_cbp_luma &= ~(1 << i8);

    if( !h->mb.b_skip_mc )
        x264_mb_mc_8x8( h, i8 );

    if( h->mb.b_lossless )
    {
        /* Lossless: residuals are transmitted directly (zigzag sub), no DCT/quant. */
        for( int p = 0; p < plane_count; p++ )
        {
            pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
            pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
            int nnz8x8 = 0;
            if( h->mb.b_transform_8x8 )
            {
                nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[4*p+i8], p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, nnz8x8 );
            }
            else
            {
                for( int i4 = i8*4; i4 < i8*4+4; i4++ )
                {
                    nz = h->zigzagf.sub_4x4( h->dct.luma4x4[16*p+i4],
                                             h->mb.pic.p_fenc[p]+block_idx_xy_fenc[i4],
                                             h->mb.pic.p_fdec[p]+block_idx_xy_fdec[i4] );
                    h->mb.cache.non_zero_count[x264_scan8[16*p+i4]] = nz;
                    nnz8x8 |= nz;
                }
            }
            h->mb.i_cbp_luma |= nnz8x8 << i8;
        }
        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            for( int ch = 0; ch < 2; ch++ )
            {
                dctcoef dc;
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
                /* 4:2:2 covers two vertically stacked 4x4 chroma blocks per quadrant. */
                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
                    /* sub_4x4ac separates out DC (not coded here — RD-only path). */
                    nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
    else
    {
        if( h->mb.b_transform_8x8 )
        {
            /* Lossy 8x8-transform luma path. */
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                int quant_cat = p ? CQM_8PC : CQM_8PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
                h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
                int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
                if( nnz8x8 )
                {
                    h->zigzagf.scan_8x8( h->dct.luma8x8[4*p+i8], dct8x8 );
                    /* Decimate unless trellis already made the optimal decision. */
                    if( b_decimate && !h->mb.b_trellis )
                        nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[4*p+i8] );
                    if( nnz8x8 )
                    {
                        /* Reconstruct and mark the quadrant as coded. */
                        h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[quant_cat], i_qp );
                        h->dctf.add8x8_idct8( p_fdec, dct8x8 );
                        STORE_8x8_NNZ( p, i8, 1 );
                        h->mb.i_cbp_luma |= 1 << i8;
                    }
                    else
                        STORE_8x8_NNZ( p, i8, 0 );
                }
                else
                    STORE_8x8_NNZ( p, i8, 0 );
            }
        }
        else
        {
            /* Lossy 4x4-transform luma path: four 4x4 DCTs per quadrant. */
            for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
            {
                int quant_cat = p ? CQM_4PC : CQM_4PY;
                pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
                /* Starting at 4 (the decimate threshold) disables decimation. */
                int i_decimate_8x8 = b_decimate ? 0 : 4;
                ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] );
                int nnz8x8 = 0;
                h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
                STORE_8x8_NNZ( p, i8, 0 );
                if( h->mb.b_noise_reduction )
                    for( int idx = 0; idx < 4; idx++ )
                        h->quantf.denoise_dct( dct4x4[idx], h->nr_residual_sum[0+!!p*2], h->nr_offset[0+!!p*2], 16 );
                if( h->mb.b_trellis )
                {
                    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
                    {
                        if( x264_quant_4x4_trellis( h, dct4x4[i4x4], quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, !!p, i8*4+i4x4+p*16 ) )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
                            if( i_decimate_8x8 < 4 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
                            nnz8x8 = 1;
                        }
                    }
                }
                else
                {
                    /* quant_4x4x4 quantizes all four blocks; nz is a sub-block bitmask. */
                    nnz8x8 = nz = h->quantf.quant_4x4x4( dct4x4, h->quant4_mf[quant_cat][i_qp], h->quant4_bias[quant_cat][i_qp] );
                    if( nz )
                    {
                        FOREACH_BIT( i4x4, 0, nz )
                        {
                            h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i8*4+i4x4], dct4x4[i4x4] );
                            h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[quant_cat], i_qp );
                            if( i_decimate_8x8 < 4 )
                                i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[p*16+i8*4+i4x4] );
                            h->mb.cache.non_zero_count[x264_scan8[p*16+i8*4+i4x4]] = 1;
                        }
                    }
                }
                if( nnz8x8 )
                {
                    /* decimate this 8x8 block */
                    if( i_decimate_8x8 < 4 )
                        STORE_8x8_NNZ( p, i8, 0 );
                    else
                    {
                        h->dctf.add8x8_idct( p_fdec, dct4x4 );
                        h->mb.i_cbp_luma |= 1 << i8;
                    }
                }
            }
        }

        /* Lossy chroma AC (DC is intentionally zeroed — not coded here). */
        if( chroma == CHROMA_420 || chroma == CHROMA_422 )
        {
            i_qp = h->mb.i_chroma_qp;
            for( int ch = 0; ch < 2; ch++ )
            {
                ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] );
                pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
                pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
                for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
                {
                    h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );
                    if( h->mb.b_noise_reduction )
                        h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
                    /* Drop DC: this RD helper doesn't transform chroma DC. */
                    dct4x4[i4x4][0] = 0;
                    if( h->mb.b_trellis )
                        nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
                    else
                        nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
                    /* 4:2:2 scan8 offset for the second vertical block per quadrant. */
                    int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
                    h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
                    if( nz )
                    {
                        h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
                        h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
                        h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
                    }
                }
            }
            h->mb.i_cbp_chroma = 0x02;
        }
    }
}
  1234. void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
  1235. {
  1236. if( CHROMA_FORMAT == CHROMA_420 )
  1237. macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
  1238. else if( CHROMA_FORMAT == CHROMA_422 )
  1239. macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
  1240. else if( CHROMA_FORMAT == CHROMA_444 )
  1241. macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
  1242. else
  1243. macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_400 );
  1244. }
  1245. /*****************************************************************************
  1246. * RD only, luma only (for 4:2:0)
  1247. *****************************************************************************/
  1248. static ALWAYS_INLINE void macroblock_encode_p4x4_internal( x264_t *h, int i4, int plane_count )
  1249. {
  1250. int i_qp = h->mb.i_qp;
  1251. for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
  1252. {
  1253. int quant_cat = p ? CQM_4PC : CQM_4PY;
  1254. pixel *p_fenc = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[i4]];
  1255. pixel *p_fdec = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[i4]];
  1256. int nz;
  1257. /* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
  1258. if( h->mb.b_lossless )
  1259. {
  1260. nz = h->zigzagf.sub_4x4( h->dct.luma4x4[p*16+i4], p_fenc, p_fdec );
  1261. h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
  1262. }
  1263. else
  1264. {
  1265. ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
  1266. h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
  1267. nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
  1268. h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
  1269. if( nz )
  1270. {
  1271. h->zigzagf.scan_4x4( h->dct.luma4x4[p*16+i4], dct4x4 );
  1272. h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[quant_cat], i_qp );
  1273. h->dctf.add4x4_idct( p_fdec, dct4x4 );
  1274. }
  1275. }
  1276. }
  1277. }
  1278. void x264_macroblock_encode_p4x4( x264_t *h, int i8 )
  1279. {
  1280. if( CHROMA444 )
  1281. macroblock_encode_p4x4_internal( h, i8, 3 );
  1282. else
  1283. macroblock_encode_p4x4_internal( h, i8, 1 );
  1284. }