/*****************************************************************************
 * mc.h: motion compensation
 *****************************************************************************
 * Copyright (C) 2004-2018 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_MC_H
#define X264_MC_H
#define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)

#define MC_CLIP_ADD2(s,x)\
do\
{\
    MC_CLIP_ADD((s)[0], (x)[0]);\
    MC_CLIP_ADD((s)[1], (x)[1]);\
} while( 0 )
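
/* Illustrative sketch (not part of x264): both macros saturate a signed
 * 16-bit accumulator at INT16_MAX rather than letting it wrap:
 *
 *     int16_t cost = 32000;
 *     MC_CLIP_ADD( cost, 1000 );    // cost == 32767 (clamped), not 33000
 *
 * MC_CLIP_ADD2 applies the same clamp to two adjacent elements, which is
 * why its callers below pass pointers (ref_costs+idx0, current+16). */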
#define x264_mbtree_propagate_list_internal_neon x264_template(mbtree_propagate_list_internal_neon)

#define PROPAGATE_LIST(cpu)\
void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
                                                uint16_t *lowres_costs, int16_t *output,\
                                                int bipred_weight, int mb_y, int len );\
\
static void mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
                                         int16_t *propagate_amount, uint16_t *lowres_costs,\
                                         int bipred_weight, int mb_y, int len, int list )\
{\
    int16_t *current = h->scratch_buffer2;\
\
    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
                                               current, bipred_weight, mb_y, len );\
\
    unsigned stride = h->mb.i_mb_stride;\
    unsigned width = h->mb.i_mb_width;\
    unsigned height = h->mb.i_mb_height;\
\
    for( unsigned i = 0; i < len; current += 32 )\
    {\
        int end = X264_MIN( i+8, len );\
        for( ; i < end; i++, current += 2 )\
        {\
            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
                continue;\
\
            unsigned mbx = current[0];\
            unsigned mby = current[1];\
            unsigned idx0 = mbx + mby * stride;\
            unsigned idx2 = idx0 + stride;\
\
            /* Shortcut for the simple/common case of zero MV */\
            if( !M32( mvs[i] ) )\
            {\
                MC_CLIP_ADD( ref_costs[idx0], current[16] );\
                continue;\
            }\
\
            if( mbx < width-1 && mby < height-1 )\
            {\
                MC_CLIP_ADD2( ref_costs+idx0, current+16 );\
                MC_CLIP_ADD2( ref_costs+idx2, current+32 );\
            }\
            else\
            {\
                /* Note: this takes advantage of unsigned representation to\
                 * catch negative mbx/mby. */\
                if( mby < height )\
                {\
                    if( mbx < width )\
                        MC_CLIP_ADD( ref_costs[idx0+0], current[16] );\
                    if( mbx+1 < width )\
                        MC_CLIP_ADD( ref_costs[idx0+1], current[17] );\
                }\
                if( mby+1 < height )\
                {\
                    if( mbx < width )\
                        MC_CLIP_ADD( ref_costs[idx2+0], current[32] );\
                    if( mbx+1 < width )\
                        MC_CLIP_ADD( ref_costs[idx2+1], current[33] );\
                }\
            }\
        }\
    }\
}
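
/* Reading the indexing above (a sketch; the exact layout is a contract with
 * the asm internals): the internal routine handles macroblocks in groups of
 * 8 and packs 48 int16_t into the scratch buffer per group. Relative to the
 * per-MB current pointer, current[0]/current[1] are the (mbx,mby) the
 * propagate amount lands on, current[16]/current[17] are the weighted
 * amounts for the top-left/top-right neighbours, and current[32]/current[33]
 * for the bottom pair; the scalar loop scatters them into ref_costs with
 * saturation. The unsigned compares double as bounds checks, e.g.:
 *
 *     unsigned mbx = (int16_t)-1;   // wraps to 0xFFFFFFFF
 *     mbx < width                   // false: out-of-frame MV rejected
 */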
#define x264_plane_copy_c x264_template(plane_copy_c)
void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );

#define PLANE_COPY(align, cpu)\
static void plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = (align) / sizeof(pixel) - 1;\
    if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\
    else if( !(w&c_w) )\
        x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\
        memcpy( dst, src, w*sizeof(pixel) );\
    }\
}
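
/* Worked example of the rounding trick (illustrative, not in x264): with
 * align = 16 and 8-bit pixels, c_w = 15, so a width of 100 rounds up:
 *
 *     (100 + 15) & ~15   ==   112
 *
 * The core copies 112 pixels per row, overreading 12 pixels of src on every
 * row it handles; that is safe because each overread lands in the next row
 * of the same plane. Only the last row in memory order has nothing after it,
 * hence the stride-sign test and the exact-width memcpy above. An
 * instantiation would look like PLANE_COPY(16, sse), assuming a core named
 * x264_plane_copy_core_sse is declared elsewhere. */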
#define x264_plane_copy_swap_c x264_template(plane_copy_swap_c)
void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );

#define PLANE_COPY_SWAP(align, cpu)\
static void plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = (align>>1) / sizeof(pixel) - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\
        for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\
        {\
            dst[x] = src[x+1];\
            dst[x+1] = src[x];\
        }\
    }\
    else\
        x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
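
/* The factor of 2 in the tail loop is the tell: w here counts pixel pairs
 * (e.g. interleaved U/V), so each row holds 2*w pixels and c_w is derived
 * from align>>1. Sketch of the tail for w = 10, c_w = 7:
 *
 *     w & ~c_w == 8                     // 8 pairs done by the aligned core
 *     for( x = 16; x < 20; x += 2 )     // last 2 pairs swapped by hand
 *
 * so the final row never reads past src, unlike the rounded-up rows above. */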
#define x264_plane_copy_deinterleave_c x264_template(plane_copy_deinterleave_c)
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                     pixel *src, intptr_t i_src, int w, int h );

/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYVY
 * input with the additional constraint that we cannot overread src. */
#define PLANE_COPY_YUYV(align, cpu)\
static void plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
                                                pixel *src, intptr_t i_src, int w, int h )\
{\
    int c_w = (align>>1) / sizeof(pixel) - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
    else if( w > c_w )\
    {\
        if( --h > 0 )\
        {\
            if( i_src > 0 )\
            {\
                x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, (w+c_w)&~c_w, h );\
                dsta += i_dsta * h;\
                dstb += i_dstb * h;\
                src += i_src * h;\
            }\
            else\
                x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
                                                    src+i_src, i_src, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
    }\
    else\
        x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
}
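
/* Semantics of the split (follows plane_copy_deinterleave: dsta takes the
 * even samples, dstb the odd ones). For YUYV input, dsta therefore receives
 * the luma plane and dstb the interleaved chroma; for UYVY the same code
 * works with the destination pointers swapped by the caller. (Sketch of the
 * intended use; the actual callers live outside this header.) */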
#define x264_plane_copy_interleave_c x264_template(plane_copy_interleave_c)
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
                                   pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h );

#define PLANE_INTERLEAVE(cpu)\
static void plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
                                         pixel *srcu, intptr_t i_srcu,\
                                         pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
    int c_w = 16 / sizeof(pixel) - 1;\
    if( !(w&c_w) )\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\
    {\
        if( --h > 0 )\
        {\
            if( i_srcu > 0 )\
            {\
                x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\
                dst += i_dst * h;\
                srcu += i_srcu * h;\
                srcv += i_srcv * h;\
            }\
            else\
                x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\
        }\
        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
    }\
    else\
        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
}
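
/* The XOR test reads as "srcu and srcv advance in the same direction": for
 * two's-complement ints, (a ^ b) >= 0 exactly when the sign bits match.
 * Illustrative check (not in x264):
 *
 *     ( 64 ^  64) >= 0   // true: both strides positive, one shared last row
 *     (-64 ^  64) >= 0   // false: mixed signs, the "last row in memory"
 *                        // differs per source, so use the plain C fallback
 */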
struct x264_weight_t;
typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *, intptr_t, const struct x264_weight_t *, int );

typedef struct x264_weight_t
{
    /* aligning the first member is a gcc hack to force the struct to be
     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
    ALIGNED_16( int16_t cachea[8] );
    int16_t cacheb[8];
    int32_t i_denom;
    int32_t i_scale;
    int32_t i_offset;
    weight_fn_t *weightfn;
} ALIGNED_16( x264_weight_t );

#define x264_weight_none ((const x264_weight_t*)x264_zero)

#define SET_WEIGHT( w, b, s, d, o )\
{\
    (w).i_scale = (s);\
    (w).i_denom = (d);\
    (w).i_offset = (o);\
    if( b )\
        h->mc.weight_cache( h, &w );\
    else\
        w.weightfn = NULL;\
}
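
/* Usage sketch (illustrative; values follow H.264 explicit weighted
 * prediction, where unity means scale == 1 << denom with zero offset).
 * Note the macro implicitly uses a local x264_t *h:
 *
 *     x264_weight_t w;
 *     SET_WEIGHT( w, 1, 32, 5, 0 );   // enabled: 32/2^5 == 1.0, offset 0;
 *                                     // weight_cache() fills cachea/cacheb
 *                                     // and selects a weightfn
 *     SET_WEIGHT( w, 0, 1, 0, 0 );    // disabled: weightfn == NULL
 */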
/* Do the MC
 * XXX: Only width = 4, 8 or 16 are valid
 * width == 4  -> height == 4 or 8
 * width == 8  -> height == 4 or 8 or 16
 * width == 16 -> height == 8 or 16
 */
typedef struct
{
    void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src,
                     int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );

    /* may round up the dimensions if they're not a power of 2 */
    pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src,
                       int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );

    /* mc_chroma may write up to 2 bytes of garbage to the right of dst,
     * so it must be run from left to right. */
    void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,
                       int mvx, int mvy, int i_width, int i_height );

    void (*avg[12])( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
                     pixel *src2, intptr_t src2_stride, int i_weight );

    /* only 16x16, 8x8, and 4x4 defined */
    void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
    void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );

    void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
    void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height );
    void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height );

    void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
    void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
    void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu,
                                   pixel *srcv, intptr_t i_srcv, int w, int h );
    /* may write up to 15 pixels off the end of each plane */
    void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
                                     pixel *src, intptr_t i_src, int w, int h );
    void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                          pixel *src, intptr_t i_src, int w, int h );
    void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
                                         pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src,
                                         int pw, int w, int h );
    void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
                                          pixel *dstc, intptr_t i_dstc,
                                          uint32_t *src, intptr_t i_src, int w, int h );

    void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
                         intptr_t i_stride, int i_width, int i_height, int16_t *buf );

    /* prefetch the next few macroblocks of fenc or fdec */
    void (*prefetch_fenc)    ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    void (*prefetch_fenc_400)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
    /* prefetch the next few macroblocks of a hpel reference frame */
    void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity );

    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
    void (*memzero_aligned)( void *dst, size_t n );

    /* successive elimination prefilter */
    void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride );
    void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride );
    void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
    void (*integral_init8v)( uint16_t *sum8, intptr_t stride );

    void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
                                    intptr_t src_stride, intptr_t dst_stride, int width, int height );

    weight_fn_t *weight;
    weight_fn_t *offsetadd;
    weight_fn_t *offsetsub;
    void (*weight_cache)( x264_t *, x264_weight_t * );

    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                   int16_t *propagate_amount, uint16_t *lowres_costs,
                                   int bipred_weight, int mb_y, int len, int list );
    void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
    void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
} x264_mc_functions_t;
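
/* Dispatch sketch (illustrative; buffer and stride names below are
 * placeholders, not x264 identifiers). Callers go through this table so
 * that CPU detection happens once, in x264_mc_init():
 *
 *     x264_mc_functions_t mc;
 *     x264_mc_init( cpu_flags, &mc, 0 );
 *     // 16x16 luma MC from the hpel planes of a reference frame,
 *     // quarter-pel MV (mvx,mvy), no weighted prediction:
 *     mc.mc_luma( dst, dst_stride, hpel_planes, ref_stride,
 *                 mvx, mvy, 16, 16, x264_weight_none );
 */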
#define x264_mc_init x264_template(mc_init)
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );

#endif