/*****************************************************************************
 * mc-c.c: arm motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2018 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common/common.h"
#include "mc.h"

#define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );

#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );

#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );

#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );

#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
                                        pixel *dstv, intptr_t i_dstv,
                                        pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
                                            pixel *dstb, intptr_t i_dstb,
                                            pixel *dstc, intptr_t i_dstc,
                                            pixel *src, intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
                                     pixel *src, intptr_t i_src, int w, int h );
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );

#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
#if !HIGH_BIT_DEPTH
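/* MC_WEIGHT declares the NEON weighting kernels for one variant (widths 4,
 * 8, 16 and 20) and collects them into a table indexed by i_width>>2, the
 * same indexing used by mc_luma_neon/get_ref_neon below.  Four variants are
 * instantiated: the generic weight, a no-denominator shortcut, and the pure
 * offset add/subtract paths. */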
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
static weight_fn_t mc##func##_wtab_neon[6] =\
{\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w8##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w20##func##_neon,\
};

MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
#endif
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );

#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );

#define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
#define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
#define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );

#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
void x264_integral_init8v_neon( uint16_t *, intptr_t );

#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
#if !HIGH_BIT_DEPTH
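/* Installed as pf->weight_cache: picks the weighting table for a given
 * weight.  A scale equal to 1<<denom reduces to a pure offset (add or
 * subtract, with the absolute offset cached in cachea[0]), a zero denom
 * skips the rounding/shift, and the general case takes the full
 * scale/denom/offset path. */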
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
        {
            w->weightfn = mc_offsetsub_wtab_neon;
            w->cachea[0] = -w->i_offset;
        }
        else
        {
            w->weightfn = mc_offsetadd_wtab_neon;
            w->cachea[0] = w->i_offset;
        }
    }
    else if( !w->i_denom )
        w->weightfn = mc_nodenom_wtab_neon;
    else
        w->weightfn = mc_wtab_neon;
}
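
/* Width-indexed dispatch tables for the half-pel averaging and block-copy
 * kernels; the index is i_width>>2, so entry 3 (width 12) reuses the
 * 16-wide average and is left NULL in the copy table, since that slot is
 * never reached on the plain-copy path. */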
static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
    NULL,
    x264_pixel_avg2_w4_neon,
    x264_pixel_avg2_w8_neon,
    x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
    x264_pixel_avg2_w16_neon,
    x264_pixel_avg2_w20_neon,
};

static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
    NULL,
    x264_mc_copy_w4_neon,
    x264_mc_copy_w8_neon,
    NULL,
    x264_mc_copy_w16_neon,
};
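
/* Luma fetch: qpel_idx packs the fractional MV bits ((mvy&3)<<2 | (mvx&3))
 * and src[] holds the four precomputed planes (full-pel plus the three
 * half-pel filter outputs).  If either component sits at an odd quarter-pel
 * phase (qpel_idx & 5), the result is the average of two neighbouring
 * half-pel planes; otherwise it is a plain copy.  Weighted prediction is
 * applied on top when weightfn is set. */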
static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
                          uint8_t *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( (mvy&3) == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg_wtab_neon[i_width>>2](
            dst, i_dst_stride, src1, i_src_stride,
            src2, i_height );
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
    else if( weight->weightfn )
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
    else
        mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
}
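
/* Same interpolation logic as mc_luma_neon, but when neither averaging nor
 * weighting is needed it avoids the copy entirely and returns a pointer
 * straight into the reference plane, updating *i_dst_stride to the source
 * stride. */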
static uint8_t *get_ref_neon( uint8_t *dst,    intptr_t *i_dst_stride,
                              uint8_t *src[4], intptr_t i_src_stride,
                              int mvx, int mvy,
                              int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( (mvy&3) == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg_wtab_neon[i_width>>2](
            dst, *i_dst_stride, src1, i_src_stride,
            src2, i_height );
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}
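
/* Half-pel plane generation: rewind all row pointers to the previous
 * 16-byte boundary and widen the row accordingly so the NEON kernels always
 * start from an aligned address; the extra leading pixels are filtered
 * redundantly but the requested output region is unchanged. */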
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              intptr_t stride, int width, int height, int16_t *buf )
{
    intptr_t realign = (intptr_t)src & 15;
    src  -= realign;
    dstv -= realign;
    dstc -= realign;
    dsth -= realign;
    width += realign;
    while( height-- )
    {
        x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );
        x264_hpel_filter_c_neon( dstc, buf+8, width );
        x264_hpel_filter_h_neon( dsth, src, width );
        dsth += stride;
        dstv += stride;
        dstc += stride;
        src  += stride;
    }
}
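
/* Wrapper generators from common/mc.h: these expand to plane_copy_neon,
 * plane_copy_swap_neon, plane_copy_interleave_neon and
 * mbtree_propagate_list_neon, the C-level entry points installed in
 * x264_mc_init_arm() below, which delegate the bulk of the work to the NEON
 * kernels. */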
PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
PROPAGATE_LIST(neon)
#endif // !HIGH_BIT_DEPTH
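
/* Runtime dispatch: ARMv6 only provides the prefetch helpers; everything
 * else requires NEON.  The pixel kernels are 8-bit only, so in
 * HIGH_BIT_DEPTH builds only the aligned memcpy/memzero helpers are
 * installed. */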
void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
{
    if( !(cpu&X264_CPU_ARMV6) )
        return;

#if !HIGH_BIT_DEPTH
    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
    pf->prefetch_ref = x264_prefetch_ref_arm;
#endif // !HIGH_BIT_DEPTH

    if( !(cpu&X264_CPU_NEON) )
        return;

#if !HIGH_BIT_DEPTH
    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;

    pf->plane_copy = plane_copy_neon;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
    pf->plane_copy_interleave = plane_copy_interleave_neon;
    pf->plane_copy_swap = plane_copy_swap_neon;

    pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

    pf->weight       = mc_wtab_neon;
    pf->offsetadd    = mc_offsetadd_wtab_neon;
    pf->offsetsub    = mc_offsetsub_wtab_neon;
    pf->weight_cache = weight_cache_neon;

    pf->mc_chroma = x264_mc_chroma_neon;
    pf->mc_luma = mc_luma_neon;
    pf->get_ref = get_ref_neon;
    pf->hpel_filter = hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

    pf->integral_init4h = x264_integral_init4h_neon;
    pf->integral_init8h = x264_integral_init8h_neon;
    pf->integral_init4v = x264_integral_init4v_neon;
    pf->integral_init8v = x264_integral_init8v_neon;

    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
    pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH

    // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
    pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
    pf->memzero_aligned = x264_memzero_aligned_neon;
}