/*****************************************************************************
 * mc-c.c: aarch64 motion compensation
 *****************************************************************************
 * Copyright (C) 2009-2018 x264 project
 *
 * Authors: David Conrad <lessen42@gmail.com>
 *          Janne Grunau <janne-x264@jannau.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "mc.h"
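
/* Prototypes for the hand-written AArch64/NEON routines; the implementations
 * live in the matching aarch64 assembly sources. Each symbol is wrapped in
 * x264_template() so that builds for different bit depths get distinct names. */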
#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
void x264_memzero_aligned_neon( void *dst, size_t n );
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
                                     pixel *src, intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
                                        pixel *dstv, intptr_t i_dstv,
                                        pixel *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
                                            pixel *dstb, intptr_t i_dstb,
                                            pixel *dstc, intptr_t i_dstc,
                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
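
/* Declare the weighted-prediction functions for each supported block width
 * and build a function table indexed by width/4 (the same indexing used by
 * mc_luma_neon and get_ref_neon below). */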
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
static void (* mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
{\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w4##func##_neon,\
    x264_mc_weight_w8##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w16##func##_neon,\
    x264_mc_weight_w20##func##_neon,\
};

#if !HIGH_BIT_DEPTH
MC_WEIGHT()
MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
#endif
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
void x264_integral_init8v_neon( uint16_t *, intptr_t );
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );

#if !HIGH_BIT_DEPTH
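/* Pick the cheapest weighted-prediction variant for the given weights:
 * offset-only (add or subtract), no-denominator, or the full scale+offset
 * path. */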
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
        {
            w->weightfn = mc_offsetsub_wtab_neon;
            w->cachea[0] = -w->i_offset;
        }
        else
        {
            w->weightfn = mc_offsetadd_wtab_neon;
            w->cachea[0] = w->i_offset;
        }
    }
    else if( !w->i_denom )
        w->weightfn = mc_nodenom_wtab_neon;
    else
        w->weightfn = mc_wtab_neon;
}
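
/* Half-pel averaging and block-copy helpers, indexed by block width / 4. */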
static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
    NULL,
    x264_pixel_avg2_w4_neon,
    x264_pixel_avg2_w8_neon,
    x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
    x264_pixel_avg2_w16_neon,
    x264_pixel_avg2_w20_neon,
};

static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
    NULL,
    x264_mc_copy_w4_neon,
    x264_mc_copy_w8_neon,
    NULL,
    x264_mc_copy_w16_neon,
};
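
/* Luma motion compensation: select the two nearest half-pel planes from the
 * quarter-pel MV, average them when quarter-pel interpolation is needed, then
 * apply weighted prediction if requested. */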
static void mc_luma_neon( uint8_t *dst,    intptr_t i_dst_stride,
                          uint8_t *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( (mvy&3) == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg_wtab_neon[i_width>>2](
                dst, i_dst_stride, src1, i_src_stride,
                src2, i_height );
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
    else if( weight->weightfn )
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
    else
        mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
}
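
/* Like mc_luma_neon, but when no interpolation or weighting is required it
 * returns a pointer straight into the reference plane instead of copying. */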
static uint8_t *get_ref_neon( uint8_t *dst,    intptr_t *i_dst_stride,
                              uint8_t *src[4], intptr_t i_src_stride,
                              int mvx, int mvy,
                              int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( (mvy&3) == 3 ) // explicit if() to force conditional add
        src1 += i_src_stride;

    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        pixel_avg_wtab_neon[i_width>>2](
                dst, *i_dst_stride, src1, i_src_stride,
                src2, i_height );
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

#define x264_hpel_filter_neon x264_template(hpel_filter_neon)
void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                            uint8_t *src, intptr_t stride, int width,
                            int height, int16_t *buf );

PLANE_COPY(16, neon)
PLANE_COPY_SWAP(16, neon)
PLANE_INTERLEAVE(neon)
PROPAGATE_LIST(neon)
#endif // !HIGH_BIT_DEPTH
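
/* Wire up the MC function table with the AArch64/NEON implementations
 * available for the current CPU flags (8-bit depth only). */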
void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
    if( cpu&X264_CPU_ARMV8 )
    {
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
    }

    if( !(cpu&X264_CPU_NEON) )
        return;

    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;

    pf->plane_copy = plane_copy_neon;
    pf->plane_copy_swap = plane_copy_swap_neon;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
    pf->plane_copy_interleave = plane_copy_interleave_neon;

    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
    pf->store_interleave_chroma = x264_store_interleave_chroma_neon;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

    pf->weight       = mc_wtab_neon;
    pf->offsetadd    = mc_offsetadd_wtab_neon;
    pf->offsetsub    = mc_offsetsub_wtab_neon;
    pf->weight_cache = weight_cache_neon;

    pf->mc_chroma = x264_mc_chroma_neon;
    pf->mc_luma   = mc_luma_neon;
    pf->get_ref   = get_ref_neon;

    pf->hpel_filter = x264_hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

    pf->integral_init4h = x264_integral_init4h_neon;
    pf->integral_init8h = x264_integral_init8h_neon;
    pf->integral_init4v = x264_integral_init4v_neon;
    pf->integral_init8v = x264_integral_init8v_neon;

    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
    pf->mbtree_fix8_pack   = x264_mbtree_fix8_pack_neon;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;

    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
    pf->memzero_aligned = x264_memzero_aligned_neon;
#endif // !HIGH_BIT_DEPTH
}