/*****************************************************************************
 * mc-c.c: x86 motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2018 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include "common/common.h"
#include "mc.h"

#define x264_pixel_avg_16x16_avx2 x264_template(pixel_avg_16x16_avx2)
#define x264_pixel_avg_16x16_avx512 x264_template(pixel_avg_16x16_avx512)
#define x264_pixel_avg_16x16_mmx2 x264_template(pixel_avg_16x16_mmx2)
#define x264_pixel_avg_16x16_sse2 x264_template(pixel_avg_16x16_sse2)
#define x264_pixel_avg_16x16_ssse3 x264_template(pixel_avg_16x16_ssse3)
#define x264_pixel_avg_16x8_avx2 x264_template(pixel_avg_16x8_avx2)
#define x264_pixel_avg_16x8_avx512 x264_template(pixel_avg_16x8_avx512)
#define x264_pixel_avg_16x8_mmx2 x264_template(pixel_avg_16x8_mmx2)
#define x264_pixel_avg_16x8_sse2 x264_template(pixel_avg_16x8_sse2)
#define x264_pixel_avg_16x8_ssse3 x264_template(pixel_avg_16x8_ssse3)
#define x264_pixel_avg_4x16_mmx2 x264_template(pixel_avg_4x16_mmx2)
#define x264_pixel_avg_4x16_sse2 x264_template(pixel_avg_4x16_sse2)
#define x264_pixel_avg_4x16_ssse3 x264_template(pixel_avg_4x16_ssse3)
#define x264_pixel_avg_4x2_mmx2 x264_template(pixel_avg_4x2_mmx2)
#define x264_pixel_avg_4x2_sse2 x264_template(pixel_avg_4x2_sse2)
#define x264_pixel_avg_4x2_ssse3 x264_template(pixel_avg_4x2_ssse3)
#define x264_pixel_avg_4x4_mmx2 x264_template(pixel_avg_4x4_mmx2)
#define x264_pixel_avg_4x4_sse2 x264_template(pixel_avg_4x4_sse2)
#define x264_pixel_avg_4x4_ssse3 x264_template(pixel_avg_4x4_ssse3)
#define x264_pixel_avg_4x8_mmx2 x264_template(pixel_avg_4x8_mmx2)
#define x264_pixel_avg_4x8_sse2 x264_template(pixel_avg_4x8_sse2)
#define x264_pixel_avg_4x8_ssse3 x264_template(pixel_avg_4x8_ssse3)
#define x264_pixel_avg_8x16_avx512 x264_template(pixel_avg_8x16_avx512)
#define x264_pixel_avg_8x16_mmx2 x264_template(pixel_avg_8x16_mmx2)
#define x264_pixel_avg_8x16_sse2 x264_template(pixel_avg_8x16_sse2)
#define x264_pixel_avg_8x16_ssse3 x264_template(pixel_avg_8x16_ssse3)
#define x264_pixel_avg_8x4_avx512 x264_template(pixel_avg_8x4_avx512)
#define x264_pixel_avg_8x4_mmx2 x264_template(pixel_avg_8x4_mmx2)
#define x264_pixel_avg_8x4_sse2 x264_template(pixel_avg_8x4_sse2)
#define x264_pixel_avg_8x4_ssse3 x264_template(pixel_avg_8x4_ssse3)
#define x264_pixel_avg_8x8_avx512 x264_template(pixel_avg_8x8_avx512)
#define x264_pixel_avg_8x8_mmx2 x264_template(pixel_avg_8x8_mmx2)
#define x264_pixel_avg_8x8_sse2 x264_template(pixel_avg_8x8_sse2)
#define x264_pixel_avg_8x8_ssse3 x264_template(pixel_avg_8x8_ssse3)
#define DECL_SUF( func, args )\
    void func##_mmx2 args;\
    void func##_sse2 args;\
    void func##_ssse3 args;\
    void func##_avx2 args;\
    void func##_avx512 args;
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_8x16,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_8x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_8x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x16,  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x8,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x4,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_4x2,   ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
#undef DECL_SUF
#define x264_mc_weight_w12_mmx2 x264_template(mc_weight_w12_mmx2)
#define x264_mc_weight_w12_sse2 x264_template(mc_weight_w12_sse2)
#define x264_mc_weight_w16_avx2 x264_template(mc_weight_w16_avx2)
#define x264_mc_weight_w16_mmx2 x264_template(mc_weight_w16_mmx2)
#define x264_mc_weight_w16_sse2 x264_template(mc_weight_w16_sse2)
#define x264_mc_weight_w16_ssse3 x264_template(mc_weight_w16_ssse3)
#define x264_mc_weight_w20_avx2 x264_template(mc_weight_w20_avx2)
#define x264_mc_weight_w20_mmx2 x264_template(mc_weight_w20_mmx2)
#define x264_mc_weight_w20_sse2 x264_template(mc_weight_w20_sse2)
#define x264_mc_weight_w20_ssse3 x264_template(mc_weight_w20_ssse3)
#define x264_mc_weight_w4_mmx2 x264_template(mc_weight_w4_mmx2)
#define x264_mc_weight_w4_ssse3 x264_template(mc_weight_w4_ssse3)
#define x264_mc_weight_w8_avx2 x264_template(mc_weight_w8_avx2)
#define x264_mc_weight_w8_mmx2 x264_template(mc_weight_w8_mmx2)
#define x264_mc_weight_w8_sse2 x264_template(mc_weight_w8_sse2)
#define x264_mc_weight_w8_ssse3 x264_template(mc_weight_w8_ssse3)
#define MC_WEIGHT(w,type) \
    void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define x264_mc_offsetadd_w12_mmx2 x264_template(mc_offsetadd_w12_mmx2)
#define x264_mc_offsetadd_w16_mmx2 x264_template(mc_offsetadd_w16_mmx2)
#define x264_mc_offsetadd_w16_sse2 x264_template(mc_offsetadd_w16_sse2)
#define x264_mc_offsetadd_w20_mmx2 x264_template(mc_offsetadd_w20_mmx2)
#define x264_mc_offsetadd_w20_sse2 x264_template(mc_offsetadd_w20_sse2)
#define x264_mc_offsetadd_w4_mmx2 x264_template(mc_offsetadd_w4_mmx2)
#define x264_mc_offsetadd_w8_mmx2 x264_template(mc_offsetadd_w8_mmx2)
#define x264_mc_offsetadd_w8_sse2 x264_template(mc_offsetadd_w8_sse2)
#define x264_mc_offsetsub_w12_mmx2 x264_template(mc_offsetsub_w12_mmx2)
#define x264_mc_offsetsub_w16_mmx2 x264_template(mc_offsetsub_w16_mmx2)
#define x264_mc_offsetsub_w16_sse2 x264_template(mc_offsetsub_w16_sse2)
#define x264_mc_offsetsub_w20_mmx2 x264_template(mc_offsetsub_w20_mmx2)
#define x264_mc_offsetsub_w20_sse2 x264_template(mc_offsetsub_w20_sse2)
#define x264_mc_offsetsub_w4_mmx2 x264_template(mc_offsetsub_w4_mmx2)
#define x264_mc_offsetsub_w8_mmx2 x264_template(mc_offsetsub_w8_mmx2)
#define x264_mc_offsetsub_w8_sse2 x264_template(mc_offsetsub_w8_sse2)
#define MC_WEIGHT_OFFSET(w,type) \
    void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
    void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
    MC_WEIGHT(w,type)
MC_WEIGHT_OFFSET( 4, mmx2 )
MC_WEIGHT_OFFSET( 8, mmx2 )
MC_WEIGHT_OFFSET( 12, mmx2 )
MC_WEIGHT_OFFSET( 16, mmx2 )
MC_WEIGHT_OFFSET( 20, mmx2 )
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
#if HIGH_BIT_DEPTH
MC_WEIGHT_OFFSET( 8, sse2 )
#endif
MC_WEIGHT( 8, sse2 )
MC_WEIGHT( 4, ssse3 )
MC_WEIGHT( 8, ssse3 )
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
MC_WEIGHT( 8, avx2 )
MC_WEIGHT( 16, avx2 )
MC_WEIGHT( 20, avx2 )
#undef MC_WEIGHT_OFFSET
#undef MC_WEIGHT
#define x264_mc_copy_w4_mmx x264_template(mc_copy_w4_mmx)
void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w8_mmx x264_template(mc_copy_w8_mmx)
void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w8_sse x264_template(mc_copy_w8_sse)
void x264_mc_copy_w8_sse ( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w16_mmx x264_template(mc_copy_w16_mmx)
void x264_mc_copy_w16_mmx( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w16_sse x264_template(mc_copy_w16_sse)
void x264_mc_copy_w16_sse( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w16_aligned_sse x264_template(mc_copy_w16_aligned_sse)
void x264_mc_copy_w16_aligned_sse( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_mc_copy_w16_avx x264_template(mc_copy_w16_avx)
void x264_mc_copy_w16_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
#define x264_mc_copy_w16_aligned_avx x264_template(mc_copy_w16_aligned_avx)
void x264_mc_copy_w16_aligned_avx( uint16_t *, intptr_t, uint16_t *, intptr_t, int );
#define x264_prefetch_fenc_400_mmx2 x264_template(prefetch_fenc_400_mmx2)
void x264_prefetch_fenc_400_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_prefetch_fenc_420_mmx2 x264_template(prefetch_fenc_420_mmx2)
void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_prefetch_fenc_422_mmx2 x264_template(prefetch_fenc_422_mmx2)
void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
#define x264_prefetch_ref_mmx2 x264_template(prefetch_ref_mmx2)
void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
#define x264_plane_copy_core_sse x264_template(plane_copy_core_sse)
void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define x264_plane_copy_core_avx x264_template(plane_copy_core_avx)
void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define x264_plane_copy_avx512 x264_template(plane_copy_avx512)
void x264_plane_copy_avx512( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define x264_plane_copy_swap_core_ssse3 x264_template(plane_copy_swap_core_ssse3)
void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define x264_plane_copy_swap_core_avx2 x264_template(plane_copy_swap_core_avx2)
void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define x264_plane_copy_swap_avx512 x264_template(plane_copy_swap_avx512)
void x264_plane_copy_swap_avx512( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
#define x264_plane_copy_interleave_core_mmx2 x264_template(plane_copy_interleave_core_mmx2)
void x264_plane_copy_interleave_core_mmx2( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_interleave_core_sse2 x264_template(plane_copy_interleave_core_sse2)
void x264_plane_copy_interleave_core_sse2( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_interleave_core_avx x264_template(plane_copy_interleave_core_avx)
void x264_plane_copy_interleave_core_avx( pixel *dst,  intptr_t i_dst,
                                          pixel *srcu, intptr_t i_srcu,
                                          pixel *srcv, intptr_t i_srcv, int w, int h );
#define x264_plane_copy_deinterleave_sse2 x264_template(plane_copy_deinterleave_sse2)
void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta,
                                        pixel *dstb, intptr_t i_dstb,
                                        pixel *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_ssse3 x264_template(plane_copy_deinterleave_ssse3)
void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta,
                                         uint8_t *dstb, intptr_t i_dstb,
                                         uint8_t *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_avx x264_template(plane_copy_deinterleave_avx)
void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta,
                                       uint16_t *dstb, intptr_t i_dstb,
                                       uint16_t *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_avx2 x264_template(plane_copy_deinterleave_avx2)
void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta,
                                        pixel *dstb, intptr_t i_dstb,
                                        pixel *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_rgb_sse2 x264_template(plane_copy_deinterleave_rgb_sse2)
void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_deinterleave_rgb_ssse3 x264_template(plane_copy_deinterleave_rgb_ssse3)
void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_deinterleave_rgb_avx2 x264_template(plane_copy_deinterleave_rgb_avx2)
void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
                                             pixel *dstb, intptr_t i_dstb,
                                             pixel *dstc, intptr_t i_dstc,
                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
#define x264_plane_copy_deinterleave_v210_ssse3 x264_template(plane_copy_deinterleave_v210_ssse3)
void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu,
                                               uint16_t *dstv, intptr_t i_dstv,
                                               uint32_t *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_v210_avx x264_template(plane_copy_deinterleave_v210_avx)
void x264_plane_copy_deinterleave_v210_avx   ( uint16_t *dstu, intptr_t i_dstu,
                                               uint16_t *dstv, intptr_t i_dstv,
                                               uint32_t *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_v210_avx2 x264_template(plane_copy_deinterleave_v210_avx2)
void x264_plane_copy_deinterleave_v210_avx2  ( uint16_t *dstu, intptr_t i_dstu,
                                               uint16_t *dstv, intptr_t i_dstv,
                                               uint32_t *src,  intptr_t i_src, int w, int h );
#define x264_plane_copy_deinterleave_v210_avx512 x264_template(plane_copy_deinterleave_v210_avx512)
void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu,
                                               uint16_t *dstv, intptr_t i_dstv,
                                               uint32_t *src,  intptr_t i_src, int w, int h );
#define x264_store_interleave_chroma_mmx2 x264_template(store_interleave_chroma_mmx2)
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_store_interleave_chroma_sse2 x264_template(store_interleave_chroma_sse2)
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_store_interleave_chroma_avx x264_template(store_interleave_chroma_avx)
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
#define x264_load_deinterleave_chroma_fenc_sse2 x264_template(load_deinterleave_chroma_fenc_sse2)
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_ssse3 x264_template(load_deinterleave_chroma_fenc_ssse3)
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_avx x264_template(load_deinterleave_chroma_fenc_avx)
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2)
void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_avx512 x264_template(load_deinterleave_chroma_fenc_avx512)
void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2)
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3)
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_avx x264_template(load_deinterleave_chroma_fdec_avx)
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_avx2 x264_template(load_deinterleave_chroma_fdec_avx2)
void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_avx512 x264_template(load_deinterleave_chroma_fdec_avx512)
void x264_load_deinterleave_chroma_fdec_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
#define x264_memcpy_aligned_sse x264_template(memcpy_aligned_sse)
void *x264_memcpy_aligned_sse   ( void *dst, const void *src, size_t n );
#define x264_memcpy_aligned_avx x264_template(memcpy_aligned_avx)
void *x264_memcpy_aligned_avx   ( void *dst, const void *src, size_t n );
#define x264_memcpy_aligned_avx512 x264_template(memcpy_aligned_avx512)
void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
#define x264_memzero_aligned_sse x264_template(memzero_aligned_sse)
void x264_memzero_aligned_sse   ( void *dst, size_t n );
#define x264_memzero_aligned_avx x264_template(memzero_aligned_avx)
void x264_memzero_aligned_avx   ( void *dst, size_t n );
#define x264_memzero_aligned_avx512 x264_template(memzero_aligned_avx512)
void x264_memzero_aligned_avx512( void *dst, size_t n );
#define x264_integral_init4h_sse4 x264_template(integral_init4h_sse4)
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
#define x264_integral_init4h_avx2 x264_template(integral_init4h_avx2)
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
#define x264_integral_init8h_sse4 x264_template(integral_init8h_sse4)
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
#define x264_integral_init8h_avx x264_template(integral_init8h_avx)
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
#define x264_integral_init8h_avx2 x264_template(integral_init8h_avx2)
void x264_integral_init8h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
#define x264_integral_init4v_mmx x264_template(integral_init4v_mmx)
void x264_integral_init4v_mmx  ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
#define x264_integral_init4v_sse2 x264_template(integral_init4v_sse2)
void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
#define x264_integral_init4v_ssse3 x264_template(integral_init4v_ssse3)
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
#define x264_integral_init4v_avx2 x264_template(integral_init4v_avx2)
void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
#define x264_integral_init8v_mmx x264_template(integral_init8v_mmx)
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
#define x264_integral_init8v_sse2 x264_template(integral_init8v_sse2)
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
#define x264_integral_init8v_avx2 x264_template(integral_init8v_avx2)
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
#define x264_mbtree_propagate_cost_sse2 x264_template(mbtree_propagate_cost_sse2)
void x264_mbtree_propagate_cost_sse2  ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                        uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define x264_mbtree_propagate_cost_avx x264_template(mbtree_propagate_cost_avx)
void x264_mbtree_propagate_cost_avx   ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                        uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define x264_mbtree_propagate_cost_fma4 x264_template(mbtree_propagate_cost_fma4)
void x264_mbtree_propagate_cost_fma4  ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                        uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define x264_mbtree_propagate_cost_avx2 x264_template(mbtree_propagate_cost_avx2)
void x264_mbtree_propagate_cost_avx2  ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                        uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define x264_mbtree_propagate_cost_avx512 x264_template(mbtree_propagate_cost_avx512)
void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                        uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define x264_mbtree_fix8_pack_ssse3 x264_template(mbtree_fix8_pack_ssse3)
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_pack_avx2 x264_template(mbtree_fix8_pack_avx2)
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_pack_avx512 x264_template(mbtree_fix8_pack_avx512)
void x264_mbtree_fix8_pack_avx512( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_ssse3 x264_template(mbtree_fix8_unpack_ssse3)
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
#define x264_mbtree_fix8_unpack_avx2 x264_template(mbtree_fix8_unpack_avx2)
void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count );
#define x264_mbtree_fix8_unpack_avx512 x264_template(mbtree_fix8_unpack_avx512)
void x264_mbtree_fix8_unpack_avx512( float *dst, uint16_t *src, int count );
#define x264_mc_chroma_avx x264_template(mc_chroma_avx)
#define x264_mc_chroma_avx2 x264_template(mc_chroma_avx2)
#define x264_mc_chroma_cache64_ssse3 x264_template(mc_chroma_cache64_ssse3)
#define x264_mc_chroma_mmx2 x264_template(mc_chroma_mmx2)
#define x264_mc_chroma_sse2 x264_template(mc_chroma_sse2)
#define x264_mc_chroma_ssse3 x264_template(mc_chroma_ssse3)
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
                           int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(ssse3)
MC_CHROMA(cache64_ssse3)
MC_CHROMA(avx)
MC_CHROMA(avx2)
#undef MC_CHROMA
#define x264_frame_init_lowres_core_avx x264_template(frame_init_lowres_core_avx)
#define x264_frame_init_lowres_core_avx2 x264_template(frame_init_lowres_core_avx2)
#define x264_frame_init_lowres_core_mmx2 x264_template(frame_init_lowres_core_mmx2)
#define x264_frame_init_lowres_core_cache32_mmx2 x264_template(frame_init_lowres_core_cache32_mmx2)
#define x264_frame_init_lowres_core_sse2 x264_template(frame_init_lowres_core_sse2)
#define x264_frame_init_lowres_core_ssse3 x264_template(frame_init_lowres_core_ssse3)
#define x264_frame_init_lowres_core_xop x264_template(frame_init_lowres_core_xop)
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
LOWRES(mmx2)
LOWRES(cache32_mmx2)
LOWRES(sse2)
LOWRES(ssse3)
LOWRES(avx)
LOWRES(xop)
LOWRES(avx2)
#undef LOWRES
#define x264_pixel_avg2_w10_mmx2 x264_template(pixel_avg2_w10_mmx2)
#define x264_pixel_avg2_w10_sse2 x264_template(pixel_avg2_w10_sse2)
#define x264_pixel_avg2_w12_cache32_mmx2 x264_template(pixel_avg2_w12_cache32_mmx2)
#define x264_pixel_avg2_w12_cache64_mmx2 x264_template(pixel_avg2_w12_cache64_mmx2)
#define x264_pixel_avg2_w12_mmx2 x264_template(pixel_avg2_w12_mmx2)
#define x264_pixel_avg2_w16_avx2 x264_template(pixel_avg2_w16_avx2)
#define x264_pixel_avg2_w16_cache32_mmx2 x264_template(pixel_avg2_w16_cache32_mmx2)
#define x264_pixel_avg2_w16_cache64_mmx2 x264_template(pixel_avg2_w16_cache64_mmx2)
#define x264_pixel_avg2_w16_cache64_sse2 x264_template(pixel_avg2_w16_cache64_sse2)
#define x264_pixel_avg2_w16_cache64_ssse3 x264_template(pixel_avg2_w16_cache64_ssse3)
#define x264_pixel_avg2_w16_mmx2 x264_template(pixel_avg2_w16_mmx2)
#define x264_pixel_avg2_w16_sse2 x264_template(pixel_avg2_w16_sse2)
#define x264_pixel_avg2_w18_avx2 x264_template(pixel_avg2_w18_avx2)
#define x264_pixel_avg2_w18_mmx2 x264_template(pixel_avg2_w18_mmx2)
#define x264_pixel_avg2_w18_sse2 x264_template(pixel_avg2_w18_sse2)
#define x264_pixel_avg2_w20_avx2 x264_template(pixel_avg2_w20_avx2)
#define x264_pixel_avg2_w20_cache32_mmx2 x264_template(pixel_avg2_w20_cache32_mmx2)
#define x264_pixel_avg2_w20_cache64_mmx2 x264_template(pixel_avg2_w20_cache64_mmx2)
#define x264_pixel_avg2_w20_cache64_sse2 x264_template(pixel_avg2_w20_cache64_sse2)
#define x264_pixel_avg2_w20_mmx2 x264_template(pixel_avg2_w20_mmx2)
#define x264_pixel_avg2_w20_sse2 x264_template(pixel_avg2_w20_sse2)
#define x264_pixel_avg2_w4_mmx2 x264_template(pixel_avg2_w4_mmx2)
#define x264_pixel_avg2_w8_cache32_mmx2 x264_template(pixel_avg2_w8_cache32_mmx2)
#define x264_pixel_avg2_w8_cache64_mmx2 x264_template(pixel_avg2_w8_cache64_mmx2)
#define x264_pixel_avg2_w8_mmx2 x264_template(pixel_avg2_w8_mmx2)
#define x264_pixel_avg2_w8_sse2 x264_template(pixel_avg2_w8_sse2)
#define PIXEL_AVG_W(width,cpu)\
void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
PIXEL_AVG_WALL(mmx2)
PIXEL_AVG_WALL(cache32_mmx2)
PIXEL_AVG_WALL(cache64_mmx2)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(cache64_ssse3)
PIXEL_AVG_WALL(avx2)
#undef PIXEL_AVG_W
#undef PIXEL_AVG_WALL
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
{\
    NULL,\
    x264_pixel_avg2_w4_##name1,\
    x264_pixel_avg2_w8_##name2,\
    x264_pixel_avg2_w12_##name3,\
    x264_pixel_avg2_w16_##name4,\
    x264_pixel_avg2_w20_##name5,\
};
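/* These tables are indexed by i_width>>2 (see MC_LUMA/GET_REF below): widths
 * 4/8/12/16/20 map to entries 1-5, and entry 0 is an unused NULL. For
 * example, a 16-wide partition dispatches to x264_pixel_avg2_w16_<cpu>. */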
#if HIGH_BIT_DEPTH
/* we can replace w12/w20 with w10/w18, since only 9/17 pixels are actually needed */
#undef x264_pixel_avg2_w12_mmx2
#undef x264_pixel_avg2_w20_mmx2
#undef x264_pixel_avg2_w20_sse2
#undef x264_pixel_avg2_w20_avx2
#define x264_pixel_avg2_w12_mmx2 x264_pixel_avg2_w10_mmx2
#define x264_pixel_avg2_w20_mmx2 x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2
#define x264_pixel_avg2_w12_avx2 x264_pixel_avg2_w16_avx2
#define x264_pixel_avg2_w20_avx2 x264_pixel_avg2_w18_avx2
#else
/* w16 sse2 is faster than w12 mmx, as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2
#endif // HIGH_BIT_DEPTH

PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(avx2, mmx2, sse2, avx2, avx2, avx2)
#else // !HIGH_BIT_DEPTH
#if ARCH_X86
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
#endif
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
PIXEL_AVG_WTAB(avx2, mmx2, mmx2, sse2, sse2, avx2)
#endif // HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
{\
    NULL,\
    x264_mc_copy_w4_##name1,\
    x264_mc_copy_w8_##name2,\
    NULL,\
    x264_mc_copy_w16_##name3,\
};
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
#if HIGH_BIT_DEPTH
MC_COPY_WTAB(sse,mmx,sse,sse)
MC_COPY_WTAB(avx,mmx,sse,avx)
#else
MC_COPY_WTAB(sse,mmx,mmx,sse)
#endif
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
static void (* mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
{\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w8_##name2,\
    x264_mc_##function##_w##w12version##_##instr,\
    x264_mc_##function##_w16_##instr,\
    x264_mc_##function##_w20_##instr,\
};
#if HIGH_BIT_DEPTH
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)

static void weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = abs(w->i_offset<<(BIT_DEPTH-8));
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    int den2 = w->i_scale<<1;
    int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
#else
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)
MC_WEIGHT_WTAB(weight,avx2,ssse3,avx2,16)

static void weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    int i;
    int16_t den1;

    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = 1 << (w->i_denom - 1) | w->i_offset << w->i_denom;
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = w->i_scale;
        w->cacheb[i] = den1;
    }
}

static void weight_cache_ssse3( x264_t *h, x264_weight_t *w )
{
    int i, den1;
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = w->i_scale << (8 - w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = w->i_offset;
    }
}
#endif // !HIGH_BIT_DEPTH
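/* For reference, the constants cached above feed SIMD kernels that evaluate
 * the H.264 explicit weighted-prediction formula. A scalar sketch of the
 * 8-bit path (illustrative only; weight_pixel_ref is not an upstream
 * function):
 *
 *   static inline int weight_pixel_ref( int px, const x264_weight_t *w )
 *   {
 *       // (px*scale + 2^(denom-1)) >> denom, plus the offset, clipped to pixel range
 *       int v = ((px * w->i_scale + (1 << (w->i_denom-1))) >> w->i_denom) + w->i_offset;
 *       return x264_clip3( v, 0, PIXEL_MAX );
 *   }
 *
 * weight_cache_mmx2 folds the rounding term and the offset into one constant,
 * den1 = 1<<(denom-1) | offset<<denom (OR equals addition here since
 * offset<<denom has zero low bits), so the kernel only needs one multiply and
 * one add before the shift. weight_cache_ssse3 instead pre-shifts the scale
 * to an 8-bit denominator (scale<<(8-denom)) and keeps the offset separate. */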
#define MC_LUMA(name,instr1,instr2)\
static void mc_luma_##name( pixel *dst,    intptr_t i_dst_stride,\
                            pixel *src[4], intptr_t i_src_stride,\
                            int mvx, int mvy,\
                            int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        pixel_avg_wtab_##instr1[i_width>>2](\
                dst, i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
    }\
    else if( weight->weightfn )\
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
    else\
        mc_copy_wtab_##instr2[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );\
}
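/* Worked example for the dispatch above: mv = (5,3) gives mvx&3 == 1 and
 * mvy&3 == 3, hence qpel_idx = (3<<2)+1 = 13. "qpel_idx & 5" tests bit 0 of
 * each component, i.e. whether either coordinate lies on an odd quarter-pel
 * position that must be synthesized by averaging two of the half-pel planes
 * in src[]; even offsets land exactly on one plane, so a plain (or weighted)
 * copy suffices. The "+ ((mvy&3) == 3) * i_src_stride" and "+ ((mvx&3) == 3)"
 * terms select the lower/right neighbour when a 3/4-pel offset rounds toward
 * the next half-pel sample. */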
MC_LUMA(mmx2,mmx2,mmx)
MC_LUMA(sse2,sse2,sse)
#if HIGH_BIT_DEPTH
MC_LUMA(avx2,avx2,avx)
#else
#if ARCH_X86
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
#endif // !HIGH_BIT_DEPTH
#define GET_REF(name)\
static pixel *get_ref_##name( pixel *dst,    intptr_t *i_dst_stride,\
                              pixel *src[4], intptr_t i_src_stride,\
                              int mvx, int mvy,\
                              int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        pixel_avg_wtab_##name[i_width>>2](\
                dst, *i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
        return dst;\
    }\
    else if( weight->weightfn )\
    {\
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
        return dst;\
    }\
    else\
    {\
        *i_dst_stride = i_src_stride;\
        return src1;\
    }\
}
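/* get_ref differs from mc_luma in that it can avoid the copy entirely: when
 * no interpolation or weighting is required, it exports the source stride
 * through *i_dst_stride and returns src1 directly instead of filling dst. */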
GET_REF(mmx2)
GET_REF(sse2)
GET_REF(avx2)
#if !HIGH_BIT_DEPTH
#if ARCH_X86
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
#endif
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
#endif // !HIGH_BIT_DEPTH
#define x264_hpel_filter_avx x264_template(hpel_filter_avx)
#define x264_hpel_filter_avx2 x264_template(hpel_filter_avx2)
#define x264_hpel_filter_c_mmx2 x264_template(hpel_filter_c_mmx2)
#define x264_hpel_filter_c_sse2 x264_template(hpel_filter_c_sse2)
#define x264_hpel_filter_c_ssse3 x264_template(hpel_filter_c_ssse3)
#define x264_hpel_filter_c_avx x264_template(hpel_filter_c_avx)
#define x264_hpel_filter_c_avx2 x264_template(hpel_filter_c_avx2)
#define x264_hpel_filter_h_mmx2 x264_template(hpel_filter_h_mmx2)
#define x264_hpel_filter_h_sse2 x264_template(hpel_filter_h_sse2)
#define x264_hpel_filter_h_ssse3 x264_template(hpel_filter_h_ssse3)
#define x264_hpel_filter_h_avx x264_template(hpel_filter_h_avx)
#define x264_hpel_filter_h_avx2 x264_template(hpel_filter_h_avx2)
#define x264_hpel_filter_sse2 x264_template(hpel_filter_sse2)
#define x264_hpel_filter_ssse3 x264_template(hpel_filter_ssse3)
#define x264_hpel_filter_v_mmx2 x264_template(hpel_filter_v_mmx2)
#define x264_hpel_filter_v_sse2 x264_template(hpel_filter_v_sse2)
#define x264_hpel_filter_v_ssse3 x264_template(hpel_filter_v_ssse3)
#define x264_hpel_filter_v_avx x264_template(hpel_filter_v_avx)
#define x264_hpel_filter_v_avx2 x264_template(hpel_filter_v_avx2)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width );\
void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
                                    intptr_t stride, int width, int height, int16_t *buf )\
{\
    intptr_t realign = (intptr_t)src & (align-1);\
    src  -= realign;\
    dstv -= realign;\
    dstc -= realign;\
    dsth -= realign;\
    width += realign;\
    while( height-- )\
    {\
        x264_hpel_filter_v_##cpuv( dstv, src, buf+16, stride, width );\
        x264_hpel_filter_c_##cpuc( dstc, buf+16, width );\
        x264_hpel_filter_h_##cpuh( dsth, src, width );\
        dsth += stride;\
        dstv += stride;\
        dstc += stride;\
        src  += stride;\
    }\
    x264_sfence();\
}
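/* The HPEL wrappers round src down to an <align>-byte boundary and widen the
 * row by the same amount, so the asm kernels can rely on aligned loads; the
 * few extra leftmost pixels are simply filtered redundantly. The trailing
 * x264_sfence() fences the non-temporal stores that some of the kernels use. */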
HPEL(8, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
HPEL(16, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmx2, mmx2, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx  ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
HPEL(32, avx2, avx2, avx2, avx2)
#endif
#endif // HIGH_BIT_DEPTH

PLANE_COPY(16, sse)
PLANE_COPY(32, avx)
PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)
#if HIGH_BIT_DEPTH
PLANE_COPY_YUYV(64, sse2)
PLANE_COPY_YUYV(64, avx)
#else
PLANE_COPY_YUYV(32, sse2)
PLANE_COPY_YUYV(32, ssse3)
#endif
PLANE_COPY_YUYV(64, avx2)
PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
PLANE_INTERLEAVE(avx)
#endif
#if HAVE_X86_INLINE_ASM
#undef MC_CLIP_ADD
#define MC_CLIP_ADD(s,x)\
do\
{\
    int temp;\
    asm("movd       %0, %%xmm0     \n"\
        "movd       %2, %%xmm1     \n"\
        "paddsw %%xmm1, %%xmm0     \n"\
        "movd   %%xmm0, %1         \n"\
        :"+m"(s), "=&r"(temp)\
        :"m"(x)\
    );\
    s = temp;\
} while( 0 )

#undef MC_CLIP_ADD2
#define MC_CLIP_ADD2(s,x)\
do\
{\
    asm("movd       %0, %%xmm0     \n"\
        "movd       %1, %%xmm1     \n"\
        "paddsw %%xmm1, %%xmm0     \n"\
        "movd   %%xmm0, %0         \n"\
        :"+m"(M32(s))\
        :"m"(M32(x))\
    );\
} while( 0 )
#endif
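/* Both macros perform a saturating signed 16-bit add via paddsw, clamping to
 * [-32768,32767] instead of wrapping: e.g. 30000 + 10000 yields 32767 rather
 * than wrapping to -25536. MC_CLIP_ADD updates a single int16 through a
 * temporary register; MC_CLIP_ADD2 adds two packed int16 values in place
 * through a 32-bit memory operand (M32). */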
#define x264_mbtree_propagate_list_internal_ssse3 x264_template(mbtree_propagate_list_internal_ssse3)
PROPAGATE_LIST(ssse3)
#define x264_mbtree_propagate_list_internal_avx x264_template(mbtree_propagate_list_internal_avx)
PROPAGATE_LIST(avx)
#define x264_mbtree_propagate_list_internal_avx2 x264_template(mbtree_propagate_list_internal_avx2)
PROPAGATE_LIST(avx2)

#if ARCH_X86_64
#define x264_mbtree_propagate_list_internal_avx512 x264_template(mbtree_propagate_list_internal_avx512)
void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
                                                 uint16_t *lowres_costs, int bipred_weight, int mb_y,
                                                 int width, int height, int stride, int list_mask );

static void mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
                                          int16_t *propagate_amount, uint16_t *lowres_costs,
                                          int bipred_weight, int mb_y, int len, int list )
{
    x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9,
                                                mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride,
                                                (1 << LOWRES_COST_SHIFT) << list );
}
#endif
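/* Note: the wrapper pre-scales its scalar arguments (bipred_weight << 9,
 * mb_y << 16, and a per-list mask built from LOWRES_COST_SHIFT), presumably
 * so the avx512 kernel can consume them without further shifting. */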
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
    if( !(cpu&X264_CPU_MMX) )
        return;

    pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
    pf->integral_init4v = x264_integral_init4v_mmx;
    pf->integral_init8v = x264_integral_init8v_mmx;

    if( !(cpu&X264_CPU_MMX2) )
        return;

    pf->prefetch_fenc_400 = x264_prefetch_fenc_400_mmx2;
    pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
    pf->prefetch_ref = x264_prefetch_ref_mmx2;

    pf->plane_copy_interleave = plane_copy_interleave_mmx2;
    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmx2;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmx2;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmx2;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmx2;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_mmx2;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmx2;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmx2;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmx2;

    pf->mc_luma = mc_luma_mmx2;
    pf->get_ref = get_ref_mmx2;
    pf->mc_chroma = x264_mc_chroma_mmx2;
    pf->hpel_filter = x264_hpel_filter_mmx2;
    pf->weight = mc_weight_wtab_mmx2;
    pf->weight_cache = weight_cache_mmx2;
    pf->offsetadd = mc_offsetadd_wtab_mmx2;
    pf->offsetsub = mc_offsetsub_wtab_mmx2;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;

    if( cpu&X264_CPU_SSE )
    {
        pf->memcpy_aligned  = x264_memcpy_aligned_sse;
        pf->memzero_aligned = x264_memzero_aligned_sse;
        pf->plane_copy = plane_copy_sse;
    }
#if HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
#endif

    if( !(cpu&X264_CPU_SSE2) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;

    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;

    pf->plane_copy_interleave = plane_copy_interleave_sse2;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
    pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_sse2;

    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pf->get_ref = get_ref_sse2;
        pf->mc_luma = mc_luma_sse2;
        pf->hpel_filter = x264_hpel_filter_sse2;
    }

    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
    pf->offsetadd = mc_offsetadd_wtab_sse2;
    pf->offsetsub = mc_offsetsub_wtab_sse2;

    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sse2;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sse2;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sse2;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sse2;

    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
    pf->weight = mc_weight_wtab_sse2;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_sse2;

    if( !(cpu&X264_CPU_SSSE3) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
    pf->plane_copy_swap = plane_copy_swap_ssse3;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
    pf->mbtree_propagate_list = mbtree_propagate_list_ssse3;
    pf->mbtree_fix8_pack   = x264_mbtree_fix8_pack_ssse3;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;

    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
        pf->integral_init4v = x264_integral_init4v_ssse3;

    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
    pf->plane_copy_interleave = plane_copy_interleave_avx;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
    pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx;
    pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
    pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;

    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;

    if( cpu&X264_CPU_AVX2 )
    {
        pf->mc_luma = mc_luma_avx2;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
    }

    if( cpu&X264_CPU_AVX512 )
    {
        pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512;
    }
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&X264_CPU_CACHELINE_32 )
    {
        pf->mc_luma = mc_luma_cache32_mmx2;
        pf->get_ref = get_ref_cache32_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
    }
    else if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf->mc_luma = mc_luma_cache64_mmx2;
        pf->get_ref = get_ref_cache64_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
    }
#endif

    if( !(cpu&X264_CPU_SSE2) )
        return;

    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->hpel_filter = x264_hpel_filter_sse2_amd;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
    pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_sse2;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;

    if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
    {
        pf->weight = mc_weight_wtab_sse2;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            pf->offsetadd = mc_offsetadd_wtab_sse2;
            pf->offsetsub = mc_offsetsub_wtab_sse2;
        }

        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
        pf->hpel_filter = x264_hpel_filter_sse2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
        if( !(cpu&X264_CPU_STACK_MOD4) )
            pf->mc_chroma = x264_mc_chroma_sse2;

        if( cpu&X264_CPU_SSE2_IS_FAST )
        {
            pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
            pf->plane_copy_interleave = plane_copy_interleave_sse2;
            pf->mc_luma = mc_luma_sse2;
            pf->get_ref = get_ref_sse2;
            if( cpu&X264_CPU_CACHELINE_64 )
            {
                pf->mc_luma = mc_luma_cache64_sse2;
                pf->get_ref = get_ref_cache64_sse2;
            }
        }
    }

    if( !(cpu&X264_CPU_SSSE3) )
        return;

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_ssse3;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_ssse3;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_ssse3;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_ssse3;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_ssse3;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
    pf->plane_copy_swap = plane_copy_swap_ssse3;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
    pf->mbtree_propagate_list = mbtree_propagate_list_ssse3;
    pf->mbtree_fix8_pack   = x264_mbtree_fix8_pack_ssse3;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;

    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
    {
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
        pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_ssse3;
    }

    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
    {
#if ARCH_X86_64
        if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
#endif
            pf->hpel_filter = x264_hpel_filter_ssse3;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
    }
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_ssse3;

    if( cpu&X264_CPU_CACHELINE_64 )
    {
        if( !(cpu&X264_CPU_STACK_MOD4) )
            pf->mc_chroma = x264_mc_chroma_cache64_ssse3;
        pf->mc_luma = mc_luma_cache64_ssse3;
        pf->get_ref = get_ref_cache64_ssse3;
        if( cpu&X264_CPU_SLOW_ATOM )
        {
            pf->mc_luma = mc_luma_cache64_ssse3_atom;
            pf->get_ref = get_ref_cache64_ssse3_atom;
        }
    }

    pf->weight_cache = weight_cache_ssse3;
    pf->weight = mc_weight_wtab_ssse3;

    if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
        pf->integral_init4v = x264_integral_init4v_ssse3;

    if( !(cpu&X264_CPU_SSE4) )
        return;

    pf->integral_init4h = x264_integral_init4h_sse4;
    pf->integral_init8h = x264_integral_init8h_sse4;

    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
    pf->integral_init8h = x264_integral_init8h_avx;
    pf->hpel_filter = x264_hpel_filter_avx;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;

    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;

    if( cpu&X264_CPU_AVX2 )
    {
        pf->hpel_filter = x264_hpel_filter_avx2;
        pf->mc_chroma = x264_mc_chroma_avx2;
        pf->weight = mc_weight_wtab_avx2;
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx2;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx2;
        pf->integral_init8v = x264_integral_init8v_avx2;
        pf->integral_init4v = x264_integral_init4v_avx2;
        pf->integral_init8h = x264_integral_init8h_avx2;
        pf->integral_init4h = x264_integral_init4h_avx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
    }

    if( cpu&X264_CPU_AVX512 )
    {
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_avx512;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_avx512;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_avx512;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_avx512;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx512;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512;
    }
#endif // HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_AVX) )
        return;

    pf->memcpy_aligned = x264_memcpy_aligned_avx;
    pf->memzero_aligned = x264_memzero_aligned_avx;
    pf->plane_copy = plane_copy_avx;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
    pf->mbtree_propagate_list = mbtree_propagate_list_avx;

    if( cpu&X264_CPU_FMA4 )
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;

    if( !(cpu&X264_CPU_AVX2) )
        return;

    pf->plane_copy_swap = plane_copy_swap_avx2;
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
    pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2;
    pf->get_ref = get_ref_avx2;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
    pf->mbtree_propagate_list = mbtree_propagate_list_avx2;
    pf->mbtree_fix8_pack   = x264_mbtree_fix8_pack_avx2;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;

    if( !(cpu&X264_CPU_AVX512) )
        return;

    pf->memcpy_aligned = x264_memcpy_aligned_avx512;
    pf->memzero_aligned = x264_memzero_aligned_avx512;
    pf->plane_copy = x264_plane_copy_avx512;
    pf->plane_copy_swap = x264_plane_copy_swap_avx512;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
#if ARCH_X86_64
    pf->mbtree_propagate_list = mbtree_propagate_list_avx512;
#endif
    pf->mbtree_fix8_pack   = x264_mbtree_fix8_pack_avx512;
    pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx512;
}