/*****************************************************************************
 * util.h: x86 inline asm
 *****************************************************************************
 * Copyright (C) 2008-2018 x264 project
 *
 * Authors: Fiona Glaser <fiona@x264.com>
 *          Loren Merritt <lorenm@u.washington.edu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#ifndef X264_X86_UTIL_H
#define X264_X86_UTIL_H

#ifdef __SSE__
#include <xmmintrin.h>

#undef M128_ZERO
#define M128_ZERO ((__m128){0,0,0,0})
#define x264_union128_t x264_union128_sse_t
typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
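/* Usage sketch (illustrative, not upstream code): the MAY_ALIAS union lets one
 * store of an SSE register be reread at any scalar width without violating
 * strict aliasing, e.g.
 *     x264_union128_t u;
 *     u.i = M128_ZERO;
 *     uint32_t high_dword = u.b[3]; // reread the same 16 bytes as dwords
 */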
#if HAVE_VECTOREXT
typedef uint32_t v4si __attribute__((vector_size (16)));
#endif
#endif // __SSE__

#if HAVE_X86_INLINE_ASM && HAVE_MMX
#define x264_median_mv x264_median_mv_mmx2
static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
    asm(
        "movd   %1,    %%mm0 \n"
        "movd   %2,    %%mm1 \n"
        "movq   %%mm0, %%mm3 \n"
        "movd   %3,    %%mm2 \n"
        "pmaxsw %%mm1, %%mm0 \n"
        "pminsw %%mm3, %%mm1 \n"
        "pminsw %%mm2, %%mm0 \n"
        "pmaxsw %%mm1, %%mm0 \n"
        "movd   %%mm0, %0    \n"
        :"=m"(*(x264_union32_t*)dst)
        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
    );
}
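
/* Plain-C reference for the asm above (an illustrative sketch, not part of
 * upstream x264): the pmaxsw/pminsw sequence computes the component-wise
 * median of the three candidate motion vectors. */
static ALWAYS_INLINE void median_mv_ref( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
    for( int i = 0; i < 2; i++ )
    {
        int max_ab = a[i] > b[i] ? a[i] : b[i];     // max(a,b)
        int min_ab = a[i] < b[i] ? a[i] : b[i];     // min(a,b)
        int lo     = max_ab < c[i] ? max_ab : c[i]; // min(max(a,b),c)
        dst[i]     = min_ab > lo ? min_ab : lo;     // median(a,b,c)
    }
}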

#define x264_predictor_difference x264_predictor_difference_mmx2
static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
{
    int sum;
    static const uint64_t pw_1 = 0x0001000100010001ULL;
    asm(
        "pxor    %%mm4, %%mm4 \n"
        "test    $1, %1       \n"
        "jnz 3f               \n"
        "movd  -8(%2,%1,4), %%mm0 \n"
        "movd  -4(%2,%1,4), %%mm3 \n"
        "psubw   %%mm3, %%mm0 \n"
        "jmp 2f               \n"
        "3:                   \n"
        "dec     %1           \n"
        "1:                   \n"
        "movq  -8(%2,%1,4), %%mm0 \n"
        "psubw -4(%2,%1,4), %%mm0 \n"
        "2:                   \n"
        "sub     $2,    %1    \n"
        "pxor    %%mm2, %%mm2 \n"
        "psubw   %%mm0, %%mm2 \n"
        "pmaxsw  %%mm2, %%mm0 \n"
        "paddusw %%mm0, %%mm4 \n"
        "jg 1b                \n"
        "pmaddwd %4, %%mm4    \n"
        "pshufw  $14, %%mm4, %%mm0 \n"
        "paddd   %%mm0, %%mm4 \n"
        "movd    %%mm4, %0    \n"
        :"=r"(sum), "+r"(i_mvc)
        :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
    );
    return sum;
}
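
/* Sketch of what the asm above computes (illustrative only; the asm
 * accumulates with saturating adds, which this version ignores): the summed
 * absolute difference between each motion vector candidate and the next,
 * over both components. */
static ALWAYS_INLINE int predictor_difference_ref( int16_t (*mvc)[2], intptr_t i_mvc )
{
    int sum = 0;
    for( int i = 0; i < i_mvc-1; i++ )
    {
        int dx = mvc[i][0] - mvc[i+1][0];
        int dy = mvc[i][1] - mvc[i+1][1];
        sum += (dx < 0 ? -dx : dx) + (dy < 0 ? -dy : dy);
    }
    return sum;
}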

#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
{
    static const uint64_t pb_2  = 0x0202020202020202ULL;
    static const uint64_t pb_32 = 0x2020202020202020ULL;
    static const uint64_t pb_33 = 0x2121212121212121ULL;
    int amvd;
    asm(
        "movd         %1, %%mm0 \n"
        "movd         %2, %%mm1 \n"
        "paddusb   %%mm1, %%mm0 \n"
        "pminub       %5, %%mm0 \n"
        "pxor      %%mm2, %%mm2 \n"
        "movq      %%mm0, %%mm1 \n"
        "pcmpgtb      %3, %%mm0 \n"
        "pcmpgtb      %4, %%mm1 \n"
        "psubb     %%mm0, %%mm2 \n"
        "psubb     %%mm1, %%mm2 \n"
        "movd      %%mm2, %0    \n"
        :"=r"(amvd)
        :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
         "m"(pb_2),"m"(pb_32),"m"(pb_33)
    );
    return amvd;
}
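
/* Sketch of the asm's result (illustrative, not upstream code): for each mvd
 * component, sum the left and top absolute mvds, cap the sum at 33, and map
 * it to the CABAC context increment 0, 1 or 2 depending on whether it exceeds
 * 2 and 32; the two results are packed into one uint16_t. */
static ALWAYS_INLINE uint16_t cabac_mvd_sum_ref( uint8_t *mvdleft, uint8_t *mvdtop )
{
    int amvd0 = mvdleft[0] + mvdtop[0];
    int amvd1 = mvdleft[1] + mvdtop[1];
    if( amvd0 > 33 ) amvd0 = 33; // mirrors paddusb + pminub pb_33
    if( amvd1 > 33 ) amvd1 = 33;
    amvd0 = (amvd0 > 2) + (amvd0 > 32); // mirrors the two pcmpgtb/psubb steps
    amvd1 = (amvd1 > 2) + (amvd1 > 32);
    return amvd0 + (amvd1<<8);
}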

#define x264_predictor_clip x264_predictor_clip_mmx2
static ALWAYS_INLINE int x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    static const uint32_t pd_32 = 0x20;
    intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
    asm(
        "movq       (%2), %%mm5 \n"
        "movd         %6, %%mm3 \n"
        "psllw        $2, %%mm5 \n" // Convert to subpel
        "pshufw $0xEE, %%mm5, %%mm6 \n"
        "dec         %k3        \n"
        "jz 2f                  \n" // if( i_mvc == 1 ) {do the last iteration}
        "punpckldq %%mm3, %%mm3 \n"
        "punpckldq %%mm5, %%mm5 \n"
        "movd         %7, %%mm4 \n"
        "lea   (%0,%3,4), %3    \n"
        "1:                     \n"
        "movq       (%0), %%mm0 \n"
        "add          $8, %0    \n"
        "movq      %%mm3, %%mm1 \n"
        "pxor      %%mm2, %%mm2 \n"
        "pcmpeqd   %%mm0, %%mm1 \n" // mv == pmv
        "pcmpeqd   %%mm0, %%mm2 \n" // mv == 0
        "por       %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
        "pmovmskb  %%mm2, %k2   \n" // (mv == pmv || mv == 0) * 0xf
        "pmaxsw    %%mm5, %%mm0 \n"
        "pminsw    %%mm6, %%mm0 \n"
        "pand      %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
        "psrlq     %%mm2, %%mm0 \n" // drop mv0 if it's skipped
        "movq      %%mm0, (%5,%4,4) \n"
        "and         $24, %k2   \n"
        "add          $2, %4    \n"
        "add          $8, %k2   \n"
        "shr          $4, %k2   \n" // (4-val)>>1
        "sub          %2, %4    \n" // +1 for each valid motion vector
        "cmp          %3, %0    \n"
        "jl 1b                  \n"
        "jg 3f                  \n" // if( i == i_mvc - 1 ) {do the last iteration}
        /* Do the last iteration */
        "2:                     \n"
        "movd       (%0), %%mm0 \n"
        "pxor      %%mm2, %%mm2 \n"
        "pcmpeqd   %%mm0, %%mm3 \n"
        "pcmpeqd   %%mm0, %%mm2 \n"
        "por       %%mm3, %%mm2 \n"
        "pmovmskb  %%mm2, %k2   \n"
        "pmaxsw    %%mm5, %%mm0 \n"
        "pminsw    %%mm6, %%mm0 \n"
        "movd      %%mm0, (%5,%4,4) \n"
        "inc          %4        \n"
        "and          $1, %k2   \n"
        "sub          %2, %4    \n" // output += !(mv == pmv || mv == 0)
        "3:                     \n"
        :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
        :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
    );
    return i;
}
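
/* Plain-C sketch of the asm above (illustrative; clip3_ref is a local helper
 * for this sketch, not an x264 API): clip each candidate to the motion vector
 * limits converted to subpel (multiplied by 4), dropping candidates equal to
 * pmv or to zero, and return how many survive. */
static ALWAYS_INLINE int16_t clip3_ref( int v, int lo, int hi )
{
    return v < lo ? lo : v > hi ? hi : v;
}
static ALWAYS_INLINE int predictor_clip_ref( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    int cnt = 0;
    for( int i = 0; i < i_mvc; i++ )
    {
        uint32_t mv = M32( mvc[i] ); // both components as one packed dword
        if( !mv || mv == pmv )
            continue; // skip zero and pmv-duplicate candidates
        dst[cnt][0] = clip3_ref( mvc[i][0], mv_limit[0][0]*4, mv_limit[1][0]*4 );
        dst[cnt][1] = clip3_ref( mvc[i][1], mv_limit[0][1]*4, mv_limit[1][1]*4 );
        cnt++;
    }
    return cnt;
}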

/* Same as the above, except we do (mv + 2) >> 2 on the input. */
#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
static ALWAYS_INLINE int x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    static const uint64_t pw_2  = 0x0002000200020002ULL;
    static const uint32_t pd_32 = 0x20;
    intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
    asm(
        "movq       (%2), %%mm5 \n"
        "movq         %6, %%mm7 \n"
        "movd         %7, %%mm3 \n"
        "pshufw $0xEE, %%mm5, %%mm6 \n"
        "dec         %k3        \n"
        "jz 2f                  \n"
        "punpckldq %%mm3, %%mm3 \n"
        "punpckldq %%mm5, %%mm5 \n"
        "movd         %8, %%mm4 \n"
        "lea   (%0,%3,4), %3    \n"
        "1:                     \n"
        "movq       (%0), %%mm0 \n"
        "add          $8, %0    \n"
        "paddw     %%mm7, %%mm0 \n"
        "psraw        $2, %%mm0 \n"
        "movq      %%mm3, %%mm1 \n"
        "pxor      %%mm2, %%mm2 \n"
        "pcmpeqd   %%mm0, %%mm1 \n"
        "pcmpeqd   %%mm0, %%mm2 \n"
        "por       %%mm1, %%mm2 \n"
        "pmovmskb  %%mm2, %k2   \n"
        "pmaxsw    %%mm5, %%mm0 \n"
        "pminsw    %%mm6, %%mm0 \n"
        "pand      %%mm4, %%mm2 \n"
        "psrlq     %%mm2, %%mm0 \n"
        "movq      %%mm0, (%5,%4,4) \n"
        "and         $24, %k2   \n"
        "add          $2, %4    \n"
        "add          $8, %k2   \n"
        "shr          $4, %k2   \n"
        "sub          %2, %4    \n"
        "cmp          %3, %0    \n"
        "jl 1b                  \n"
        "jg 3f                  \n"
        /* Do the last iteration */
        "2:                     \n"
        "movd       (%0), %%mm0 \n"
        "paddw     %%mm7, %%mm0 \n"
        "psraw        $2, %%mm0 \n"
        "pxor      %%mm2, %%mm2 \n"
        "pcmpeqd   %%mm0, %%mm3 \n"
        "pcmpeqd   %%mm0, %%mm2 \n"
        "por       %%mm3, %%mm2 \n"
        "pmovmskb  %%mm2, %k2   \n"
        "pmaxsw    %%mm5, %%mm0 \n"
        "pminsw    %%mm6, %%mm0 \n"
        "movd      %%mm0, (%5,%4,4) \n"
        "inc          %4        \n"
        "and          $1, %k2   \n"
        "sub          %2, %4    \n"
        "3:                     \n"
        :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
        :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
    );
    return i;
}
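
/* Corresponding sketch for the rounding variant (illustrative; reuses
 * clip3_ref from the sketch above): identical logic, but each candidate is
 * first rounded from subpel to fullpel with (mv + 2) >> 2, and the limits are
 * used as-is rather than scaled to subpel. */
static ALWAYS_INLINE int predictor_roundclip_ref( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
    int cnt = 0;
    for( int i = 0; i < i_mvc; i++ )
    {
        // Arithmetic shift, matching psraw in the asm
        int16_t mv[2] = { (mvc[i][0] + 2) >> 2, (mvc[i][1] + 2) >> 2 };
        if( !M32( mv ) || M32( mv ) == pmv )
            continue; // dedup happens on the rounded values, as in the asm
        dst[cnt][0] = clip3_ref( mv[0], mv_limit[0][0], mv_limit[1][0] );
        dst[cnt][1] = clip3_ref( mv[1], mv_limit[0][1], mv_limit[1][1] );
        cnt++;
    }
    return cnt;
}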
#endif // HAVE_X86_INLINE_ASM && HAVE_MMX

#endif // X264_X86_UTIL_H