x264-cl.h 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  /* Turn on the extended 32-bit atomic operations on local memory. */
  #pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

  /* Image sampler shared by this header's users: nearest-texel filtering,
   * unnormalized coordinates, out-of-range coordinates clamped to the edge. */
  constant sampler_t sampler = CLK_FILTER_NEAREST
                             | CLK_NORMALIZED_COORDS_FALSE
                             | CLK_ADDRESS_CLAMP_TO_EDGE;
  /* 7.18.1.1 Exact-width integer types */
  typedef signed char    int8_t;
  typedef unsigned char  uint8_t;
  typedef short          int16_t;
  typedef unsigned short uint16_t;
  typedef int            int32_t;
  typedef unsigned int   uint32_t;

  /* pixel:  one 8-bit sample.
   * sum_t:  16-bit unsigned accumulator.
   * sum2_t: 32-bit unsigned word, i.e. twice the width of sum_t. */
  typedef uint8_t  pixel;
  typedef uint16_t sum_t;
  typedef uint32_t sum2_t;
  /* The low LOWRES_COST_SHIFT bits of a packed lowres cost word are selected
   * by LOWRES_COST_MASK. */
  #define LOWRES_COST_SHIFT 14
  #define LOWRES_COST_MASK  ((1 << LOWRES_COST_SHIFT) - 1)

  /* Upper bound used for cost values. */
  #define COST_MAX          (1 << 28)

  /* Largest value a pixel may take. */
  #define PIXEL_MAX         255

  /* Width in bits of one sum_t. */
  #define BITS_PER_SUM      (8 * sizeof(sum_t))
  /* Constants for offsets into frame statistics buffer */
  #define COST_EST    0
  #define COST_EST_AQ 1
  #define INTRA_MBS   2
  /*
   * COPY2_IF_LT( x, y, a, b ): when y < x, assign x = y and a = b; otherwise
   * do nothing.
   *
   * The assignments sit in the else-branch of a complete if/else so that an
   * `else` written after an unbraced use binds to the caller's own `if`
   * instead of being captured by the macro (dangling-else hazard).  The macro
   * is deliberately not wrapped in do { } while( 0 ): that would make the
   * trailing semicolon mandatory at every call site.
   *
   * x and y are each expanded twice, so do not pass expressions with side
   * effects.
   */
  #define COPY2_IF_LT( x, y, a, b )\
  if( !((y) < (x)) )\
  {\
  }\
  else\
  {\
      (x) = (y);\
      (a) = (b);\
  }
  /* The four (x, y) unit offsets one step away from the origin along an axis.
   * Presumably the points of a small diamond search pattern -- confirm
   * against the kernels that index this table. */
  constant int2 dia_offs[4] =
  {
      { 0, -1},
      {-1,  0},
      { 1,  0},
      { 0,  1}
  };
  /* Saturate x to the range [0, PIXEL_MAX] and return it as a pixel. */
  inline pixel x264_clip_pixel( int x )
  {
      const int floor_applied = max( x, (int) 0 );
      const int bounded = min( floor_applied, (int) PIXEL_MAX );
      return (pixel) bounded;
  }
  /* Component-wise median of three motion vectors, widened to int2.
   * Per component: the larger of min(a, b) and min(max(a, b), c). */
  inline int2 x264_median_mv( short2 a, short2 b, short2 c )
  {
      const short2 lo_ab = min( a, b );
      const short2 hi_ab = max( a, b );
      const short2 capped_c = min( hi_ab, c );
      const short2 median = max( lo_ab, capped_c );
      return convert_int2( median );
  }
  /*
   * Absolute value of two values packed into one sum2_t, one per
   * BITS_PER_SUM-wide half.
   *
   * The top bit of each half is moved to that half's lowest bit position and
   * multiplied by (sum_t)-1, giving a mask `s` that is all ones in every half
   * whose top bit was set and zero elsewhere.  The result is (a + s) ^ s.
   */
  inline sum2_t abs2( sum2_t a )
  {
      /* Bit 0 of each half. */
      const sum2_t half_lsbs = ((sum2_t)1 << BITS_PER_SUM) + 1;
      /* 1 in the lowest bit of each half whose top bit is set. */
      const sum2_t top_bits = (a >> (BITS_PER_SUM - 1)) & half_lsbs;
      /* Widen each flag to fill its whole half. */
      const sum2_t s = top_bits * ((sum_t)-1);
      return (a + s) ^ s;
  }
  /*
   * 4-point butterfly on sum2_t values:
   *   d0 = s0 + s1 + s2 + s3
   *   d1 = s0 - s1 + s2 - s3
   *   d2 = s0 + s1 - s2 - s3
   *   d3 = s0 - s1 - s2 + s3
   *
   * d0..d3 must be assignable; s0..s3 may be arbitrary expressions (each is
   * parenthesized and expanded exactly once).  All sources are read before
   * any destination is written, so destinations may alias sources.
   * The temporaries t0..t3 live in the macro's own block; do not pass
   * arguments that are themselves named t0..t3.
   */
  #define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
      sum2_t t0 = (s0) + (s1);\
      sum2_t t1 = (s0) - (s1);\
      sum2_t t2 = (s2) + (s3);\
      sum2_t t3 = (s2) - (s3);\
      (d0) = t0 + t2;\
      (d2) = t0 - t2;\
      (d1) = t1 + t3;\
      (d3) = t1 - t3;\
  }
  /*
   * int2 variant of the 4-point butterfly, applied component-wise:
   *   d0 = s0 + s1 + s2 + s3
   *   d1 = s0 - s1 + s2 - s3
   *   d2 = s0 + s1 - s2 - s3
   *   d3 = s0 - s1 - s2 + s3
   *
   * d0..d3 must be assignable; s0..s3 may be arbitrary expressions (each is
   * parenthesized and expanded exactly once).  All sources are read before
   * any destination is written, so destinations may alias sources.
   * The temporaries t0..t3 live in the macro's own block; do not pass
   * arguments that are themselves named t0..t3.
   */
  #define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
      int2 t0 = (s0) + (s1);\
      int2 t1 = (s0) - (s1);\
      int2 t2 = (s2) + (s3);\
      int2 t3 = (s2) - (s3);\
      (d0) = t0 + t2;\
      (d2) = t0 - t2;\
      (d1) = t1 + t3;\
      (d3) = t1 - t3;\
  }
  /*
   * SATD_C_8x4_Q( name, q1, q2 ) defines
   *     int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )
   * where q1 / q2 are qualifiers placed in front of the two pixel pointers,
   * and i_pix1 / i_pix2 are the element strides between consecutive rows.
   *
   * The function reads 4 rows of 8 pixels from each pointer.  Per row, the
   * difference for column k (k = 0..3) goes in the low half of a sum2_t and
   * the difference for column k + 4 in the upper BITS_PER_SUM bits, so one
   * HADAMARD4 processes two columns at once.  A HADAMARD4 is run along each
   * row, then down each column of the intermediate 4x4 array, and abs2() of
   * every output is accumulated.  The return value is the sum of the two
   * halves of the accumulator, shifted right by one.
   */
  #define SATD_C_8x4_Q( name, q1, q2 )\
  int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
  {\
      sum2_t hpass[4][4];\
      sum2_t a0, a1, a2, a3;\
      sum2_t sum = 0;\
      for( int r = 0; r < 4; r++ )\
      {\
          a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
          a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
          a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
          a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
          HADAMARD4( hpass[r][0], hpass[r][1], hpass[r][2], hpass[r][3], a0, a1, a2, a3 );\
          pix1 += i_pix1;\
          pix2 += i_pix2;\
      }\
      for( int c = 0; c < 4; c++ )\
      {\
          HADAMARD4( a0, a1, a2, a3, hpass[0][c], hpass[1][c], hpass[2][c], hpass[3][c] );\
          sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
      }\
      const sum2_t low_half = (sum_t)sum;\
      const sum2_t high_half = sum >> BITS_PER_SUM;\
      return (low_half + high_half) >> 1;\
  }
  /*
   * Utility function to perform a parallel sum reduction of an array of integers
   *
   * value: this work-item's contribution.
   * x:     this work-item's slot in `array`; the halving steps below expect
   *        slots 0 .. get_local_size(0)-1 to be filled, one per work-item.
   * array: local scratch buffer with at least get_local_size(0) entries.
   *
   * Returns array[0] after the reduction.  The halving scheme only folds every
   * slot into array[0] when get_local_size(0) is a power of two; for other
   * sizes some slots are left out of the total.
   *
   * Every work-item of the work-group must call this function, since it
   * executes work-group barriers.  A barrier follows every reduction step
   * (including the last one, which guards the read of array[0]) instead of
   * relying on groups of 32 work-items running in lock-step, which OpenCL
   * does not guarantee.
   */
  int parallel_sum( int value, int x, volatile local int *array )
  {
      array[x] = value;
      barrier( CLK_LOCAL_MEM_FENCE );

      int dim = get_local_size( 0 );

      while( dim > 1 )
      {
          dim >>= 1;

          if( x < dim )
              array[x] += array[x + dim];

          /* Publish this step's partial sums before any work-item reads them. */
          barrier( CLK_LOCAL_MEM_FENCE );
      }

      return array[0];
  }
  /*
   * Cost of a motion vector difference, computed per component as
   *     round( 2 * log2( |component| + 1 ) + 0.718 + (component != 0) )
   * and summed over x and y.  mvd carries the components as unsigned values.
   */
  int mv_cost( uint2 mvd )
  {
      const float2 nonzero = (float2)(!!mvd.x, !!mvd.y);
      const float2 magnitude = (float2)(mvd.x, mvd.y) + 1.0f;
      const float2 cost = round( log2(magnitude) * 2.0f + 0.718f + nonzero );
      return (int) (cost.x + cost.y);
  }