/* subpel.cl — OpenCL lowres subpel motion-vector refinement */
  1. /* OpenCL lowres subpel Refine */
  2. /* Each thread performs 8x8 SAD. 4 threads per MB, so the 4 DIA HPEL offsets are
  3. * calculated simultaneously */
  4. int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  5. {
  6. int2 frefpos = qpos >> 2;
  7. int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  8. uint mask_shift = 8 * hpel_idx;
  9. uint4 cost4 = 0;
  10. for( int y = 0; y < 8; y++ )
  11. {
  12. uint4 enc, val4;
  13. enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
  14. val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
  15. val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
  16. val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
  17. val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
  18. cost4 += abs_diff( enc, val4 );
  19. enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
  20. val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
  21. val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
  22. val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
  23. val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
  24. cost4 += abs_diff( enc, val4 );
  25. }
  26. return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
  27. }
  28. /* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
  29. int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  30. {
  31. int2 frefApos = qpos >> 2;
  32. int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  33. int2 qposB = qpos + ((qpos & 1) << 1);
  34. int2 frefBpos = qposB >> 2;
  35. int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
  36. uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
  37. int cost = 0;
  38. for( int y = 0; y < 8; y++ )
  39. {
  40. for( int x = 0; x < 8; x++ )
  41. {
  42. uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
  43. uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
  44. uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
  45. cost += abs_diff( enc, rhadd( vA, vB ) );
  46. }
  47. }
  48. return cost;
  49. }
/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
 *
 * Each thread collects 1/4 of the rows of diffs and processes one quarter of
 * the transforms
 */
int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
                            int2 fencpos,
                            read_only image2d_t fref_planes,
                            int2 qpos,
                            local sum2_t *tmpp,
                            int idx )
{
    /* View the shared scratch as a 4x4 matrix so the two HADAMARD4 passes can
     * exchange rows/columns between the four cooperating threads.  volatile:
     * values are read back after being written by sibling threads with no
     * barrier in between.  NOTE(review): appears to rely on lockstep execution
     * of the 4 threads — confirm against the launch configuration. */
    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
    sum2_t b0, b1, b2, b3;
    // fencpos is full-pel position of original MB
    // qpos is qpel position within reference frame

    /* A and B are the two HPEL samples bracketing the QPEL position; the
     * reference pixel is their rounded average (rhadd in READ_DIFF). */
    int2 frefApos = qpos >> 2;
    int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
    int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
    int2 frefBpos = qposB >> 2;
    int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
    /* byte lane of the packed 32-bit fref_planes texel for each plane */
    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
    uint vA, vB;
    uint a0, a1;
    uint enc;
    sum2_t sum = 0;

    /* Diff of one fenc pixel vs the averaged A/B reference pixel at column X
     * of this thread's row (row index = idx). */
#define READ_DIFF( OUT, X )\
    enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
    vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
    vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
    OUT = enc - rhadd( vA, vB );

    /* Pack two diffs (columns a and b) into the low/high halves of one sum2_t
     * so both 8x4 halves are transformed in a single pass. */
#define READ_DIFF_EX( OUT, a, b )\
    {\
        READ_DIFF( a0, a );\
        READ_DIFF( a1, b );\
        OUT = a0 + (a1<<BITS_PER_SUM);\
    }

    /* SATD of one 8x4 half: advance the row pointers (a/b are deltas, so the
     * two invocations below walk rows 0..3 then 4..7), gather packed diffs,
     * then horizontal + vertical Hadamard with the transpose done through the
     * shared tmp matrix. */
#define ROW_8x4_SATD( a, b )\
    {\
        fencpos.y += a;\
        frefApos.y += b;\
        frefBpos.y += b;\
        READ_DIFF_EX( b0, 0, 4 );\
        READ_DIFF_EX( b1, 1, 5 );\
        READ_DIFF_EX( b2, 2, 6 );\
        READ_DIFF_EX( b3, 3, 7 );\
        HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
        HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
        sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
    }
    ROW_8x4_SATD( 0, 0 );
    ROW_8x4_SATD( 4, 4 );
#undef READ_DIFF
#undef READ_DIFF_EX
#undef ROW_8x4_SATD
    /* Fold the two packed halves together and halve (SATD normalization) */
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
/* Diamond of four half-pel offsets (QPEL units) probed around the current
 * best MV; in subpel_refine, thread mb_i evaluates hpoffs[mb_i]. */
constant int2 hpoffs[4] =
{
    {0, -2}, {-2, 0}, {2, 0}, {0, 2}
};
/* sub pixel refinement of motion vectors, output MVs and costs are moved from
 * temporary buffers into final per-frame buffer
 *
 * global launch dimensions: [mb_width * 4, mb_height]
 *
 * With X being the source 16x16 pixels, F is the lowres pixel used by the
 * motion search. We will now utilize the H V and C pixels (stored in separate
 * planes) to search at half-pel increments.
 *
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *  V C V C V
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *
 * The YX HPEL bits of the motion vector selects the plane we search in. The
 * four planes are packed in the fref_planes 2D image buffer. Each sample
 * returns: s0 = F, s1 = H, s2 = V, s3 = C */
kernel void subpel_refine( read_only image2d_t fenc,
                           read_only image2d_t fref_planes,
                           const global short2 *in_mvs,
                           const global int16_t *in_sad_mv_costs,
                           local int16_t *cost_local,
                           local sum2_t *satd_local,
                           local short2 *mvc_local,
                           global short2 *fenc_lowres_mv,
                           global int16_t *fenc_lowres_mv_costs,
                           int mb_width,
                           int lambda,
                           int b,
                           int ref,
                           int b_islist1 )
{
    /* Four consecutive global-x threads cooperate on one macroblock */
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );
    int mb_i = get_global_id( 0 ) & 3;   /* this thread's lane (0..3) within the MB */
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;

    /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
     * hold many frames worth of motion vectors. We must offset into the correct
     * location for this frame's vectors. The kernel will be passed the correct
     * directional buffer for the direction of the search: list1 or list0
     *
     * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
     * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
    fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
    fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;

    /* Adjust pointers into local memory buffers for this thread's data */
    int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += mb_in_group * 4;
    satd_local += mb_in_group * 16;
    mvc_local += mb_in_group * 4;

    /* Gather up to 4 neighbour MVs (left, top, top-right, top-left) for the
     * median MV predictor */
    int i_mvc = 0;
    mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;
#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
    if( mb_x > 0 )
        MVC( -1, 0 );
    if( mb_y > 0 )
    {
        MVC( 0, -1 );
        if( mb_x < mb_width - 1 )
            MVC( 1, -1 );
        if( mb_x > 0 )
            MVC( -1, -1 );
    }
#undef MVC
    int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );

    int bcost = in_sad_mv_costs[mb_xy];
    int2 coord = (int2)(mb_x, mb_y) << 3;
    int2 bmv = convert_int2_sat( in_mvs[mb_xy] );

    /* Make mvp and bmv QPEL MV */
    mvp <<= 2; bmv <<= 2;

    /* Each of the 4 lanes scores one offset from ARR; the per-lane costs are
     * exchanged through cost_local with the lane index packed into the two
     * low bits of the cost, so a plain min() picks both the best cost and the
     * winning offset.  NOTE(review): no barrier between the store and the
     * reads — appears to rely on the 4 lanes running in lockstep; confirm on
     * the target hardware. */
#define HPEL_QPEL( ARR, FUNC )\
    {\
        int2 trymv = bmv + ARR[mb_i];\
        int2 qpos = (coord << 2) + trymv;\
        int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
        cost_local[mb_i] = (cost<<2) + mb_i;\
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
        if( (cost>>2) < bcost )\
        {\
            bmv += ARR[cost&3];\
            bcost = cost>>2;\
        }\
    }
    /* One HPEL diamond step, then one QPEL diamond step */
    HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
    HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
    fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );

    /* remeasure cost of bmv using SATD */
    int2 qpos = (coord << 2) + bmv;
    cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
    bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
    bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );
    fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
}