/* Hierarchical (iterative) OpenCL lowres motion search */
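
/* Map a macroblock coordinate in the current MB grid to the index of the
 * corresponding MB in the half-size (downscaled) grid of width mb_width >> 1 */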
inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
{
    /* edge macroblocks might not have a direct descendant, use nearest */
    x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
    y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
    return (mb_width>>1) * y + x;
}
/* Four threads calculate an 8x8 SAD. Each does two rows */
int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
{
    frefpos.y += idx << 1;
    fencpos.y += idx << 1;
    int cost = 0;
    if( frefpos.x < 0 )
    {
        /* slow path when MV goes past left edge. The GPU clamps reads from
         * (-1, 0) to (0, 0), so you get pixels [0, 1, 2, 3] when what you really
         * want are [0, 0, 1, 2] */
        for( int y = 0; y < 2; y++ )
        {
            for( int x = 0; x < 8; x++ )
            {
                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
                cost += abs_diff( enc, ref );
            }
        }
    }
    else
    {
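        /* fast path: read the 8x2 strip as four 4-pixel loads and accumulate
         * absolute differences per vector lane */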
        uint4 enc, ref, sums = 0;
        enc = read_imageui( fenc, sampler, fencpos );
        ref = read_imageui( fref, sampler, frefpos );
        sums += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
        sums += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
        sums += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
        sums += abs_diff( enc, ref );
        cost = sums.s0 + sums.s1 + sums.s2 + sums.s3;
    }
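    /* each of the four cooperating threads publishes its partial SAD to local
     * memory and sums all four; the threads are assumed to run in lockstep,
     * so no barrier is used here */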
    costs[idx] = cost;
    return costs[0] + costs[1] + costs[2] + costs[3];
}
/* One thread performs an 8x8 SAD */
int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
{
    if( frefpos.x < 0 )
    {
        /* slow path when MV goes past left edge */
        int cost = 0;
        for( int y = 0; y < 8; y++ )
        {
            for( int x = 0; x < 8; x++ )
            {
                uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
                uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
                cost += abs_diff( enc, ref );
            }
        }
        return cost;
    }
    else
    {
        uint4 enc, ref, cost = 0;
        for( int y = 0; y < 8; y++ )
        {
            for( int x = 0; x < 8; x += 4 )
            {
                enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
                ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
                cost += abs_diff( enc, ref );
            }
        }
        return cost.s0 + cost.s1 + cost.s2 + cost.s3;
    }
}
/*
 * hierarchical motion estimation
 *
 * Each kernel launch performs a single iteration of the search.
 *
 * MBs per work group = (local_size_x / 4) * local_size_y, since four threads
 * cooperate on each MB.
 *
 * global launch dimensions: [mb_width * 4, mb_height]
 */
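/* For example, a local work size of (32, 4) would give (32 / 4) * 4 = 32 MBs
 * per work group (the work size itself is chosen by the host). */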
kernel void hierarchical_motion( read_only image2d_t  fenc,
                                 read_only image2d_t  fref,
                                 const global short2 *in_mvs,
                                 global short2       *out_mvs,
                                 global int16_t      *out_mv_costs,
                                 global short2       *mvp_buffer,
                                 local int16_t       *cost_local,
                                 local short2        *mvc_local,
                                 int                  mb_width,
                                 int                  lambda,
                                 int                  me_range,
                                 int                  scale,
                                 int                  b_shift_index,
                                 int                  b_first_iteration,
                                 int                  b_reverse_references )
{
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );
    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;
    const int mb_size = 8;
    int2 coord = (int2)(mb_x, mb_y) * mb_size;
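
    /* four consecutive threads in x cooperate on one MB; give this MB its own
     * 4-entry slice of the local cost and MV-candidate arrays */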
    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += 4 * mb_in_group;

    int i_mvc = 0;
    mvc_local += 4 * mb_in_group;
    mvc_local[mb_i] = 0;
    int2 mvp = 0;

    if( !b_first_iteration )
    {
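        /* MVs in the in/out buffers are stored in units of the full lowres
         * frame; >> scale converts them to the units of the current hierarchy
         * level, and << scale converts back before the final store */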
#define MVC( DX, DY )\
    {\
        int px = mb_x + DX;\
        int py = mb_y + DY;\
        mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] :\
                                           in_mvs[mb_width * py + px];\
        mvc_local[i_mvc] >>= (short) scale;\
        i_mvc++;\
    }

        /* Find MVP from median of MVCs */
        if( b_reverse_references )
        {
            /* odd iterations: derive MVP from down and right */
            if( mb_x < mb_width - 1 )
                MVC( 1, 0 );
            if( mb_y < mb_height - 1 )
            {
                MVC( 0, 1 );
                if( mb_x > b_shift_index )
                    MVC( -1, 1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, 1 );
            }
        }
        else
        {
            /* even iterations: derive MVP from up and left */
            if( mb_x > 0 )
                MVC( -1, 0 );
            if( mb_y > 0 )
            {
                MVC( 0, -1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, -1 );
                if( mb_x > b_shift_index )
                    MVC( -1, -1 );
            }
        }
#undef MVC

        mvp = (i_mvc <= 1) ? convert_int2_sat( mvc_local[0] ) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
    }
    /* The current MVP matches the previous MVP and the scale has not changed,
     * so the search will arrive at the same MV again; just copy the previous
     * result to our output. */
    if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
    {
        out_mvs[mb_xy] = in_mvs[mb_xy];
        return;
    }
    mvp_buffer[mb_xy] = convert_short2_sat( mvp );
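
    /* limit the search so the 8x8 block never strays more than 4 pixels
     * outside the frame */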
    int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
    int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;

    int2 bestmv = clamp( mvp, mv_min, mv_max );
    int2 refcrd = coord + bestmv;

    /* measure cost at bestmv */
    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
                lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
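
    /* diamond search: each of the four cooperating threads measures one
     * diamond point per iteration, and the best point is selected through a
     * min-reduction in local memory */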
    do
    {
        /* measure costs at offsets from bestmv */
        refcrd = coord + bestmv + dia_offs[mb_i];
        int2 trymv = bestmv + dia_offs[mb_i];
        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
                   lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
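
        /* pack the cost with the diamond index in the low two bits so a single
         * min() selects both the best cost and its direction */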
        cost_local[mb_i] = (cost << 2) | mb_i;
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );

        if( (cost >> 2) >= bcost )
            break;

        bestmv += dia_offs[cost & 3];
        bcost = cost >> 2;

        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
            break;
    }
    while( --me_range > 0 );

    int2 trymv = 0, diff = 0;
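
/* evaluate a candidate MV cooperatively, but only when it differs from the
 * MVP by more than one pel in either direction (the area around the MVP has
 * already been searched) */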
#define COST_MV_NO_PAD( L )\
    trymv = clamp( trymv, mv_min, mv_max );\
    diff = convert_int2_sat( abs_diff( mvp, trymv ) );\
    if( diff.x > 1 || diff.y > 1 ) {\
        int2 refcrd = coord + trymv;\
        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
                   L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }

    COST_MV_NO_PAD( 0 );

    if( !b_first_iteration )
    {
        /* try cost at the previous iteration's MV, if the MVP was too far away */
        int2 prevmv = b_shift_index ? convert_int2_sat( in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )] ) : convert_int2_sat( in_mvs[mb_xy] );
        prevmv >>= scale;
        trymv = prevmv;
        COST_MV_NO_PAD( lambda );
    }

    for( int i = 0; i < i_mvc; i++ )
    {
        /* try cost at each candidate MV, if the MVP was too far away */
        trymv = convert_int2_sat( mvc_local[i] );
        COST_MV_NO_PAD( lambda );
    }
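
    /* one of the four cooperating threads writes the result: the best MV,
     * scaled back to full lowres units, and its clamped cost */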
    if( mb_i == 0 )
    {
        bestmv <<= scale;
        out_mvs[mb_xy]      = convert_short2_sat( bestmv );
        out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
    }
}