downscale.cl 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. /*
  2. * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
  3. *
  4. * --
  5. *
  6. * fenc_img is an output image (area of memory referenced through a texture
  7. * cache). A read of any pixel location (x,y) returns four pixel values:
  8. *
  9. * val.s0 = P(x,y)
  10. * val.s1 = P(x+1,y)
  11. * val.s2 = P(x+2,y)
  12. * val.s3 = P(x+3,y)
  13. *
  14. * This is a 4x replication of the lowres pixels, a trade-off between memory
  15. * size and read latency.
  16. *
  17. * --
  18. *
  19. * hpel_planes is an output image that contains the four HPEL planes used for
  20. * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
  21. * the four planar values C | V | H | F
  22. *
  23. * launch dimensions: [lowres-width, lowres-height]
  24. */
  25. kernel void downscale_hpel( const global pixel *fenc,
  26. write_only image2d_t fenc_img,
  27. write_only image2d_t hpel_planes,
  28. int stride )
  29. {
  30. int x = get_global_id( 0 );
  31. int y = get_global_id( 1 );
  32. uint4 values;
  33. fenc += y * stride * 2;
  34. const global pixel *src1 = fenc + stride;
  35. const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
  36. int2 pos = (int2)(x, y);
  37. pixel right, left;
  38. right = rhadd( fenc[x*2], src1[x*2] );
  39. left = rhadd( fenc[x*2+1], src1[x*2+1] );
  40. values.s0 = rhadd( right, left ); // F
  41. right = rhadd( fenc[2*x+1], src1[2*x+1] );
  42. left = rhadd( fenc[2*x+2], src1[2*x+2] );
  43. values.s1 = rhadd( right, left ); // H
  44. right = rhadd( src1[2*x], src2[2*x] );
  45. left = rhadd( src1[2*x+1], src2[2*x+1] );
  46. values.s2 = rhadd( right, left ); // V
  47. right = rhadd( src1[2*x+1], src2[2*x+1] );
  48. left = rhadd( src1[2*x+2], src2[2*x+2] );
  49. values.s3 = rhadd( right, left ); // C
  50. uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
  51. write_imageui( hpel_planes, pos, val );
  52. x = select( x, x+1, x+1 < get_global_size( 0 ) );
  53. right = rhadd( fenc[x*2], src1[x*2] );
  54. left = rhadd( fenc[x*2+1], src1[x*2+1] );
  55. values.s1 = rhadd( right, left );
  56. x = select( x, x+1, x+1 < get_global_size( 0 ) );
  57. right = rhadd( fenc[x*2], src1[x*2] );
  58. left = rhadd( fenc[x*2+1], src1[x*2+1] );
  59. values.s2 = rhadd( right, left );
  60. x = select( x, x+1, x+1 < get_global_size( 0 ) );
  61. right = rhadd( fenc[x*2], src1[x*2] );
  62. left = rhadd( fenc[x*2+1], src1[x*2+1] );
  63. values.s3 = rhadd( right, left );
  64. write_imageui( fenc_img, pos, values );
  65. }
  66. /*
  67. * downscale lowres hierarchical motion search image, copy from one image to
  68. * another decimated image. This kernel is called iteratively to generate all
  69. * of the downscales.
  70. *
  71. * launch dimensions: [lower_res width, lower_res height]
  72. */
  73. kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
  74. {
  75. int x = get_global_id( 0 );
  76. int y = get_global_id( 1 );
  77. int2 pos = (int2)(x, y);
  78. int gs = get_global_size( 0 );
  79. uint4 top, bot, values;
  80. top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
  81. bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
  82. values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  83. /* these select statements appear redundant, and they should be, but tests break when
  84. * they are not here. I believe this was caused by a driver bug
  85. */
  86. values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  87. top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
  88. bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
  89. values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
  90. values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
  91. write_imageui( lower_res, pos, (uint4)(values) );
  92. }
  93. /*
  94. * Second copy of downscale kernel, no differences. This is a (no perf loss)
  95. * workaround for a scheduling bug in current Tahiti drivers. This bug has
  96. * theoretically been fixed in the July 2012 driver release from AMD.
  97. */
  98. kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
  99. {
  100. int x = get_global_id( 0 );
  101. int y = get_global_id( 1 );
  102. int2 pos = (int2)(x, y);
  103. int gs = get_global_size( 0 );
  104. uint4 top, bot, values;
  105. top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
  106. bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
  107. values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
  108. // see comment in above function copy
  109. values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
  110. top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
  111. bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
  112. values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
  113. values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
  114. write_imageui( lower_res, pos, (uint4)(values) );
  115. }
  116. /* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
  117. kernel void memset_int16( global int16_t *buf, int16_t value )
  118. {
  119. buf[get_global_id( 0 )] = value;
  120. }