
/* Lookahead lowres intra analysis
 *
 * Each intra analysis function has been implemented twice, once for scalar GPUs
 * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects
 * the GPU type and sets the -DVECTORIZE compile flag accordingly.
 *
 * All the intra analysis functions were based on their C versions in pixel.c
 * and produce the exact same results.
 */
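/* A minimal host-side sketch of how the -DVECTORIZE flag mentioned above might
 * be supplied at program build time (names such as "gpu_is_vectorized" are
 * illustrative assumptions, not identifiers from this file):
 *
 *     const char *opts = gpu_is_vectorized ? "-DVECTORIZE=1" : "-DVECTORIZE=0";
 *     clBuildProgram( program, 1, &device, opts, NULL, NULL );
 *
 * Both code paths below then compile from this single source file. */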
/* force all clamp arguments and return value to int, prevent ambiguous types */
#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
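/* The generic clamp() built-in is overloaded for many scalar and vector types,
 * so calls mixing pixel (uchar) values with int literals can be ambiguous to
 * the compiler; clamp_int pins everything to the int overload. Typical use,
 * taken from the scalar path further below:
 *
 *     pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
 */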
#if VECTORIZE
int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
{
    int8 a_v, d_v;
    int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
    int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;

    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr0).s04152637;
    HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );

    data += data_stride;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr1).s04152637;
    HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );

    data += data_stride;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr2).s04152637;
    HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );

    data += data_stride;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr3).s04152637;
    HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );

    uint8 sum_v;
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
    sum_v = abs( a_v );
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
    sum_v += abs( a_v );
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
    sum_v += abs( a_v );
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
    sum_v += abs( a_v );

    uint4 sum2 = sum_v.hi + sum_v.lo;
    uint2 sum3 = sum2.hi + sum2.lo;
    return ( sum3.hi + sum3.lo ) >> 1;
}
#else
SATD_C_8x4_Q( satd_8x4_lp, const local, private )
#endif
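/* Note: both variants above compute an 8x4 SATD (sum of absolute transformed
 * differences): the residual (source minus prediction) is run through 4x4
 * Hadamard transforms along rows and columns, the magnitudes of the resulting
 * coefficients are summed, and the total is halved, mirroring the C SATD in
 * pixel.c that the header comment refers to. A simplified scalar sketch of one
 * 4x4 quarter (illustrative only; the helper names are hypothetical):
 *
 *     int sum = 0;
 *     int d[4][4];              // d[i][j] = src[i][j] - pred[i][j]
 *     hadamard4_rows( d );      // 4-point Hadamard along each row
 *     hadamard4_cols( d );      // 4-point Hadamard along each column
 *     for( int i = 0; i < 4; i++ )
 *         for( int j = 0; j < 4; j++ )
 *             sum += abs( d[i][j] );
 *     // the 8x4 SATD is the sum over both 4x4 halves, >> 1
 */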
/****************************************************************************
 * 8x8 prediction for intra luma block
 ****************************************************************************/
#define F1 rhadd
#define F2( a, b, c ) ( a+2*b+c+2 )>>2
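/* F1( a, b ) is OpenCL's rhadd, the rounding average ( a + b + 1 ) >> 1, and
 * F2( a, b, c ) is the 3-tap [1 2 1] lowpass with rounding used by H.264 intra
 * prediction. For example, F1( 10, 13 ) = 12 and
 * F2( 10, 12, 20 ) = ( 10 + 24 + 20 + 2 ) >> 2 = 14. */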
#if VECTORIZE
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
    pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
    pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
    pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr3.s0 = F2( left[1], left[2], left[3] );
    pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
    pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
    pr0.s7 = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr3.s0 = F2( left[5], left[6], left[7] );
    pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
    pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
    pr0.s7 = F2( top[1], top[2], top[3] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr2.s0 = F2( left[1], left[0], left_top );
    pr3.s0 = F2( left[2], left[1], left[0] );
    pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
    pr0.s0 = pr2.s1 = F1( left_top, top[0] );
    pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
    pr0.s1 = pr2.s2 = F1( top[0], top[1] );
    pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
    pr0.s2 = pr2.s3 = F1( top[1], top[2] );
    pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
    pr0.s3 = pr2.s4 = F1( top[2], top[3] );
    pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
    pr0.s4 = pr2.s5 = F1( top[3], top[4] );
    pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
    pr0.s5 = pr2.s6 = F1( top[4], top[5] );
    pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
    pr0.s6 = pr2.s7 = F1( top[5], top[6] );
    pr1.s7 = F2( top[5], top[6], top[7] );
    pr0.s7 = F1( top[6], top[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr2.s0 = F2( left[5], left[4], left[3] );
    pr3.s0 = F2( left[6], left[5], left[4] );
    pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
    pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
    pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
    pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
    pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
    pr0.s2 = pr2.s3 = F1( left_top, top[0] );
    pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
    pr0.s3 = pr2.s4 = F1( top[0], top[1] );
    pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
    pr0.s4 = pr2.s5 = F1( top[1], top[2] );
    pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
    pr0.s5 = pr2.s6 = F1( top[2], top[3] );
    pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
    pr0.s6 = pr2.s7 = F1( top[3], top[4] );
    pr1.s7 = F2( top[3], top[4], top[5] );
    pr0.s7 = F1( top[4], top[5] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
    pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
    pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
    pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
    pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
    pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
    pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
    pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( top[0], top[1] );
    pr1.s0 = F2( top[0], top[1], top[2] );
    pr2.s0 = pr0.s1 = F1( top[1], top[2] );
    pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
    pr2.s1 = pr0.s2 = F1( top[2], top[3] );
    pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
    pr2.s2 = pr0.s3 = F1( top[3], top[4] );
    pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
    pr2.s3 = pr0.s4 = F1( top[4], top[5] );
    pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
    pr2.s4 = pr0.s5 = F1( top[5], top[6] );
    pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
    pr2.s5 = pr0.s6 = F1( top[6], top[7] );
    pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
    pr2.s6 = pr0.s7 = F1( top[7], top[8] );
    pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
    pr2.s7 = F1( top[8], top[9] );
    pr3.s7 = F2( top[8], top[9], top[10] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0.s0 = F1( top[2], top[3] );
    pr1.s0 = F2( top[2], top[3], top[4] );
    pr2.s0 = pr0.s1 = F1( top[3], top[4] );
    pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
    pr2.s1 = pr0.s2 = F1( top[4], top[5] );
    pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
    pr2.s2 = pr0.s3 = F1( top[5], top[6] );
    pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
    pr2.s3 = pr0.s4 = F1( top[6], top[7] );
    pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
    pr2.s4 = pr0.s5 = F1( top[7], top[8] );
    pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
    pr2.s5 = pr0.s6 = F1( top[8], top[9] );
    pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
    pr2.s6 = pr0.s7 = F1( top[9], top[10] );
    pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
    pr2.s7 = F1( top[10], top[11] );
    pr3.s7 = F2( top[10], top[11], top[12] );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
    pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
    pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
    pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
    pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
    pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr0.s6 = left[7]; pr0.s7 = left[7];
    pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
    pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr1.s4 = left[7]; pr1.s5 = left[7];
    pr1.s6 = left[7]; pr1.s7 = left[7];
    pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
    pr2.s2 = left[7]; pr2.s3 = left[7];
    pr2.s4 = left[7]; pr2.s5 = left[7];
    pr2.s6 = left[7]; pr2.s7 = left[7];
    pr3 = (int8)left[7];
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8c_h( const local pixel *src, int src_stride )
{
    const local pixel *src_l = src;
    int8 pr0, pr1, pr2, pr3;

    // Upper half of pred[]
    pr0 = (int8)src[-1]; src += src_stride;
    pr1 = (int8)src[-1]; src += src_stride;
    pr2 = (int8)src[-1]; src += src_stride;
    pr3 = (int8)src[-1]; src += src_stride;
    int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pr0 = (int8)src[-1]; src += src_stride;
    pr1 = (int8)src[-1]; src += src_stride;
    pr2 = (int8)src[-1]; src += src_stride;
    pr3 = (int8)src[-1];
    return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}

int x264_predict_8x8c_v( const local pixel *src, int src_stride )
{
    int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
    return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
           satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
}
int x264_predict_8x8c_p( const local pixel *src, int src_stride )
{
    int H = 0, V = 0;
    for( int i = 0; i < 4; i++ )
    {
        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
    }
    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;

    // Upper half of pred[]
    int pix = i00;
    int8 pr0, pr1, pr2, pr3;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );

    // Lower half of pred[]
    pix = i00;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
{
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - src_stride];
        s1 += src[i + 4 - src_stride];
        s2 += src[-1 + i * src_stride];
        s3 += src[-1 + (i+4)*src_stride];
    }

    // Upper half of pred[]
    int8 dc0;
    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
    dc0.hi = (int4)( (s1 + 2) >> 2 );
    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );

    // Lower half of pred[]
    dc0.lo = (int4)( (s3 + 2) >> 2 );
    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
}
#else /* not vectorized: private is cheap, registers are scarce */
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];

    // Upper half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    for( int y = 4; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 7, 0 ) = F1( top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 7, 0 ) = F1( top[4], top[5] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
/* pack two 8- or 16-bit values into a wider word so that four filtered
 * pixels can be written back with a single vstore4() */
inline uint32_t pack16to32( uint32_t a, uint32_t b )
{
    return a + (b << 16);
}
inline uint32_t pack8to16( uint32_t a, uint32_t b )
{
    return a + (b << 8);
}
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
    int satd;
    int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
    int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
    int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
    int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
    int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );

    // Upper half of pred[]
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    int satd;
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 0 ) = F1( top[0], top[1] );
    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 7, 2 ) = F1( top[8], top[9] );
    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    PRED( 0, 0 ) = F1( top[2], top[3] );
    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
    PRED( 7, 2 ) = F1( top[10], top[11] );
    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    private pixel pred[32];
    int satd;
    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
    int p8 = pack8to16( left[7], left[7] );

    // Upper half of pred[]
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8c_h( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    const local pixel *src_l = src;

    // Upper half of pred[]
    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
    int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );

    // Lower half of pred[]
    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 3, pred );
    return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
}

int x264_predict_8x8c_v( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    uchar16 v16;
    v16.lo = vload8( 0, &src[-src_stride] );
    v16.hi = vload8( 0, &src[-src_stride] );
    vstore16( v16, 0, pred );
    vstore16( v16, 1, pred );
    return satd_8x4_lp( src, src_stride, pred, 8 ) +
           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
}
int x264_predict_8x8c_p( const local pixel *src, int src_stride )
{
    int H = 0, V = 0;
    private pixel pred[32];
    int satd;
    for( int i = 0; i < 4; i++ )
    {
        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
    }
    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;

    // Upper half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 8; x++ )
        {
            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
            pix += b;
        }
        i00 += c;
    }
    satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 8; x++ )
        {
            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
            pix += b;
        }
        i00 += c;
    }
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - src_stride];
        s1 += src[i + 4 - src_stride];
        s2 += src[-1 + i * src_stride];
        s3 += src[-1 + (i+4)*src_stride];
    }

    // Upper half of pred[]
    uchar8 dc0;
    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );

    // Lower half of pred[]
    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
}
#endif
/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
 *
 * Loads 33 macroblocks plus the pixels directly above them into local memory,
 * padding where necessary with edge pixels. It then cooperatively calculates
 * smoothed top and left pixels for use in some of the analysis.
 *
 * Then groups of 32 threads each calculate a single intra mode for each 8x8
 * block. Since consecutive threads are calculating the same intra mode there
 * is no code-path divergence. 8 intra costs are calculated simultaneously. If
 * the "slow" argument is not zero, the final two (least likely) intra modes are
 * tested in a second pass. The slow path is only enabled for the slow, slower,
 * and placebo presets.
 *
 * This allows all of the pixel functions to read pixels from local memory,
 * avoids re-fetching edge pixels from global memory, and lets all of the intra
 * mode costs be calculated simultaneously without branch divergence.
 *
 * Local dimension: [ 32, 8 ]
 * Global dimensions: [ paddedWidth, height ] */
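/* A rough host-side launch sketch matching the dimensions documented above
 * (handle names such as "kernel_intra" and "padded_width" are assumptions made
 * for illustration; setup of the remaining kernel arguments is elided):
 *
 *     size_t local_size[2]  = { 32, 8 };
 *     size_t global_size[2] = { padded_width, height };  // divisible by 32 and 8
 *     clSetKernelArg( kernel_intra, 0, sizeof(cl_mem), &fenc_image );
 *     // ... fenc_intra_cost, frame_stats, lambda, mb_width, slow ...
 *     clEnqueueNDRangeKernel( queue, kernel_intra, 2, NULL,
 *                             global_size, local_size, 0, NULL, NULL );
 */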
kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc,
                                    global uint16_t *fenc_intra_cost,
                                    global int *frame_stats,
                                    int lambda,
                                    int mb_width,
                                    int slow )
{
#define CACHE_STRIDE 265
#define BLOCK_OFFSET 266
    local pixel cache[2385];
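    /* Cache geometry: 33 macroblocks x 8 pixels = 264 columns, plus one column
     * of left-edge pixels = CACHE_STRIDE (265). Nine rows (the 8 pixel rows of
     * the macroblocks plus the row of pixels above them) gives 9 * 265 = 2385
     * cached pixels, and BLOCK_OFFSET (266 = CACHE_STRIDE + 1) addresses the
     * first pixel of the first macroblock, one row below and one column right
     * of cache[0]. */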
    local int cost_buf[32];
    local pixel top[32 * 16];
    local pixel left[32 * 8];
    local pixel left_top[32];

    int lx = get_local_id( 0 );
    int ly = get_local_id( 1 );
    int gx = get_global_id( 0 );
    int gy = get_global_id( 1 );
    int gidx = get_group_id( 0 );
    int gidy = get_group_id( 1 );
    int linear_id = ly * get_local_size( 0 ) + lx;
    int satd = COST_MAX;
    int basex = gidx << 8;
    int basey = (gidy << 3) - 1;

    /* Load 33 8x8 macroblocks and the pixels above them into local cache */
    for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ )
    {
        int x = linear_id << 2;
        uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
        cache[y * CACHE_STRIDE + 1 + x] = data.s0;
        cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
        cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
        cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
    }
    /* load pixels on left edge */
    if( linear_id < 9 )
        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;

    barrier( CLK_LOCAL_MEM_FENCE );

    // Cooperatively build the top edge for the macroblock using lowpass filter
    int j = ly;
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    j += 8;
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    // Cooperatively build the left edge for the macroblock using lowpass filter
    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
                        2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
    // One left_top per macroblock
    if( 0 == ly )
    {
        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
        cost_buf[lx] = COST_MAX;
    }
    barrier( CLK_LOCAL_MEM_FENCE );
    // each warp/wavefront generates a different prediction type; no divergence
    switch( ly )
    {
        case 0:
            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 1:
            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 2:
            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 3:
            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 4:
            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 5:
            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 6:
            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 7:
            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
            break;
        default:
            break;
    }
    atom_min( &cost_buf[lx], satd );

    if( slow )
    {
        // Do the remaining two (least likely) prediction modes
        switch( ly )
        {
            case 0: // DDL
                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            case 1: // VL
                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            default:
                break;
        }
    }
    barrier( CLK_LOCAL_MEM_FENCE );

    if( (0 == ly) && (gx < mb_width) )
        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx] + 5*lambda;

    // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
    if( gx < 2 && gy == 0 )
        frame_stats[gx] = 0;
#undef CACHE_STRIDE
#undef BLOCK_OFFSET
}
/*
 * parallel sum intra costs
 *
 * global launch dimensions: [256, mb_height]
 */
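/* Note: parallel_sum() and the COST_EST / COST_EST_AQ offsets into frame_stats[]
 * are defined elsewhere in x264's OpenCL sources. Conceptually, parallel_sum is
 * a work-group tree reduction over the 256 work-items in dimension 0, roughly
 * (a sketch only; the real helper may differ):
 *
 *     int parallel_sum( int value, int x, local int *tmp )
 *     {
 *         tmp[x] = value;
 *         barrier( CLK_LOCAL_MEM_FENCE );
 *         for( int len = 128; len > 0; len >>= 1 )
 *         {
 *             if( x < len )
 *                 tmp[x] += tmp[x + len];
 *             barrier( CLK_LOCAL_MEM_FENCE );
 *         }
 *         int sum = tmp[0];
 *         barrier( CLK_LOCAL_MEM_FENCE );
 *         return sum;
 *     }
 */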
kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
                            const global uint16_t *inv_qscale_factor,
                            global int *fenc_row_satds,
                            global int *frame_stats,
                            int mb_width )
{
    int y = get_global_id( 1 );
    int mb_height = get_global_size( 1 );

    int row_satds = 0;
    int cost_est = 0;
    int cost_est_aq = 0;

    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
    {
        int mb_xy = x + y * mb_width;
        int cost = fenc_intra_cost[mb_xy];
        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;

        row_satds += cost_aq;
        if( b_frame_score_mb )
        {
            cost_est += cost;
            cost_est_aq += cost_aq;
        }
    }

    local int buffer[256];
    int x = get_global_id( 0 );

    row_satds = parallel_sum( row_satds, x, buffer );
    cost_est = parallel_sum( cost_est, x, buffer );
    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );

    if( get_global_id( 0 ) == 0 )
    {
        fenc_row_satds[y] = row_satds;
        atomic_add( frame_stats + COST_EST, cost_est );
        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
    }
}