opencl.c 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719
  1. /*****************************************************************************
  2. * opencl.c: OpenCL initialization and kernel compilation
  3. *****************************************************************************
  4. * Copyright (C) 2012-2018 x264 project
  5. *
  6. * Authors: Steve Borho <sborho@multicorewareinc.com>
  7. * Anton Mitrofanov <BugMaster@narod.ru>
  8. *
  9. * This program is free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU General Public License as published by
  11. * the Free Software Foundation; either version 2 of the License, or
  12. * (at your option) any later version.
  13. *
  14. * This program is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU General Public License
  20. * along with this program; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  22. *
  23. * This program is also available under a commercial proprietary license.
  24. * For more information, contact us at licensing@x264.com.
  25. *****************************************************************************/
  26. #include "common.h"
  27. #ifdef _WIN32
  28. #include <windows.h>
  29. #define ocl_open LoadLibraryW( L"OpenCL" )
  30. #define ocl_close FreeLibrary
  31. #define ocl_address GetProcAddress
  32. #else
  33. #include <dlfcn.h> //dlopen, dlsym, dlclose
  34. #if SYS_MACOSX
  35. #define ocl_open dlopen( "/System/Library/Frameworks/OpenCL.framework/OpenCL", RTLD_NOW )
  36. #else
  37. #define ocl_open dlopen( "libOpenCL.so", RTLD_NOW )
  38. #endif
  39. #define ocl_close dlclose
  40. #define ocl_address dlsym
  41. #endif
  42. #define LOAD_OCL_FUNC(name, continue_on_fail)\
  43. {\
  44. ocl->name = (void*)ocl_address( ocl->library, #name );\
  45. if( !continue_on_fail && !ocl->name )\
  46. goto fail;\
  47. }
  48. /* load the library and functions we require from it */
  49. x264_opencl_function_t *x264_opencl_load_library( void )
  50. {
  51. x264_opencl_function_t *ocl;
  52. #undef fail
  53. #define fail fail0
  54. CHECKED_MALLOCZERO( ocl, sizeof(x264_opencl_function_t) );
  55. #undef fail
  56. #define fail fail1
  57. ocl->library = ocl_open;
  58. if( !ocl->library )
  59. goto fail;
  60. #undef fail
  61. #define fail fail2
  62. LOAD_OCL_FUNC( clBuildProgram, 0 );
  63. LOAD_OCL_FUNC( clCreateBuffer, 0 );
  64. LOAD_OCL_FUNC( clCreateCommandQueue, 0 );
  65. LOAD_OCL_FUNC( clCreateContext, 0 );
  66. LOAD_OCL_FUNC( clCreateImage2D, 0 );
  67. LOAD_OCL_FUNC( clCreateKernel, 0 );
  68. LOAD_OCL_FUNC( clCreateProgramWithBinary, 0 );
  69. LOAD_OCL_FUNC( clCreateProgramWithSource, 0 );
  70. LOAD_OCL_FUNC( clEnqueueCopyBuffer, 0 );
  71. LOAD_OCL_FUNC( clEnqueueMapBuffer, 0 );
  72. LOAD_OCL_FUNC( clEnqueueNDRangeKernel, 0 );
  73. LOAD_OCL_FUNC( clEnqueueReadBuffer, 0 );
  74. LOAD_OCL_FUNC( clEnqueueWriteBuffer, 0 );
  75. LOAD_OCL_FUNC( clFinish, 0 );
  76. LOAD_OCL_FUNC( clGetCommandQueueInfo, 0 );
  77. LOAD_OCL_FUNC( clGetDeviceIDs, 0 );
  78. LOAD_OCL_FUNC( clGetDeviceInfo, 0 );
  79. LOAD_OCL_FUNC( clGetKernelWorkGroupInfo, 0 );
  80. LOAD_OCL_FUNC( clGetPlatformIDs, 0 );
  81. LOAD_OCL_FUNC( clGetProgramBuildInfo, 0 );
  82. LOAD_OCL_FUNC( clGetProgramInfo, 0 );
  83. LOAD_OCL_FUNC( clGetSupportedImageFormats, 0 );
  84. LOAD_OCL_FUNC( clReleaseCommandQueue, 0 );
  85. LOAD_OCL_FUNC( clReleaseContext, 0 );
  86. LOAD_OCL_FUNC( clReleaseKernel, 0 );
  87. LOAD_OCL_FUNC( clReleaseMemObject, 0 );
  88. LOAD_OCL_FUNC( clReleaseProgram, 0 );
  89. LOAD_OCL_FUNC( clSetKernelArg, 0 );
  90. return ocl;
  91. #undef fail
  92. fail2:
  93. ocl_close( ocl->library );
  94. fail1:
  95. x264_free( ocl );
  96. fail0:
  97. return NULL;
  98. }
  99. void x264_opencl_close_library( x264_opencl_function_t *ocl )
  100. {
  101. if( !ocl )
  102. return;
  103. ocl_close( ocl->library );
  104. x264_free( ocl );
  105. }
  106. /* define from recent cl_ext.h, copied here in case headers are old */
  107. #define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
  108. /* Requires full include path in case of out-of-tree builds */
  109. #include "common/oclobj.h"
  110. static int detect_switchable_graphics( void );
  111. /* Try to load the cached compiled program binary, verify the device context is
  112. * still valid before reuse */
  113. static cl_program opencl_cache_load( x264_t *h, const char *dev_name, const char *dev_vendor, const char *driver_version )
  114. {
  115. /* try to load cached program binary */
  116. FILE *fp = x264_fopen( h->param.psz_clbin_file, "rb" );
  117. if( !fp )
  118. return NULL;
  119. x264_opencl_function_t *ocl = h->opencl.ocl;
  120. cl_program program = NULL;
  121. uint8_t *binary = NULL;
  122. fseek( fp, 0, SEEK_END );
  123. size_t size = ftell( fp );
  124. rewind( fp );
  125. CHECKED_MALLOC( binary, size );
  126. if( fread( binary, 1, size, fp ) != size )
  127. goto fail;
  128. const uint8_t *ptr = (const uint8_t*)binary;
  129. #define CHECK_STRING( STR )\
  130. do {\
  131. size_t len = strlen( STR );\
  132. if( size <= len || strncmp( (char*)ptr, STR, len ) )\
  133. goto fail;\
  134. else {\
  135. size -= (len+1); ptr += (len+1);\
  136. }\
  137. } while( 0 )
  138. CHECK_STRING( dev_name );
  139. CHECK_STRING( dev_vendor );
  140. CHECK_STRING( driver_version );
  141. CHECK_STRING( x264_opencl_source_hash );
  142. #undef CHECK_STRING
  143. cl_int status;
  144. program = ocl->clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
  145. if( status != CL_SUCCESS )
  146. program = NULL;
  147. fail:
  148. fclose( fp );
  149. x264_free( binary );
  150. return program;
  151. }
  152. /* Save the compiled program binary to a file for later reuse. Device context
  153. * is also saved in the cache file so we do not reuse stale binaries */
  154. static void opencl_cache_save( x264_t *h, cl_program program, const char *dev_name, const char *dev_vendor, const char *driver_version )
  155. {
  156. FILE *fp = x264_fopen( h->param.psz_clbin_file, "wb" );
  157. if( !fp )
  158. {
  159. x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write\n" );
  160. return;
  161. }
  162. x264_opencl_function_t *ocl = h->opencl.ocl;
  163. uint8_t *binary = NULL;
  164. size_t size = 0;
  165. cl_int status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
  166. if( status != CL_SUCCESS || !size )
  167. {
  168. x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated\n" );
  169. goto fail;
  170. }
  171. CHECKED_MALLOC( binary, size );
  172. status = ocl->clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
  173. if( status != CL_SUCCESS )
  174. {
  175. x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated\n" );
  176. goto fail;
  177. }
  178. fputs( dev_name, fp );
  179. fputc( '\n', fp );
  180. fputs( dev_vendor, fp );
  181. fputc( '\n', fp );
  182. fputs( driver_version, fp );
  183. fputc( '\n', fp );
  184. fputs( x264_opencl_source_hash, fp );
  185. fputc( '\n', fp );
  186. fwrite( binary, 1, size, fp );
  187. fail:
  188. fclose( fp );
  189. x264_free( binary );
  190. return;
  191. }
  192. /* The OpenCL source under common/opencl will be merged into common/oclobj.h by
  193. * the Makefile. It defines a x264_opencl_source byte array which we will pass
  194. * to clCreateProgramWithSource(). We also attempt to use a cache file for the
  195. * compiled binary, stored in the current working folder. */
  196. static cl_program opencl_compile( x264_t *h )
  197. {
  198. x264_opencl_function_t *ocl = h->opencl.ocl;
  199. cl_program program = NULL;
  200. char *build_log = NULL;
  201. char dev_name[64];
  202. char dev_vendor[64];
  203. char driver_version[64];
  204. cl_int status;
  205. status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL );
  206. status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL );
  207. status |= ocl->clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driver_version), driver_version, NULL );
  208. if( status != CL_SUCCESS )
  209. return NULL;
  210. // Most AMD GPUs have vector registers
  211. int vectorize = !strcmp( dev_vendor, "Advanced Micro Devices, Inc." );
  212. h->opencl.b_device_AMD_SI = 0;
  213. if( vectorize )
  214. {
  215. /* Disable OpenCL on Intel/AMD switchable graphics devices */
  216. if( detect_switchable_graphics() )
  217. {
  218. x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" );
  219. return NULL;
  220. }
  221. /* Detect AMD SouthernIsland or newer device (single-width registers) */
  222. cl_uint simdwidth = 4;
  223. status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL );
  224. if( status == CL_SUCCESS && simdwidth == 1 )
  225. {
  226. vectorize = 0;
  227. h->opencl.b_device_AMD_SI = 1;
  228. }
  229. }
  230. x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", dev_vendor, dev_name, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
  231. program = opencl_cache_load( h, dev_name, dev_vendor, driver_version );
  232. if( !program )
  233. {
  234. /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
  235. x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
  236. const char *strptr = (const char*)x264_opencl_source;
  237. size_t size = sizeof(x264_opencl_source);
  238. program = ocl->clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
  239. if( status != CL_SUCCESS || !program )
  240. {
  241. x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
  242. return NULL;
  243. }
  244. }
  245. /* Build the program binary for the OpenCL device */
  246. const char *buildopts = vectorize ? "-DVECTORIZE=1" : "";
  247. status = ocl->clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
  248. if( status == CL_SUCCESS )
  249. {
  250. opencl_cache_save( h, program, dev_name, dev_vendor, driver_version );
  251. return program;
  252. }
  253. /* Compile failure, should not happen with production code. */
  254. size_t build_log_len = 0;
  255. status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_len );
  256. if( status != CL_SUCCESS || !build_log_len )
  257. {
  258. x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
  259. goto fail;
  260. }
  261. build_log = x264_malloc( build_log_len );
  262. if( !build_log )
  263. {
  264. x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
  265. goto fail;
  266. }
  267. status = ocl->clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
  268. if( status != CL_SUCCESS )
  269. {
  270. x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
  271. goto fail;
  272. }
  273. FILE *log_file = x264_fopen( "x264_kernel_build_log.txt", "w" );
  274. if( !log_file )
  275. {
  276. x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to create file x264_kernel_build_log.txt\n" );
  277. goto fail;
  278. }
  279. fwrite( build_log, 1, build_log_len, log_file );
  280. fclose( log_file );
  281. x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
  282. fail:
  283. x264_free( build_log );
  284. if( program )
  285. ocl->clReleaseProgram( program );
  286. return NULL;
  287. }
  288. static int opencl_lookahead_alloc( x264_t *h )
  289. {
  290. if( !h->param.rc.i_lookahead )
  291. return -1;
  292. static const char *kernelnames[] = {
  293. "mb_intra_cost_satd_8x8",
  294. "sum_intra_cost",
  295. "downscale_hpel",
  296. "downscale1",
  297. "downscale2",
  298. "memset_int16",
  299. "weightp_scaled_images",
  300. "weightp_hpel",
  301. "hierarchical_motion",
  302. "subpel_refine",
  303. "mode_selection",
  304. "sum_inter_cost"
  305. };
  306. cl_kernel *kernels[] = {
  307. &h->opencl.intra_kernel,
  308. &h->opencl.rowsum_intra_kernel,
  309. &h->opencl.downscale_hpel_kernel,
  310. &h->opencl.downscale_kernel1,
  311. &h->opencl.downscale_kernel2,
  312. &h->opencl.memset_kernel,
  313. &h->opencl.weightp_scaled_images_kernel,
  314. &h->opencl.weightp_hpel_kernel,
  315. &h->opencl.hme_kernel,
  316. &h->opencl.subpel_refine_kernel,
  317. &h->opencl.mode_select_kernel,
  318. &h->opencl.rowsum_inter_kernel
  319. };
  320. x264_opencl_function_t *ocl = h->opencl.ocl;
  321. cl_int status;
  322. h->opencl.lookahead_program = opencl_compile( h );
  323. if( !h->opencl.lookahead_program )
  324. goto fail;
  325. for( int i = 0; i < ARRAY_ELEMS(kernelnames); i++ )
  326. {
  327. *kernels[i] = ocl->clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
  328. if( status != CL_SUCCESS )
  329. {
  330. x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
  331. goto fail;
  332. }
  333. }
  334. h->opencl.page_locked_buffer = ocl->clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
  335. if( status != CL_SUCCESS )
  336. {
  337. x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
  338. goto fail;
  339. }
  340. h->opencl.page_locked_ptr = ocl->clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
  341. 0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
  342. if( status != CL_SUCCESS )
  343. {
  344. x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
  345. goto fail;
  346. }
  347. return 0;
  348. fail:
  349. x264_opencl_lookahead_delete( h );
  350. return -1;
  351. }
  352. static void CL_CALLBACK opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
  353. {
  354. /* Any error notification can be assumed to be fatal to the OpenCL context.
  355. * We need to stop using it immediately to prevent further damage. */
  356. x264_t *h = (x264_t*)user_data;
  357. h->param.b_opencl = 0;
  358. h->opencl.b_fatal_error = 1;
  359. x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
  360. x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" );
  361. }
  362. int x264_opencl_lookahead_init( x264_t *h )
  363. {
  364. x264_opencl_function_t *ocl = h->opencl.ocl;
  365. cl_platform_id *platforms = NULL;
  366. cl_device_id *devices = NULL;
  367. cl_image_format *imageType = NULL;
  368. cl_context context = NULL;
  369. int ret = -1;
  370. cl_uint numPlatforms = 0;
  371. cl_int status = ocl->clGetPlatformIDs( 0, NULL, &numPlatforms );
  372. if( status != CL_SUCCESS || !numPlatforms )
  373. {
  374. x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" );
  375. goto fail;
  376. }
  377. platforms = (cl_platform_id*)x264_malloc( sizeof(cl_platform_id) * numPlatforms );
  378. if( !platforms )
  379. {
  380. x264_log( h, X264_LOG_WARNING, "OpenCL: malloc of installed platforms buffer failed\n" );
  381. goto fail;
  382. }
  383. status = ocl->clGetPlatformIDs( numPlatforms, platforms, NULL );
  384. if( status != CL_SUCCESS )
  385. {
  386. x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n" );
  387. goto fail;
  388. }
  389. /* Select the first OpenCL platform with a GPU device that supports our
  390. * required image (texture) formats */
  391. for( cl_uint i = 0; i < numPlatforms; i++ )
  392. {
  393. cl_uint gpu_count = 0;
  394. status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
  395. if( status != CL_SUCCESS || !gpu_count )
  396. continue;
  397. x264_free( devices );
  398. devices = x264_malloc( sizeof(cl_device_id) * gpu_count );
  399. if( !devices )
  400. continue;
  401. status = ocl->clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL );
  402. if( status != CL_SUCCESS )
  403. continue;
  404. /* Find a GPU device that supports our image formats */
  405. for( cl_uint gpu = 0; gpu < gpu_count; gpu++ )
  406. {
  407. h->opencl.device = devices[gpu];
  408. /* if the user has specified an exact device ID, skip all other
  409. * GPUs. If this device matches, allow it to continue through the
  410. * checks for supported images, etc. */
  411. if( h->param.opencl_device_id && devices[gpu] != (cl_device_id)h->param.opencl_device_id )
  412. continue;
  413. cl_bool image_support = 0;
  414. status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
  415. if( status != CL_SUCCESS || !image_support )
  416. continue;
  417. if( context )
  418. ocl->clReleaseContext( context );
  419. context = ocl->clCreateContext( NULL, 1, &h->opencl.device, (void*)opencl_error_notify, (void*)h, &status );
  420. if( status != CL_SUCCESS || !context )
  421. continue;
  422. cl_uint imagecount = 0;
  423. status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount );
  424. if( status != CL_SUCCESS || !imagecount )
  425. continue;
  426. x264_free( imageType );
  427. imageType = x264_malloc( sizeof(cl_image_format) * imagecount );
  428. if( !imageType )
  429. continue;
  430. status = ocl->clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL );
  431. if( status != CL_SUCCESS )
  432. continue;
  433. int b_has_r = 0;
  434. int b_has_rgba = 0;
  435. for( cl_uint j = 0; j < imagecount; j++ )
  436. {
  437. if( imageType[j].image_channel_order == CL_R &&
  438. imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 )
  439. b_has_r = 1;
  440. else if( imageType[j].image_channel_order == CL_RGBA &&
  441. imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 )
  442. b_has_rgba = 1;
  443. }
  444. if( !b_has_r || !b_has_rgba )
  445. {
  446. char dev_name[64];
  447. status = ocl->clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL );
  448. if( status == CL_SUCCESS )
  449. {
  450. /* emit warning if we are discarding the user's explicit choice */
  451. int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG;
  452. x264_log( h, level, "OpenCL: %s does not support required image formats\n", dev_name );
  453. }
  454. continue;
  455. }
  456. /* user selection of GPU device, skip N first matches */
  457. if( h->param.i_opencl_device )
  458. {
  459. h->param.i_opencl_device--;
  460. continue;
  461. }
  462. h->opencl.queue = ocl->clCreateCommandQueue( context, h->opencl.device, 0, &status );
  463. if( status != CL_SUCCESS || !h->opencl.queue )
  464. continue;
  465. h->opencl.context = context;
  466. context = NULL;
  467. ret = 0;
  468. break;
  469. }
  470. if( !ret )
  471. break;
  472. }
  473. if( !h->param.psz_clbin_file )
  474. h->param.psz_clbin_file = "x264_lookahead.clbin";
  475. if( ret )
  476. x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n" );
  477. else
  478. ret = opencl_lookahead_alloc( h );
  479. fail:
  480. if( context )
  481. ocl->clReleaseContext( context );
  482. x264_free( imageType );
  483. x264_free( devices );
  484. x264_free( platforms );
  485. return ret;
  486. }
  487. static void opencl_lookahead_free( x264_t *h )
  488. {
  489. x264_opencl_function_t *ocl = h->opencl.ocl;
  490. #define RELEASE( a, f ) do { if( a ) { ocl->f( a ); a = NULL; } } while( 0 )
  491. RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel );
  492. RELEASE( h->opencl.downscale_kernel1, clReleaseKernel );
  493. RELEASE( h->opencl.downscale_kernel2, clReleaseKernel );
  494. RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel );
  495. RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel );
  496. RELEASE( h->opencl.memset_kernel, clReleaseKernel );
  497. RELEASE( h->opencl.intra_kernel, clReleaseKernel );
  498. RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel );
  499. RELEASE( h->opencl.hme_kernel, clReleaseKernel );
  500. RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel );
  501. RELEASE( h->opencl.mode_select_kernel, clReleaseKernel );
  502. RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel );
  503. RELEASE( h->opencl.lookahead_program, clReleaseProgram );
  504. RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject );
  505. RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject );
  506. RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject );
  507. for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
  508. RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject );
  509. RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject );
  510. RELEASE( h->opencl.row_satds[0], clReleaseMemObject );
  511. RELEASE( h->opencl.row_satds[1], clReleaseMemObject );
  512. RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject );
  513. RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject );
  514. RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject );
  515. RELEASE( h->opencl.mvp_buffer, clReleaseMemObject );
  516. RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject );
  517. RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject );
  518. RELEASE( h->opencl.frame_stats[0], clReleaseMemObject );
  519. RELEASE( h->opencl.frame_stats[1], clReleaseMemObject );
  520. #undef RELEASE
  521. }
  522. void x264_opencl_lookahead_delete( x264_t *h )
  523. {
  524. x264_opencl_function_t *ocl = h->opencl.ocl;
  525. if( !ocl )
  526. return;
  527. if( h->opencl.queue )
  528. ocl->clFinish( h->opencl.queue );
  529. opencl_lookahead_free( h );
  530. if( h->opencl.queue )
  531. {
  532. ocl->clReleaseCommandQueue( h->opencl.queue );
  533. h->opencl.queue = NULL;
  534. }
  535. if( h->opencl.context )
  536. {
  537. ocl->clReleaseContext( h->opencl.context );
  538. h->opencl.context = NULL;
  539. }
  540. }
  541. void x264_opencl_frame_delete( x264_frame_t *frame )
  542. {
  543. x264_opencl_function_t *ocl = frame->opencl.ocl;
  544. if( !ocl )
  545. return;
  546. #define RELEASEBUF(mem) do { if( mem ) { ocl->clReleaseMemObject( mem ); mem = NULL; } } while( 0 )
  547. for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
  548. RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
  549. RELEASEBUF( frame->opencl.luma_hpel );
  550. RELEASEBUF( frame->opencl.inv_qscale_factor );
  551. RELEASEBUF( frame->opencl.intra_cost );
  552. RELEASEBUF( frame->opencl.lowres_mvs0 );
  553. RELEASEBUF( frame->opencl.lowres_mvs1 );
  554. RELEASEBUF( frame->opencl.lowres_mv_costs0 );
  555. RELEASEBUF( frame->opencl.lowres_mv_costs1 );
  556. #undef RELEASEBUF
  557. }
  558. /* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so
  559. * we consult AMD's ADL interface to detect this situation and disable
  560. * OpenCL on these machines (Linux and Windows) */
  561. #ifdef _WIN32
  562. #define ADL_API_CALL
  563. #define ADL_CALLBACK __stdcall
  564. #define adl_close FreeLibrary
  565. #define adl_address GetProcAddress
  566. #else
  567. #define ADL_API_CALL
  568. #define ADL_CALLBACK
  569. #define adl_close dlclose
  570. #define adl_address dlsym
  571. #endif
  572. typedef void* ( ADL_CALLBACK *ADL_MAIN_MALLOC_CALLBACK )( int );
  573. typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_CREATE )( ADL_MAIN_MALLOC_CALLBACK, int );
  574. typedef int ( ADL_API_CALL *ADL_ADAPTER_NUMBEROFADAPTERS_GET )( int * );
  575. typedef int ( ADL_API_CALL *ADL_POWERXPRESS_SCHEME_GET )( int, int *, int *, int * );
  576. typedef int ( ADL_API_CALL *ADL_MAIN_CONTROL_DESTROY )( void );
  577. #define ADL_OK 0
  578. #define ADL_PX_SCHEME_DYNAMIC 2
  579. static void* ADL_CALLBACK adl_malloc_wrapper( int iSize )
  580. {
  581. return x264_malloc( iSize );
  582. }
  583. static int detect_switchable_graphics( void )
  584. {
  585. void *hDLL;
  586. ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
  587. ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
  588. ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get;
  589. ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
  590. int ret = 0;
  591. #ifdef _WIN32
  592. hDLL = LoadLibraryW( L"atiadlxx.dll" );
  593. if( !hDLL )
  594. hDLL = LoadLibraryW( L"atiadlxy.dll" );
  595. #else
  596. hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL );
  597. #endif
  598. if( !hDLL )
  599. goto fail0;
  600. ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE)adl_address(hDLL, "ADL_Main_Control_Create");
  601. ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY)adl_address(hDLL, "ADL_Main_Control_Destroy");
  602. ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET)adl_address(hDLL, "ADL_Adapter_NumberOfAdapters_Get");
  603. ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET)adl_address(hDLL, "ADL_PowerXpress_Scheme_Get");
  604. if( !ADL_Main_Control_Create || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get ||
  605. !ADL_PowerXpress_Scheme_Get )
  606. goto fail1;
  607. if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1 ) )
  608. goto fail1;
  609. int numAdapters = 0;
  610. if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) )
  611. goto fail2;
  612. for( int i = 0; i < numAdapters; i++ )
  613. {
  614. int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState;
  615. if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) )
  616. break;
  617. if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC )
  618. {
  619. ret = 1;
  620. break;
  621. }
  622. }
  623. fail2:
  624. ADL_Main_Control_Destroy();
  625. fail1:
  626. adl_close( hDLL );
  627. fail0:
  628. return ret;
  629. }