/*****************************************************************************
 * checkasm.c: assembly check tool
 *****************************************************************************
 * Copyright (C) 2003-2018 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/
#include <ctype.h>
#include "common/common.h"
#include "encoder/macroblock.h"

#ifdef _WIN32
#include <windows.h>
#endif

// GCC doesn't align stack variables on ARM, so use .bss
#if ARCH_ARM
#undef ALIGNED_16
#define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
#endif
/* buf1, buf2: initialised to random data and shouldn't write into them */
static uint8_t *buf1, *buf2;
/* buf3, buf4: used to store output */
static uint8_t *buf3, *buf4;
/* pbuf1, pbuf2: initialised to random pixel data and shouldn't write into them. */
static pixel *pbuf1, *pbuf2;
/* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */
static pixel *pbuf3, *pbuf4;

static int quiet = 0;

#define report( name ) { \
    if( used_asm && !quiet ) \
        fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
    if( !ok ) ret = -1; \
}
#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
#define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30     // number of different combinations of cpu flags

typedef struct
{
    void *pointer; // just for detecting duplicates
    uint32_t cpu;
    uint64_t cycles;
    uint32_t den;
} bench_t;

typedef struct
{
    char *name;
    bench_t vers[MAX_CPUS];
} bench_func_t;

static int do_bench = 0;
static int bench_pattern_len = 0;
static const char *bench_pattern = "";
static char func_name[100];
static bench_func_t benchs[MAX_FUNCS];

static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" };
static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
static const char **intra_predict_8x8_names = intra_predict_4x4_names;
static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;

#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
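
/* Read a raw timestamp/cycle counter: the TSC on x86 (serialized with lfence),
 * the timebase register on PPC, the PMU cycle counter on ARMv7/AArch64, and the
 * hardware cycle counter on MIPS. Only differences between two reads are
 * meaningful; the absolute value and unit are whatever the counter ticks in. */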
static inline uint32_t read_time(void)
{
    uint32_t a = 0;
#if HAVE_X86_INLINE_ASM
    asm volatile( "lfence \n"
                  "rdtsc \n"
                  : "=a"(a) :: "edx", "memory" );
#elif ARCH_PPC
    asm volatile( "mftb %0" : "=r"(a) :: "memory" );
#elif HAVE_ARM_INLINE_ASM // ARMv7 only
    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
#elif ARCH_AARCH64
    uint64_t b = 0;
    asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" );
    a = b;
#elif ARCH_MIPS
    asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" );
#endif
    return a;
}
static bench_t* get_bench( const char *name, int cpu )
{
    int i, j;
    for( i = 0; benchs[i].name && strcmp(name, benchs[i].name); i++ )
        assert( i < MAX_FUNCS );
    if( !benchs[i].name )
        benchs[i].name = strdup( name );
    if( !cpu )
        return &benchs[i].vers[0];
    for( j = 1; benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ )
        assert( j < MAX_CPUS );
    benchs[i].vers[j].cpu = cpu;
    return &benchs[i].vers[j];
}
static int cmp_nop( const void *a, const void *b )
{
    return *(uint16_t*)a - *(uint16_t*)b;
}

static int cmp_bench( const void *a, const void *b )
{
    // asciibetical sort except preserving numbers
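    // e.g. "sad_8x8" sorts before "sad_16x16": when both strings are at a digit
    // but only one digit is followed by another digit, the shorter number wins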
    const char *sa = ((bench_func_t*)a)->name;
    const char *sb = ((bench_func_t*)b)->name;
    for( ;; sa++, sb++ )
    {
        if( !*sa && !*sb )
            return 0;
        if( isdigit( *sa ) && isdigit( *sb ) && isdigit( sa[1] ) != isdigit( sb[1] ) )
            return isdigit( sa[1] ) - isdigit( sb[1] );
        if( *sa != *sb )
            return *sa - *sb;
    }
}
static void print_bench(void)
{
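    /* Estimate the timer's own overhead: take 10000 back-to-back read_time()
     * samples, sort them, and average the middle 9000 to reject outliers.
     * Summing 9000 samples and dividing by 900 yields 10x the mean, matching
     * the 10*b->cycles scaling in the final printout below, so each reported
     * figure is average decicycles per call with the overhead subtracted. */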
    uint16_t nops[10000];
    int nfuncs, nop_time=0;

    for( int i = 0; i < 10000; i++ )
    {
        uint32_t t = read_time();
        nops[i] = read_time() - t;
    }
    qsort( nops, 10000, sizeof(uint16_t), cmp_nop );
    for( int i = 500; i < 9500; i++ )
        nop_time += nops[i];
    nop_time /= 900;
    printf( "nop: %d\n", nop_time );

    for( nfuncs = 0; nfuncs < MAX_FUNCS && benchs[nfuncs].name; nfuncs++ );
    qsort( benchs, nfuncs, sizeof(bench_func_t), cmp_bench );
    for( int i = 0; i < nfuncs; i++ )
        for( int j = 0; j < MAX_CPUS && (!j || benchs[i].vers[j].cpu); j++ )
        {
            int k;
            bench_t *b = &benchs[i].vers[j];
            if( !b->den )
                continue;
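            /* skip this version if its function pointer duplicates one
             * already printed for an earlier cpu flag combination */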
            for( k = 0; k < j && benchs[i].vers[k].pointer != b->pointer; k++ );
            if( k < j )
                continue;
            printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
#if HAVE_MMX
                    b->cpu&X264_CPU_AVX512 ? "avx512" :
                    b->cpu&X264_CPU_AVX2 ? "avx2" :
                    b->cpu&X264_CPU_BMI2 ? "bmi2" :
                    b->cpu&X264_CPU_BMI1 ? "bmi1" :
                    b->cpu&X264_CPU_FMA3 ? "fma3" :
                    b->cpu&X264_CPU_FMA4 ? "fma4" :
                    b->cpu&X264_CPU_XOP ? "xop" :
                    b->cpu&X264_CPU_AVX ? "avx" :
                    b->cpu&X264_CPU_SSE42 ? "sse42" :
                    b->cpu&X264_CPU_SSE4 ? "sse4" :
                    b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                    b->cpu&X264_CPU_SSE3 ? "sse3" :
                    b->cpu&X264_CPU_LZCNT ? "lzcnt" :
                    /* print sse2slow only if there's also a sse2fast version of the same func */
                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
                    b->cpu&X264_CPU_SSE2 ? "sse2" :
                    b->cpu&X264_CPU_SSE ? "sse" :
                    b->cpu&X264_CPU_MMX ? "mmx" :
#elif ARCH_PPC
                    b->cpu&X264_CPU_ALTIVEC ? "altivec" :
#elif ARCH_ARM
                    b->cpu&X264_CPU_NEON ? "neon" :
                    b->cpu&X264_CPU_ARMV6 ? "armv6" :
#elif ARCH_AARCH64
                    b->cpu&X264_CPU_NEON ? "neon" :
                    b->cpu&X264_CPU_ARMV8 ? "armv8" :
#elif ARCH_MIPS
                    b->cpu&X264_CPU_MSA ? "msa" :
#endif
                    "c",
#if HAVE_MMX
                    b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                    b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
                    b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
                    b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
                    b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" :
                    b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
#elif ARCH_ARM
                    b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
#endif
                    "",
                    (int64_t)(10*b->cycles/b->den - nop_time)/4 );
        }
}
/* YMM and ZMM registers on x86 are turned off to save power when they haven't been
 * used for some period of time. When they are used there will be a "warmup" period
 * during which performance will be reduced and inconsistent, which is problematic when
 * trying to benchmark individual functions. We can work around this by periodically
 * issuing "dummy" instructions that use those registers to keep them powered on. */
static void (*simd_warmup_func)( void ) = NULL;
#define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 )
#if ARCH_X86 || ARCH_X86_64
int x264_stack_pagealign( int (*func)(), int align );
void x264_checkasm_warmup_avx( void );
void x264_checkasm_warmup_avx512( void );

/* detect when callee-saved regs aren't saved
 * needs an explicit asm check because it only sometimes crashes in normal use. */
intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
#else
#define x264_stack_pagealign( func, align ) func()
#endif

#if ARCH_AARCH64
intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
#endif

#if ARCH_ARM
intptr_t x264_checkasm_call_neon( intptr_t (*func)(), int *ok, ... );
intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... );
intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon;
#endif
#define call_c1(func,...) func(__VA_ARGS__)

#if ARCH_X86_64
/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
 * This is done by clobbering the stack with junk around the stack pointer and calling the
 * assembly function through x264_checkasm_call with added dummy arguments which forces all
 * real arguments to be passed on the stack and not in registers. For 32-bit arguments the
 * upper half of the 64-bit register location on the stack will now contain junk. Note that
 * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may
 * overwrite the junk written to the stack, so there's no guarantee that it will always
 * detect all functions that assume zero-extension.
 */
void x264_checkasm_stack_clobber( uint64_t clobber, ... );
#define call_a1(func,...) ({ \
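    /* broadcast a random 16-bit value into all four 16-bit lanes of a 64-bit word */ \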
    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
    simd_warmup(); \
    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
#elif ARCH_AARCH64 && !defined(__APPLE__)
void x264_checkasm_stack_clobber( uint64_t clobber, ... );
#define call_a1(func,...) ({ \
    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \
    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); })
#elif ARCH_X86 || ARCH_ARM
#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
#else
#define call_a1 call_c1
#endif

#if ARCH_ARM
#define call_a1_64(func,...) ((uint64_t (*)(intptr_t(*)(), int*, ...))x264_checkasm_call)( (intptr_t(*)())func, &ok, __VA_ARGS__ )
#else
#define call_a1_64 call_a1
#endif
#define call_bench(func,cpu,...)\
    if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\
    {\
        uint64_t tsum = 0;\
        int tcount = 0;\
        call_a1(func, __VA_ARGS__);\
        for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
        {\
            simd_warmup();\
            uint32_t t = read_time();\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            t = read_time() - t;\
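            /* accept a sample only if it's within 4x the running average,
             * rejecting outliers (e.g. from interrupts or context switches) */\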
            if( (uint64_t)t*tcount <= tsum*4 && ti > 0 )\
            {\
                tsum += t;\
                tcount++;\
            }\
        }\
        bench_t *b = get_bench( func_name, cpu );\
        b->cycles += tsum;\
        b->den += tcount;\
        b->pointer = func;\
    }
/* for most functions, run benchmark and correctness test at the same time.
 * for those that modify their inputs, run the above macros separately */
#define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); })
#define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
#define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
#define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })
#define call_a64(func,...) ({ call_a2(func,__VA_ARGS__); call_a1_64(func,__VA_ARGS__); })
static int check_pixel( int cpu_ref, int cpu_new )
{
    x264_pixel_function_t pixel_c;
    x264_pixel_function_t pixel_ref;
    x264_pixel_function_t pixel_asm;
    x264_predict_t predict_4x4[12];
    x264_predict8x8_t predict_8x8[12];
    x264_predict_8x8_filter_t predict_8x8_filter;
    ALIGNED_16( pixel edge[36] );
    uint16_t cost_mv[32];
    int ret = 0, ok, used_asm;

    x264_pixel_init( 0, &pixel_c );
    x264_pixel_init( cpu_ref, &pixel_ref );
    x264_pixel_init( cpu_new, &pixel_asm );
    x264_predict_4x4_init( 0, predict_4x4 );
    x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
    predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

    // maximize sum
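    // (pbuf3/pbuf4 get complementary 0/PIXEL_MAX patterns, so every per-pixel
    // difference is the maximum possible)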
    for( int i = 0; i < 256; i++ )
    {
        int z = i|(i>>4);
        z ^= z>>2;
        z ^= z>>1;
        pbuf4[i] = -(z&1) & PIXEL_MAX;
        pbuf3[i] = ~pbuf4[i] & PIXEL_MAX;
    }
    // random pattern made of maxed pixel differences, in case an intermediate value overflows
    for( int i = 256; i < 0x1000; i++ )
    {
        pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX;
        pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX;
    }
#define TEST_PIXEL( name, align ) \
    ok = 1, used_asm = 0; \
    for( int i = 0; i < ARRAY_ELEMS(pixel_c.name); i++ ) \
    { \
        int res_c, res_asm; \
        if( pixel_asm.name[i] != pixel_ref.name[i] ) \
        { \
            set_func_name( "%s_%s", #name, pixel_names[i] ); \
            used_asm = 1; \
            for( int j = 0; j < 64; j++ ) \
            { \
                intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; \
                res_c   = call_c( pixel_c.name[i],   pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
                res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
                if( res_c != res_asm ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
                    break; \
                } \
            } \
            for( int j = 0; j < 0x1000 && ok; j += 256 ) \
            { \
                res_c   = pixel_c  .name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
                res_asm = pixel_asm.name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
                if( res_c != res_asm ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: overflow %d != %d\n", i, res_c, res_asm ); \
                } \
            } \
        } \
    } \
    report( "pixel " #name " :" );
    TEST_PIXEL( sad, 0 );
    TEST_PIXEL( sad_aligned, 1 );
    TEST_PIXEL( ssd, 1 );
    TEST_PIXEL( satd, 0 );
    TEST_PIXEL( sa8d, 1 );
    ok = 1, used_asm = 0;
    if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] )
    {
        set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] );
        used_asm = 1;
        for( int j = 0; j < 64; j++ )
        {
            uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
            uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
            uint64_t res_a = call_a64( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
            uint32_t cost8_a = res_a;
            uint32_t cost4_a = res_a >> 32;
            if( cost8_a != cost8_c || cost4_a != cost4_c )
            {
                ok = 0;
                fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
                         cost8_c, cost4_c, cost8_a, cost4_a );
                break;
            }
        }
        for( int j = 0; j < 0x1000 && ok; j += 256 )
        {
            uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
            uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
            uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
            uint32_t cost8_a = res_a;
            uint32_t cost4_a = res_a >> 32;
            if( cost8_a != cost8_c || cost4_a != cost4_c )
            {
                ok = 0;
                fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
                         cost8_c, cost4_c, cost8_a, cost4_a );
            }
        }
    }
    report( "pixel sa8d_satd :" );
#define TEST_PIXEL_X( N ) \
    ok = 1; used_asm = 0; \
    for( int i = 0; i < 7; i++ ) \
    { \
        ALIGNED_16( int res_c[4] ) = {0}; \
        ALIGNED_16( int res_asm[4] ) = {0}; \
        if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
        { \
            set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
            used_asm = 1; \
            for( int j = 0; j < 64; j++ ) \
            { \
                pixel *pix2 = pbuf2+j; \
                res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2,   64 ); \
                res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
                res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
                if( N == 4 ) \
                { \
                    res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
                    call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
                } \
                else \
                    call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
                if( memcmp(res_c, res_asm, N*sizeof(int)) ) \
                { \
                    ok = 0; \
                    fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
                             i, res_c[0], res_c[1], res_c[2], res_c[3], \
                             res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
                } \
                if( N == 4 ) \
                    call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
                else \
                    call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
            } \
        } \
    } \
    report( "pixel sad_x"#N" :" );

    TEST_PIXEL_X(3);
    TEST_PIXEL_X(4);
#define TEST_PIXEL_VAR( i ) \
    if( pixel_asm.var[i] != pixel_ref.var[i] ) \
    { \
        set_func_name( "%s_%s", "var", pixel_names[i] ); \
        used_asm = 1; \
        /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
        call_c1( pixel_c.var[i],   pbuf1,           16 ); \
        call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
        uint64_t res_c   = pixel_c.var[i]( pbuf1, 16 ); \
        uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
        if( res_c != res_asm ) \
        { \
            ok = 0; \
            fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
        } \
        call_c2( pixel_c.var[i],   pbuf1, (intptr_t)16 ); \
        call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
    }

    ok = 1; used_asm = 0;
    TEST_PIXEL_VAR( PIXEL_16x16 );
    TEST_PIXEL_VAR( PIXEL_8x16 );
    TEST_PIXEL_VAR( PIXEL_8x8 );
    report( "pixel var :" );
#define TEST_PIXEL_VAR2( i ) \
    if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
    { \
        int res_c, res_asm; \
        ALIGNED_ARRAY_8( int, ssd_c,  [2] ); \
        ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \
        set_func_name( "%s_%s", "var2", pixel_names[i] ); \
        used_asm = 1; \
        res_c   = call_c( pixel_c.var2[i],   pbuf1, pbuf2, ssd_c   ); \
        res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \
        if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \
        { \
            ok = 0; \
            fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_PIXEL_VAR2( PIXEL_8x16 );
    TEST_PIXEL_VAR2( PIXEL_8x8 );
    report( "pixel var2 :" );
    ok = 1; used_asm = 0;
    for( int i = 0; i < 4; i++ )
        if( pixel_asm.hadamard_ac[i] != pixel_ref.hadamard_ac[i] )
        {
            set_func_name( "hadamard_ac_%s", pixel_names[i] );
            used_asm = 1;
            for( int j = 0; j < 32; j++ )
            {
                pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
                call_c1( pixel_c.hadamard_ac[i],   pbuf1, (intptr_t)16 );
                call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
                uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
                uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
                if( rc != ra )
                {
                    ok = 0;
                    fprintf( stderr, "hadamard_ac[%d]: %d,%d != %d,%d\n", i, (int)rc, (int)(rc>>32), (int)ra, (int)(ra>>32) );
                    break;
                }
            }
            call_c2( pixel_c.hadamard_ac[i],   pbuf1, (intptr_t)16 );
            call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
        }
    report( "pixel hadamard_ac :" );
    // maximize sum
    for( int i = 0; i < 32; i++ )
        for( int j = 0; j < 16; j++ )
            pbuf4[16*i+j] = -((i+j)&1) & PIXEL_MAX;
    ok = 1; used_asm = 0;
    if( pixel_asm.vsad != pixel_ref.vsad )
    {
        for( int h = 2; h <= 32; h += 2 )
        {
            int res_c, res_asm;
            set_func_name( "vsad" );
            used_asm = 1;
            for( int j = 0; j < 2 && ok; j++ )
            {
                pixel *p = j ? pbuf4 : pbuf1;
                res_c   = call_c( pixel_c.vsad,   p, (intptr_t)16, h );
                res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h );
                if( res_c != res_asm )
                {
                    ok = 0;
                    fprintf( stderr, "vsad: height=%d, %d != %d\n", h, res_c, res_asm );
                    break;
                }
            }
        }
    }
    report( "pixel vsad :" );
    ok = 1; used_asm = 0;
    if( pixel_asm.asd8 != pixel_ref.asd8 )
    {
        set_func_name( "asd8" );
        used_asm = 1;
        int res_c = call_c( pixel_c.asd8,   pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
        int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
        if( res_c != res_a )
        {
            ok = 0;
            fprintf( stderr, "asd: %d != %d\n", res_c, res_a );
        }
    }
    report( "pixel asd :" );
#define TEST_INTRA_X3( name, i8x8, ... ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        ALIGNED_16( int res_c[4] ); \
        ALIGNED_16( int res_asm[4] ); \
        set_func_name( #name ); \
        used_asm = 1; \
        call_c( pixel_c.name,   pbuf1+48, i8x8 ? edge : pbuf3+48, res_c ); \
        call_a( pixel_asm.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_asm ); \
        if( memcmp(res_c, res_asm, 3 * sizeof(*res_c)) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
                     res_c[0], res_c[1], res_c[2], \
                     res_asm[0], res_asm[1], res_asm[2] ); \
        } \
    }
#define TEST_INTRA_X9( name, cmp ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
        for( int i=0; i<17; i++ ) \
            bitcosts[i] = 9*(i!=8); \
        memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
        memcpy( pbuf4, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); \
        for( int i=0; i<32; i++ ) \
        { \
            pixel *fenc = pbuf1+48+i*12; \
            pixel *fdec1 = pbuf3+48+i*12; \
            pixel *fdec2 = pbuf4+48+i*12; \
            int pred_mode = i%9; \
            int res_c = INT_MAX; \
            for( int j=0; j<9; j++ ) \
            { \
                predict_4x4[j]( fdec1 ); \
                int cost = pixel_c.cmp[PIXEL_4x4]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
                if( cost < (uint16_t)res_c ) \
                    res_c = cost + (j<<16); \
            } \
            predict_4x4[res_c>>16]( fdec1 ); \
            int res_a = call_a( pixel_asm.name, fenc, fdec2, bitcosts+8-pred_mode ); \
            if( res_c != res_a ) \
            { \
                ok = 0; \
                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
                break; \
            } \
            if( memcmp(fdec1, fdec2, 4*FDEC_STRIDE*sizeof(pixel)) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name" [FAILED]\n" ); \
                for( int j=0; j<16; j++ ) \
                    fprintf( stderr, "%02x ", fdec1[(j&3)+(j>>2)*FDEC_STRIDE] ); \
                fprintf( stderr, "\n" ); \
                for( int j=0; j<16; j++ ) \
                    fprintf( stderr, "%02x ", fdec2[(j&3)+(j>>2)*FDEC_STRIDE] ); \
                fprintf( stderr, "\n" ); \
                break; \
            } \
        } \
    }
#define TEST_INTRA8_X9( name, cmp ) \
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        ALIGNED_ARRAY_64( uint16_t, bitcosts,[17] ); \
        ALIGNED_ARRAY_16( uint16_t, satds_c,[16] ); \
        ALIGNED_ARRAY_16( uint16_t, satds_a,[16] ); \
        memset( satds_c, 0, 16 * sizeof(*satds_c) ); \
        memset( satds_a, 0, 16 * sizeof(*satds_a) ); \
        for( int i=0; i<17; i++ ) \
            bitcosts[i] = 9*(i!=8); \
        for( int i=0; i<32; i++ ) \
        { \
            pixel *fenc = pbuf1+48+i*12; \
            pixel *fdec1 = pbuf3+48+i*12; \
            pixel *fdec2 = pbuf4+48+i*12; \
            int pred_mode = i%9; \
            int res_c = INT_MAX; \
            predict_8x8_filter( fdec1, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); \
            for( int j=0; j<9; j++ ) \
            { \
                predict_8x8[j]( fdec1, edge ); \
                satds_c[j] = pixel_c.cmp[PIXEL_8x8]( fenc, FENC_STRIDE, fdec1, FDEC_STRIDE ) + 9*(j!=pred_mode); \
                if( satds_c[j] < (uint16_t)res_c ) \
                    res_c = satds_c[j] + (j<<16); \
            } \
            predict_8x8[res_c>>16]( fdec1, edge ); \
            int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \
            if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \
                for( int j = 0; j < 9; j++ ) \
                    fprintf( stderr, "%5d ", satds_c[j]); \
                fprintf( stderr, "\n" ); \
                for( int j = 0; j < 9; j++ ) \
                    fprintf( stderr, "%5d ", satds_a[j]); \
                fprintf( stderr, "\n" ); \
                break; \
            } \
            for( int j=0; j<8; j++ ) \
                if( memcmp(fdec1+j*FDEC_STRIDE, fdec2+j*FDEC_STRIDE, 8*sizeof(pixel)) ) \
                    ok = 0; \
            if( !ok ) \
            { \
                fprintf( stderr, #name" [FAILED]\n" ); \
                for( int j=0; j<8; j++ ) \
                { \
                    for( int k=0; k<8; k++ ) \
                        fprintf( stderr, "%02x ", fdec1[k+j*FDEC_STRIDE] ); \
                    fprintf( stderr, "\n" ); \
                } \
                fprintf( stderr, "\n" ); \
                for( int j=0; j<8; j++ ) \
                { \
                    for( int k=0; k<8; k++ ) \
                        fprintf( stderr, "%02x ", fdec2[k+j*FDEC_STRIDE] ); \
                    fprintf( stderr, "\n" ); \
                } \
                fprintf( stderr, "\n" ); \
                break; \
            } \
        } \
    }
    memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
    ok = 1; used_asm = 0;
    TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
    TEST_INTRA_X3( intra_satd_x3_8x16c, 0 );
    TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
    TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
    TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
    report( "intra satd_x3 :" );
    ok = 1; used_asm = 0;
    TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
    TEST_INTRA_X3( intra_sad_x3_8x16c, 0 );
    TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
    TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
    TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
    report( "intra sad_x3 :" );
    ok = 1; used_asm = 0;
    TEST_INTRA_X9( intra_satd_x9_4x4, satd );
    TEST_INTRA8_X9( intra_sa8d_x9_8x8, sa8d );
    report( "intra satd_x9 :" );
    ok = 1; used_asm = 0;
    TEST_INTRA_X9( intra_sad_x9_4x4, sad );
    TEST_INTRA8_X9( intra_sad_x9_8x8, sad );
    report( "intra sad_x9 :" );
    ok = 1; used_asm = 0;
    if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core )
    {
        used_asm = 1;
        set_func_name( "ssd_nv12" );
        uint64_t res_u_c, res_v_c, res_u_a, res_v_a;
        for( int w = 8; w <= 360; w += 8 )
        {
            pixel_c.ssd_nv12_core(   pbuf1, 368, pbuf2, 368, w, 8, &res_u_c, &res_v_c );
            pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, w, 8, &res_u_a, &res_v_a );
            if( res_u_c != res_u_a || res_v_c != res_v_a )
            {
                ok = 0;
                fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
                         res_u_c, res_v_c, res_u_a, res_v_a );
            }
        }
        call_c( pixel_c.ssd_nv12_core,   pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c );
        call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a );
    }
    report( "ssd_nv12 :" );
    if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
        pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
    {
        int cnt;
        float res_c, res_a;
        ALIGNED_16( int sums[5][4] ) = {{0}};
        used_asm = ok = 1;
        x264_emms();
        res_c = x264_pixel_ssim_wxh( &pixel_c,   pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
        res_a = x264_pixel_ssim_wxh( &pixel_asm, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3, &cnt );
        if( fabs( res_c - res_a ) > 1e-6 )
        {
            ok = 0;
            fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
        }
        set_func_name( "ssim_core" );
        call_c( pixel_c.ssim_4x4x2_core,   pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
        call_a( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
        set_func_name( "ssim_end" );
        call_c2( pixel_c.ssim_end4,   sums, sums, 4 );
        call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
        /* check incorrect assumptions that 32-bit ints are zero-extended to 64-bit */
        call_c1( pixel_c.ssim_end4,   sums, sums, 3 );
        call_a1( pixel_asm.ssim_end4, sums, sums, 3 );
        report( "ssim :" );
    }
    ok = 1; used_asm = 0;
    for( int i = 0; i < 32; i++ )
        cost_mv[i] = i*10;
    for( int i = 0; i < 100 && ok; i++ )
        if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
        {
            ALIGNED_16( uint16_t sums[72] );
            ALIGNED_16( int dc[4] );
            ALIGNED_16( int16_t mvs_a[48] );
            ALIGNED_16( int16_t mvs_c[48] );
            int mvn_a, mvn_c;
            int thresh = rand() & 0x3fff;
            set_func_name( "esa_ads" );
            for( int j = 0; j < 72; j++ )
                sums[j] = rand() & 0x3fff;
            for( int j = 0; j < 4; j++ )
                dc[j] = rand() & 0x3fff;
            used_asm = 1;
            mvn_c = call_c( pixel_c.ads[i&3],   dc, sums, 32, cost_mv, mvs_c, 28, thresh );
            mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
            if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
            {
                ok = 0;
                printf( "c%d: ", i&3 );
                for( int j = 0; j < mvn_c; j++ )
                    printf( "%d ", mvs_c[j] );
                printf( "\na%d: ", i&3 );
                for( int j = 0; j < mvn_a; j++ )
                    printf( "%d ", mvs_a[j] );
                printf( "\n\n" );
            }
        }
    report( "esa ads:" );

    return ret;
}
static int check_dct( int cpu_ref, int cpu_new )
{
    x264_dct_function_t dct_c;
    x264_dct_function_t dct_ref;
    x264_dct_function_t dct_asm;
    x264_quant_function_t qf;
    int ret = 0, ok, used_asm, interlace = 0;
    ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
    ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] );
    ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
    ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
    ALIGNED_16( dctcoef dctdc[2][8] );
    x264_t h_buf;
    x264_t *h = &h_buf;

    x264_dct_init( 0, &dct_c );
    x264_dct_init( cpu_ref, &dct_ref );
    x264_dct_init( cpu_new, &dct_asm );

    memset( h, 0, sizeof(*h) );
    x264_param_default( &h->param );
    h->sps->i_chroma_format_idc = 1;
    h->chroma_qp_table = i_chroma_qp_table + 12;
    h->param.analyse.i_luma_deadzone[0] = 0;
    h->param.analyse.i_luma_deadzone[1] = 0;
    h->param.analyse.b_transform_8x8 = 1;
    for( int i = 0; i < 6; i++ )
        h->sps->scaling_list[i] = x264_cqm_flat16;
    x264_cqm_init( h );
    x264_quant_init( h, 0, &qf );

    /* overflow test cases */
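    /* (fill pbuf3/pbuf4 with complementary stripes of 0 and PIXEL_MAX, chosen
     * to drive the DCT inputs towards their extremes so that intermediate
     * overflows in the asm show up) */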
    for( int i = 0; i < 5; i++ )
    {
        pixel *enc = &pbuf3[16*i*FENC_STRIDE];
        pixel *dec = &pbuf4[16*i*FDEC_STRIDE];
        for( int j = 0; j < 16; j++ )
        {
            int cond_a = (i < 2) ? 1 : ((j&3) == 0 || (j&3) == (i-1));
            int cond_b = (i == 0) ? 1 : !cond_a;
            enc[0] = enc[1] = enc[4] = enc[5] = enc[8] = enc[9] = enc[12] = enc[13] = cond_a ? PIXEL_MAX : 0;
            enc[2] = enc[3] = enc[6] = enc[7] = enc[10] = enc[11] = enc[14] = enc[15] = cond_b ? PIXEL_MAX : 0;
            for( int k = 0; k < 4; k++ )
                dec[k] = PIXEL_MAX - enc[k];
            enc += FENC_STRIDE;
            dec += FDEC_STRIDE;
        }
    }
#define TEST_DCT( name, t1, t2, size ) \
    if( dct_asm.name != dct_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        pixel *enc = pbuf3; \
        pixel *dec = pbuf4; \
        for( int j = 0; j < 5; j++ ) \
        { \
            call_c( dct_c.name, t1, &pbuf1[j*64], &pbuf2[j*64] ); \
            call_a( dct_asm.name, t2, &pbuf1[j*64], &pbuf2[j*64] ); \
            if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name " [FAILED]\n" ); \
                for( int k = 0; k < size; k++ )\
                    printf( "%d ", ((dctcoef*)t1)[k] );\
                printf("\n");\
                for( int k = 0; k < size; k++ )\
                    printf( "%d ", ((dctcoef*)t2)[k] );\
                printf("\n");\
                break; \
            } \
            call_c( dct_c.name, t1, enc, dec ); \
            call_a( dct_asm.name, t2, enc, dec ); \
            if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name " [FAILED] (overflow)\n" ); \
                break; \
            } \
            enc += 16*FENC_STRIDE; \
            dec += 16*FDEC_STRIDE; \
        } \
    }
    ok = 1; used_asm = 0;
    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
    TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
    TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 );
    TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
    report( "sub_dct4 :" );
    ok = 1; used_asm = 0;
    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 );
    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 );
    report( "sub_dct8 :" );
#undef TEST_DCT
    // fdct and idct are denormalized by different factors, so quant/dequant
    // is needed to force the coefs into the right range.
    dct_c.sub16x16_dct( dct4, pbuf1, pbuf2 );
    dct_c.sub16x16_dct8( dct8, pbuf1, pbuf2 );
    for( int i = 0; i < 16; i++ )
    {
        qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
        qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 );
    }
    for( int i = 0; i < 4; i++ )
    {
        qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] );
        qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 );
    }
    x264_cqm_delete( h );
#define TEST_IDCT( name, src ) \
    if( dct_asm.name != dct_ref.name ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \
        memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \
        memcpy( dct1, src, 256 * sizeof(dctcoef) ); \
        memcpy( dct2, src, 256 * sizeof(dctcoef) ); \
        call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
        call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
        if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
        call_c2( dct_c.name, pbuf3, (void*)dct1 ); \
        call_a2( dct_asm.name, pbuf4, (void*)dct2 ); \
    }
    ok = 1; used_asm = 0;
    TEST_IDCT( add4x4_idct, dct4 );
    TEST_IDCT( add8x8_idct, dct4 );
    TEST_IDCT( add8x8_idct_dc, dct4 );
    TEST_IDCT( add16x16_idct, dct4 );
    TEST_IDCT( add16x16_idct_dc, dct4 );
    report( "add_idct4 :" );
    ok = 1; used_asm = 0;
    TEST_IDCT( add8x8_idct8, dct8 );
    TEST_IDCT( add16x16_idct8, dct8 );
    report( "add_idct8 :" );
#undef TEST_IDCT
#define TEST_DCTDC( name )\
    ok = 1; used_asm = 0;\
    if( dct_asm.name != dct_ref.name )\
    {\
        set_func_name( #name );\
        used_asm = 1;\
        uint16_t *p = (uint16_t*)buf1;\
        for( int i = 0; i < 16 && ok; i++ )\
        {\
            for( int j = 0; j < 16; j++ )\
                dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
                           : ((*p++)&0x1fff)-0x1000; /* general case */\
            memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
            call_c1( dct_c.name, dct1[0] );\
            call_a1( dct_asm.name, dct2[0] );\
            if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\
                ok = 0;\
        }\
        call_c2( dct_c.name, dct1[0] );\
        call_a2( dct_asm.name, dct2[0] );\
    }\
    report( #name " :" );

    TEST_DCTDC( dct4x4dc );
    TEST_DCTDC( idct4x4dc );
#undef TEST_DCTDC
#define TEST_DCTDC_CHROMA( name )\
    ok = 1; used_asm = 0;\
    if( dct_asm.name != dct_ref.name )\
    {\
        set_func_name( #name );\
        used_asm = 1;\
        uint16_t *p = (uint16_t*)buf1;\
        for( int i = 0; i < 16 && ok; i++ )\
        {\
            for( int j = 0; j < 8; j++ )\
                dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
                           : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
                           : ((*p++)&0x1fff)-0x1000; /* general case */\
            memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\
            call_c1( dct_c.name, dctdc[0], dct1 );\
            call_a1( dct_asm.name, dctdc[1], dct2 );\
            if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\
            {\
                ok = 0;\
                fprintf( stderr, #name " [FAILED]\n" ); \
            }\
        }\
        call_c2( dct_c.name, dctdc[0], dct1 );\
        call_a2( dct_asm.name, dctdc[1], dct2 );\
    }\
    report( #name " :" );

    TEST_DCTDC_CHROMA( dct2x4dc );
#undef TEST_DCTDC_CHROMA
    x264_zigzag_function_t zigzag_c[2];
    x264_zigzag_function_t zigzag_ref[2];
    x264_zigzag_function_t zigzag_asm[2];

    ALIGNED_ARRAY_64( dctcoef, level1,[64] );
    ALIGNED_ARRAY_64( dctcoef, level2,[64] );

#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
        used_asm = 1; \
        for( int i = 0; i < size*size; i++ ) \
            dct[i] = i; \
        call_c( zigzag_c[interlace].name, t1, dct ); \
        call_a( zigzag_asm[interlace].name, t2, dct ); \
        if( memcmp( t1, t2, size*size*sizeof(dctcoef) ) ) \
        { \
            ok = 0; \
            for( int i = 0; i < 2; i++ ) \
            { \
                dctcoef *d = (dctcoef*)(i ? t2 : t1); \
                for( int j = 0; j < size; j++ ) \
                { \
                    for( int k = 0; k < size; k++ ) \
                        fprintf( stderr, "%2d ", d[k+j*8] ); \
                    fprintf( stderr, "\n" ); \
                } \
                fprintf( stderr, "\n" ); \
            } \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
    }
#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        int nz_a, nz_c; \
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
        used_asm = 1; \
        memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
        memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
        nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
        nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
        if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
        call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3 ); \
        call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4 ); \
    }
#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        int nz_a, nz_c; \
        dctcoef dc_a, dc_c; \
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
        used_asm = 1; \
        for( int i = 0; i < 2; i++ ) \
        { \
            memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
            memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
            for( int j = 0; j < 4; j++ ) \
            { \
                memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
                memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
            } \
            nz_c = call_c1( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
            nz_a = call_a1( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
            { \
                ok = 0; \
                fprintf( stderr, #name " [FAILED]\n" ); \
                break; \
            } \
        } \
        call_c2( zigzag_c[interlace].name, t1, pbuf2, pbuf3, &dc_c ); \
        call_a2( zigzag_asm[interlace].name, t2, pbuf2, pbuf4, &dc_a ); \
    }
#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
    if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
    { \
        for( int j = 0; j < 100; j++ ) \
        { \
            set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
            used_asm = 1; \
            memcpy(dct, buf1, size*sizeof(dctcoef)); \
            for( int i = 0; i < size; i++ ) \
                dct[i] = rand()&0x1F ? 0 : dct[i]; \
            memcpy(buf3, buf4, 10); \
            call_c( zigzag_c[interlace].name, t1, dct, buf3 ); \
            call_a( zigzag_asm[interlace].name, t2, dct, buf4 ); \
            if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( buf3, buf4, 10 ) ) \
            { \
                ok = 0; \
                printf( "%d: %d %d %d %d\n%d %d %d %d\n\n", memcmp( t1, t2, size*sizeof(dctcoef) ), \
                        buf3[0], buf3[1], buf3[8], buf3[9], buf4[0], buf4[1], buf4[8], buf4[9] ); \
                break; \
            } \
        } \
    }
    x264_zigzag_init( 0, &zigzag_c[0], &zigzag_c[1] );
    x264_zigzag_init( cpu_ref, &zigzag_ref[0], &zigzag_ref[1] );
    x264_zigzag_init( cpu_new, &zigzag_asm[0], &zigzag_asm[1] );

    ok = 1; used_asm = 0;
    TEST_INTERLEAVE( interleave_8x8_cavlc, level1, level2, dct8[0], 64 );
    report( "zigzag_interleave :" );

    for( interlace = 0; interlace <= 1; interlace++ )
    {
        ok = 1; used_asm = 0;
        TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, dct8[0], 8 );
        TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 4 );
        TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
        TEST_ZIGZAG_SUB( sub_8x8, level1, level2, 64 );
        TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
        report( interlace ? "zigzag_field :" : "zigzag_frame :" );
    }
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB

    return ret;
}
static int check_mc( int cpu_ref, int cpu_new )
{
    x264_mc_functions_t mc_c;
    x264_mc_functions_t mc_ref;
    x264_mc_functions_t mc_a;
    x264_pixel_function_t pixf;

    pixel *src     = &(pbuf1)[2*64+2];
    pixel *src2[4] = { &(pbuf1)[3*64+2], &(pbuf1)[5*64+2],
                       &(pbuf1)[7*64+2], &(pbuf1)[9*64+2] };
    pixel *dst1    = pbuf3;
    pixel *dst2    = pbuf4;

    int ret = 0, ok, used_asm;

    x264_mc_init( 0, &mc_c, 0 );
    x264_mc_init( cpu_ref, &mc_ref, 0 );
    x264_mc_init( cpu_new, &mc_a, 0 );
    x264_pixel_init( 0, &pixf );

#define MC_TEST_LUMA( w, h ) \
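    /* mc_luma is only exercised for power-of-two widths with h <= 16; the
     * 12- and 20-wide cases invoked below are covered via get_ref only */ \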
    if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
    { \
        const x264_weight_t *weight = x264_weight_none; \
        set_func_name( "mc_luma_%dx%d", w, h ); \
        used_asm = 1; \
        for( int i = 0; i < 1024; i++ ) \
            pbuf3[i] = pbuf4[i] = 0xCD; \
        call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
        call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
        if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
        { \
            fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
            ok = 0; \
        } \
    } \
    if( mc_a.get_ref != mc_ref.get_ref ) \
    { \
        pixel *ref = dst2; \
        intptr_t ref_stride = 32; \
        int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
        const x264_weight_t *weight = x264_weight_none; \
        set_func_name( "get_ref_%dx%d", w_checked, h ); \
        used_asm = 1; \
        for( int i = 0; i < 1024; i++ ) \
            pbuf3[i] = pbuf4[i] = 0xCD; \
        call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
        ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \
        for( int i = 0; i < h; i++ ) \
            if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
            { \
                fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w_checked, h ); \
                ok = 0; \
                break; \
            } \
    }
  1143. #define MC_TEST_CHROMA( w, h ) \
  1144. if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
  1145. { \
  1146. set_func_name( "mc_chroma_%dx%d", w, h ); \
  1147. used_asm = 1; \
  1148. for( int i = 0; i < 1024; i++ ) \
  1149. pbuf3[i] = pbuf4[i] = 0xCD; \
  1150. call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
  1151. call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
  1152. /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
  1153. for( int j = 0; j < h; j++ ) \
  1154. for( int i = w; i < 8; i++ ) \
  1155. { \
  1156. dst2[i+j*16+8] = dst1[i+j*16+8]; \
  1157. dst2[i+j*16 ] = dst1[i+j*16 ]; \
  1158. } \
  1159. if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
  1160. { \
  1161. fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
  1162. ok = 0; \
  1163. } \
  1164. }
  1165. ok = 1; used_asm = 0;
  1166. for( int dy = -8; dy < 8; dy++ )
  1167. for( int dx = -128; dx < 128; dx++ )
  1168. {
  1169. if( rand()&15 ) continue; // running all of them is too slow
  1170. MC_TEST_LUMA( 20, 18 );
  1171. MC_TEST_LUMA( 16, 16 );
  1172. MC_TEST_LUMA( 16, 8 );
  1173. MC_TEST_LUMA( 12, 10 );
  1174. MC_TEST_LUMA( 8, 16 );
  1175. MC_TEST_LUMA( 8, 8 );
  1176. MC_TEST_LUMA( 8, 4 );
  1177. MC_TEST_LUMA( 4, 8 );
  1178. MC_TEST_LUMA( 4, 4 );
  1179. }
  1180. report( "mc luma :" );
  1181. ok = 1; used_asm = 0;
  1182. for( int dy = -1; dy < 9; dy++ )
  1183. for( int dx = -128; dx < 128; dx++ )
  1184. {
  1185. if( rand()&15 ) continue;
  1186. MC_TEST_CHROMA( 8, 8 );
  1187. MC_TEST_CHROMA( 8, 4 );
  1188. MC_TEST_CHROMA( 4, 8 );
  1189. MC_TEST_CHROMA( 4, 4 );
  1190. MC_TEST_CHROMA( 4, 2 );
  1191. MC_TEST_CHROMA( 2, 4 );
  1192. MC_TEST_CHROMA( 2, 2 );
  1193. }
  1194. report( "mc chroma :" );
  1195. #undef MC_TEST_LUMA
  1196. #undef MC_TEST_CHROMA
  1197. #define MC_TEST_AVG( name, weight ) \
  1198. { \
  1199. for( int i = 0; i < 12; i++ ) \
  1200. { \
  1201. memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
  1202. memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
  1203. if( mc_a.name[i] != mc_ref.name[i] ) \
  1204. { \
  1205. set_func_name( "%s_%s", #name, pixel_names[i] ); \
  1206. used_asm = 1; \
  1207. call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
  1208. call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
  1209. if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
  1210. { \
  1211. ok = 0; \
  1212. fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
  1213. } \
  1214. call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
  1215. call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
  1216. } \
  1217. } \
  1218. }
  1219. ok = 1, used_asm = 0;
  1220. for( int w = -63; w <= 127 && ok; w++ )
  1221. MC_TEST_AVG( avg, w );
  1222. report( "mc wpredb :" );
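
/* Weighted prediction: sweep (scale s, denom d, offset o) triples, skipping
 * s == 1<<d since that degenerates to a pure offset and is covered by the
 * dedicated offsetadd/offsetsub passes below. weight_cache() lets the asm
 * side select a specialized weightfn for the given parameters. */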
#define MC_TEST_WEIGHT( name, weight, aligned ) \
    int align_off = (aligned ? 0 : rand()%16); \
    for( int i = 1; i <= 5; i++ ) \
    { \
        ALIGNED_16( pixel buffC[640] ); \
        ALIGNED_16( pixel buffA[640] ); \
        int j = X264_MAX( i*4, 2 ); \
        memset( buffC, 0, 640 * sizeof(pixel) ); \
        memset( buffA, 0, 640 * sizeof(pixel) ); \
        x264_t ha; \
        ha.mc = mc_a; \
        /* w12 is the same as w16 in some cases */ \
        if( i == 3 && mc_a.name[i] == mc_a.name[i+1] ) \
            continue; \
        if( mc_a.name[i] != mc_ref.name[i] ) \
        { \
            set_func_name( "%s_w%d", #name, j ); \
            used_asm = 1; \
            call_c1( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
            mc_a.weight_cache( &ha, &weight ); \
            call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
            for( int k = 0; k < 16; k++ ) \
                if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d:%d\n", i, s, o, d ); \
                    break; \
                } \
            /* omit unlikely high scales for benchmarking */ \
            if( (s << (8-d)) < 512 ) \
            { \
                call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
                call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    int align_cnt = 0;
    for( int s = 0; s <= 127 && ok; s++ )
    {
        for( int o = -128; o <= 127 && ok; o++ )
        {
            if( rand() & 2047 ) continue;
            for( int d = 0; d <= 7 && ok; d++ )
            {
                if( s == 1<<d )
                    continue;
                x264_weight_t weight = { .i_scale = s, .i_denom = d, .i_offset = o };
                MC_TEST_WEIGHT( weight, weight, (align_cnt++ % 4) );
            }
        }
    }
    report( "mc weight :" );

    ok = 1; used_asm = 0;
    for( int o = 0; o <= 127 && ok; o++ )
    {
        int s = 1, d = 0;
        if( rand() & 15 ) continue;
        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
        MC_TEST_WEIGHT( offsetadd, weight, (align_cnt++ % 4) );
    }
    report( "mc offsetadd :" );

    ok = 1; used_asm = 0;
    for( int o = -128; o < 0 && ok; o++ )
    {
        int s = 1, d = 0;
        if( rand() & 15 ) continue;
        x264_weight_t weight = { .i_scale = 1, .i_denom = 0, .i_offset = o };
        MC_TEST_WEIGHT( offsetsub, weight, (align_cnt++ % 4) );
    }
    report( "mc offsetsub :" );

    memset( pbuf3, 0, 64*16 );
    memset( pbuf4, 0, 64*16 );
    ok = 1; used_asm = 0;
    for( int height = 8; height <= 16; height += 8 )
    {
        if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma )
        {
            set_func_name( "store_interleave_chroma" );
            used_asm = 1;
            call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
            call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
            if( memcmp( pbuf3, pbuf4, 64*height ) )
            {
                ok = 0;
                fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height );
                break;
            }
        }
        if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc )
        {
            set_func_name( "load_deinterleave_chroma_fenc" );
            used_asm = 1;
            call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, (intptr_t)64, height );
            call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, (intptr_t)64, height );
            if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
            {
                ok = 0;
                fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height );
                break;
            }
        }
        if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec )
        {
            set_func_name( "load_deinterleave_chroma_fdec" );
            used_asm = 1;
            call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, (intptr_t)64, height );
            call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, (intptr_t)64, height );
            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
            {
                ok = 0;
                fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height );
                break;
            }
        }
    }
    report( "store_interleave :" );
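
    /* plane_copy and friends are exercised over a table of deliberately
     * awkward geometries: tiny planes, widths that straddle SIMD register
     * sizes, negative and zero source strides, and strides equal to the
     * width. */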
    struct plane_spec
    {
        int w, h, src_stride;
    } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} };
    ok = 1; used_asm = 0;
    if( mc_a.plane_copy != mc_ref.plane_copy )
    {
        set_func_name( "plane_copy" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = plane_specs[i].w;
            int h = plane_specs[i].h;
            intptr_t src_stride = plane_specs[i].src_stride;
            intptr_t dst_stride = (w + 127) & ~63;
            assert( dst_stride * h <= 0x1000 );
            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
            call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h );
            call_a( mc_a.plane_copy, pbuf4, dst_stride, src1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                    break;
                }
        }
    }
    if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap )
    {
        set_func_name( "plane_copy_swap" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            intptr_t src_stride = plane_specs[i].src_stride;
            intptr_t dst_stride = (2*w + 127) & ~63;
            assert( dst_stride * h <= 0x1000 );
            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
            call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h );
            call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                    break;
                }
        }
    }
    if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave )
    {
        set_func_name( "plane_copy_interleave" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
            intptr_t dst_stride = (2*w + 127) & ~63;
            assert( dst_stride * h <= 0x1000 );
            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
            memset( pbuf3, 0, 0x1000*sizeof(pixel) );
            memset( pbuf4, 0, 0x1000*sizeof(pixel) );
            call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
            call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                    break;
                }
        }
    }
    if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave )
    {
        set_func_name( "plane_copy_deinterleave" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            intptr_t dst_stride = w;
            intptr_t src_stride = (2*w + 127) & ~63;
            intptr_t offv = (dst_stride*h + 63) & ~31;
            memset( pbuf3, 0, 0x1000 );
            memset( pbuf4, 0, 0x1000 );
            call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
            call_a( mc_a.plane_copy_deinterleave, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w ) ||
                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                    break;
                }
        }
    }
    if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv )
    {
        set_func_name( "plane_copy_deinterleave_yuyv" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            intptr_t dst_stride = ALIGN( w, 32/sizeof(pixel) );
            intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
            intptr_t offv = dst_stride*h;
            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
            memset( pbuf3, 0, 0x1000 );
            memset( pbuf4, 0, 0x1000 );
            /* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */
            call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h );
            call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w*sizeof(pixel) ) ||
                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                    break;
                }
        }
    }
    if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb )
    {
        set_func_name( "plane_copy_deinterleave_rgb" );
        used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 2) >> 2;
            int h = plane_specs[i].h;
            intptr_t src_stride = plane_specs[i].src_stride;
            intptr_t dst_stride = ALIGN( w, 16 );
            intptr_t offv = dst_stride*h + 16;
            for( int pw = 3; pw <= 4; pw++ )
            {
                memset( pbuf3, 0, 0x1000 );
                memset( pbuf4, 0, 0x1000 );
                call_c( mc_c.plane_copy_deinterleave_rgb, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf3+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
                call_a( mc_a.plane_copy_deinterleave_rgb, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf4+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
                for( int y = 0; y < h; y++ )
                    if( memcmp( pbuf3+y*dst_stride+0*offv, pbuf4+y*dst_stride+0*offv, w ) ||
                        memcmp( pbuf3+y*dst_stride+1*offv, pbuf4+y*dst_stride+1*offv, w ) ||
                        memcmp( pbuf3+y*dst_stride+2*offv, pbuf4+y*dst_stride+2*offv, w ) )
                    {
                        ok = 0;
                        fprintf( stderr, "plane_copy_deinterleave_rgb FAILED: w=%d h=%d stride=%d pw=%d\n", w, h, (int)src_stride, pw );
                        break;
                    }
            }
        }
    }
    report( "plane_copy :" );
    if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
    {
        set_func_name( "plane_copy_deinterleave_v210" );
        ok = 1; used_asm = 1;
        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
        {
            int w = (plane_specs[i].w + 1) >> 1;
            int h = plane_specs[i].h;
            intptr_t dst_stride = ALIGN( w, 32 );
            intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
            intptr_t offv = dst_stride*h + 32;
            memset( pbuf3, 0, 0x1000 );
            memset( pbuf4, 0, 0x1000 );
            call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
            call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
            for( int y = 0; y < h; y++ )
                if( memcmp( pbuf3+y*dst_stride,      pbuf4+y*dst_stride,      w*sizeof(uint16_t) ) ||
                    memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
                {
                    ok = 0;
                    fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                    break;
                }
        }
        report( "v210 :" );
    }
    if( mc_a.hpel_filter != mc_ref.hpel_filter )
    {
        pixel *srchpel = pbuf1+8+2*64;
        pixel *dstc[3] = { pbuf3+8, pbuf3+8+16*64, pbuf3+8+32*64 };
        pixel *dsta[3] = { pbuf4+8, pbuf4+8+16*64, pbuf4+8+32*64 };
        void *tmp = pbuf3+49*64;
        set_func_name( "hpel_filter" );
        ok = 1; used_asm = 1;
        memset( pbuf3, 0, 4096 * sizeof(pixel) );
        memset( pbuf4, 0, 4096 * sizeof(pixel) );
        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp );
        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp );
        for( int i = 0; i < 3; i++ )
            for( int j = 0; j < 10; j++ )
                //FIXME ideally the first pixels would match too, but they aren't actually used
                if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 * sizeof(pixel) ) )
                {
                    ok = 0;
                    fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
                    for( int k = 0; k < 48; k++ )
                        printf( "%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " " );
                    printf( "\n" );
                    for( int k = 0; k < 48; k++ )
                        printf( "%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " " );
                    printf( "\n" );
                    break;
                }
        report( "hpel filter :" );
    }
    if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
    {
        pixel *dstc[4] = { pbuf3, pbuf3+1024, pbuf3+2048, pbuf3+3072 };
        pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 };
        set_func_name( "lowres_init" );
        ok = 1; used_asm = 1;
        for( int w = 96; w <= 96+24; w += 8 )
        {
            intptr_t stride = (w*2+31)&~31;
            intptr_t stride_lowres = (w+31)&~31;
            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], stride, stride_lowres, w, 8 );
            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], stride, stride_lowres, w, 8 );
            for( int i = 0; i < 8; i++ )
            {
                for( int j = 0; j < 4; j++ )
                    if( memcmp( dstc[j]+i*stride_lowres, dsta[j]+i*stride_lowres, w * sizeof(pixel) ) )
                    {
                        ok = 0;
                        fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
                        for( int k = 0; k < w; k++ )
                            printf( "%d ", dstc[j][k+i*stride_lowres] );
                        printf( "\n" );
                        for( int k = 0; k < w; k++ )
                            printf( "%d ", dsta[j][k+i*stride_lowres] );
                        printf( "\n" );
                        break;
                    }
            }
        }
        report( "lowres init :" );
    }
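
/* Integral-image initialization: each INTEGRAL_INIT invocation checks one of
 * the row/column summation helpers (used by the exhaustive esa/tesa motion
 * estimation modes) against its C reference. */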
#define INTEGRAL_INIT( name, size, offset, cmp_len, ... )\
    if( mc_a.name != mc_ref.name )\
    {\
        intptr_t stride = 96;\
        set_func_name( #name );\
        used_asm = 1;\
        memcpy( buf3, buf1, size*2*stride );\
        memcpy( buf4, buf1, size*2*stride );\
        uint16_t *sum = (uint16_t*)buf3;\
        call_c1( mc_c.name, sum+offset, __VA_ARGS__ );\
        sum = (uint16_t*)buf4;\
        call_a1( mc_a.name, sum+offset, __VA_ARGS__ );\
        if( memcmp( buf3+2*offset, buf4+2*offset, cmp_len*2 )\
            || (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )) )\
            ok = 0;\
        call_c2( mc_c.name, sum+offset, __VA_ARGS__ );\
        call_a2( mc_a.name, sum+offset, __VA_ARGS__ );\
    }
    ok = 1; used_asm = 0;
    INTEGRAL_INIT( integral_init4h, 2, stride, stride-4, pbuf2, stride );
    INTEGRAL_INIT( integral_init8h, 2, stride, stride-8, pbuf2, stride );
    INTEGRAL_INIT( integral_init4v, 14, 0, stride-8, sum+9*stride, stride );
    INTEGRAL_INIT( integral_init8v, 9, 0, stride-8, stride );
    report( "integral init :" );

    ok = 1; used_asm = 0;
    if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
    {
        used_asm = 1;
        x264_emms();
        for( int i = 0; i < 10; i++ )
        {
            float fps_factor = (rand()&65535) / 65535.0f;
            set_func_name( "mbtree_propagate_cost" );
            int16_t *dsta = (int16_t*)buf3;
            int16_t *dstc = dsta+400;
            uint16_t *prop = (uint16_t*)buf1;
            uint16_t *intra = (uint16_t*)buf4;
            uint16_t *inter = intra+128;
            uint16_t *qscale = inter+128;
            uint16_t *rnd = (uint16_t*)buf2;
            x264_emms();
            for( int j = 0; j < 100; j++ )
            {
                intra[j]  = *rnd++ & 0x7fff;
                intra[j] += !intra[j];
                inter[j]  = *rnd++ & 0x7fff;
                qscale[j] = *rnd++ & 0x7fff;
            }
            call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
            call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
            // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
            x264_emms();
            for( int j = 0; j < 100 && ok; j++ )
            {
                ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
                if( !ok )
                    fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] );
            }
        }
    }
    if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
    {
        used_asm = 1;
        for( int i = 0; i < 8; i++ )
        {
            set_func_name( "mbtree_propagate_list" );
            x264_t h;
            int height = 4;
            int width = 128;
            int size = width*height;
            h.mb.i_mb_stride = width;
            h.mb.i_mb_width = width;
            h.mb.i_mb_height = height;

            uint16_t *ref_costsc = (uint16_t*)buf3 + width;
            uint16_t *ref_costsa = (uint16_t*)buf4 + width;
            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size);
            int16_t *propagate_amount = (int16_t*)(mvs + width);
            uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
            h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size);
            int bipred_weight = (rand()%63)+1;
            int mb_y = rand()&3;
            int list = i&1;
            for( int j = -width; j < size+width; j++ )
                ref_costsc[j] = ref_costsa[j] = rand()&32767;
            for( int j = 0; j < width; j++ )
            {
                static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
                for( int k = 0; k < 2; k++ )
                    mvs[j][k] = (rand()&127) - 64;
                propagate_amount[j] = rand()&32767;
                lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
            }

            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );

            for( int j = -width; j < size+width && ok; j++ )
            {
                ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
                if( !ok )
                    fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
            }

            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
        }
    }
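
    /* mbtree_fix8_pack/unpack convert between float and the 8.8 fixed-point
     * representation used for stored mbtree data. The sentinel written just
     * past each buffer (0xAAAA / 0xAAAAAAAA) verifies that the asm does not
     * write beyond 'count' elements. */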
    static const uint16_t mbtree_fix8_counts[] = { 5, 384, 392, 400, 415 };
    if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
    {
        set_func_name( "mbtree_fix8_pack" );
        used_asm = 1;
        float *fix8_src = (float*)(buf3 + 0x800);
        uint16_t *dstc = (uint16_t*)buf3;
        uint16_t *dsta = (uint16_t*)buf4;
        for( int i = 0; i < ARRAY_ELEMS(mbtree_fix8_counts); i++ )
        {
            int count = mbtree_fix8_counts[i];
            for( int j = 0; j < count; j++ )
                fix8_src[j] = (int16_t)(rand()) / 256.0f;
            dsta[count] = 0xAAAA;
            call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count );
            call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count );
            if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA )
            {
                ok = 0;
                fprintf( stderr, "mbtree_fix8_pack FAILED\n" );
                break;
            }
        }
    }
    if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack )
    {
        set_func_name( "mbtree_fix8_unpack" );
        used_asm = 1;
        uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
        float *dstc = (float*)buf3;
        float *dsta = (float*)buf4;
        for( int i = 0; i < ARRAY_ELEMS(mbtree_fix8_counts); i++ )
        {
            int count = mbtree_fix8_counts[i];
            for( int j = 0; j < count; j++ )
                fix8_src[j] = rand();
            M32( &dsta[count] ) = 0xAAAAAAAA;
            call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count );
            call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count );
            if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA )
            {
                ok = 0;
                fprintf( stderr, "mbtree_fix8_unpack FAILED\n" );
                break;
            }
        }
    }
    report( "mbtree :" );

    if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
    {
        set_func_name( "memcpy_aligned" );
        ok = 1; used_asm = 1;
        for( size_t size = 16; size < 512; size += 16 )
        {
            for( int i = 0; i < size; i++ )
                buf1[i] = rand();
            memset( buf4-1, 0xAA, size + 2 );
            call_c( mc_c.memcpy_aligned, buf3, buf1, size );
            call_a( mc_a.memcpy_aligned, buf4, buf1, size );
            if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
            {
                ok = 0;
                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
                break;
            }
        }
        report( "memcpy aligned :" );
    }

    if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
    {
        set_func_name( "memzero_aligned" );
        ok = 1; used_asm = 1;
        for( size_t size = 128; size < 1024; size += 128 )
        {
            memset( buf4-1, 0xAA, size + 2 );
            call_c( mc_c.memzero_aligned, buf3, size );
            call_a( mc_a.memzero_aligned, buf4, size );
            if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
            {
                ok = 0;
                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
                break;
            }
        }
        report( "memzero aligned :" );
    }

    return ret;
}
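
/* Deblocking: the loop filter functions are checked over a table of
 * (alpha, beta, tc) triples that approximates the real H.264 strength
 * tables, on both low-amplitude and full-range random input. */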
static int check_deblock( int cpu_ref, int cpu_new )
{
    x264_deblock_function_t db_c;
    x264_deblock_function_t db_ref;
    x264_deblock_function_t db_a;
    int ret = 0, ok = 1, used_asm = 0;
    int alphas[36], betas[36];
    int8_t tcs[36][4];

    x264_deblock_init( 0, &db_c, 0 );
    x264_deblock_init( cpu_ref, &db_ref, 0 );
    x264_deblock_init( cpu_new, &db_a, 0 );

    /* not exactly the real values of a,b,tc but close enough */
    for( int i = 35, a = 255, c = 250; i >= 0; i-- )
    {
        alphas[i] = a << (BIT_DEPTH-8);
        betas[i] = (i+1)/2 << (BIT_DEPTH-8);
        tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8);
        tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8);
        tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8);
        a = a*9/10;
        c = c*9/10;
    }

#define TEST_DEBLOCK( name, align, ... ) \
    for( int i = 0; i < 36; i++ ) \
    { \
        intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
        for( int j = 0; j < 1024; j++ ) \
            /* two random distributions to exercise different failure modes */ \
            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
        memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
        if( db_a.name != db_ref.name ) \
        { \
            set_func_name( #name ); \
            used_asm = 1; \
            call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
                break; \
            } \
            call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
        } \
    }

    TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
    TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
    TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
    TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
    TEST_DEBLOCK( deblock_chroma_420_mbaff, 0, tcs[i] );
    TEST_DEBLOCK( deblock_chroma_422_mbaff, 0, tcs[i] );
    TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
    TEST_DEBLOCK( deblock_luma_intra[0], 0 );
    TEST_DEBLOCK( deblock_luma_intra[1], 1 );
    TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
    TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
    TEST_DEBLOCK( deblock_chroma_420_intra_mbaff, 0 );
    TEST_DEBLOCK( deblock_chroma_422_intra_mbaff, 0 );
    TEST_DEBLOCK( deblock_chroma_intra[1], 1 );

    if( db_a.deblock_strength != db_ref.deblock_strength )
    {
        set_func_name( "deblock_strength" );
        used_asm = 1;
        for( int i = 0; i < 100; i++ )
        {
            ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] );
            uint8_t *nnz = &nnz_buf[8];
            ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
            ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
            ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
            memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
            for( int j = 0; j < X264_SCAN8_SIZE; j++ )
                nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
            for( int j = 0; j < 2; j++ )
                for( int k = 0; k < X264_SCAN8_LUMA_SIZE; k++ )
                {
                    ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
                    for( int l = 0; l < 2; l++ )
                        mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&16383) - 8192;
                }
            call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
            call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
            if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
            {
                ok = 0;
                fprintf( stderr, "deblock_strength: [FAILED]\n" );
                for( int j = 0; j < 2; j++ )
                {
                    for( int k = 0; k < 2; k++ )
                        for( int l = 0; l < 4; l++ )
                        {
                            for( int m = 0; m < 4; m++ )
                                printf( "%d ", bs[j][k][l][m] );
                            printf( "\n" );
                        }
                    printf( "\n" );
                }
                break;
            }
        }
    }

    report( "deblock :" );

    return ret;
}
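
/* Quantization: quant/dequant/decimate/coeff_last are validated against four
 * scaling-matrix configurations (flat, JVT, random custom, all-ones custom)
 * across the full legal QP range. */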
static int check_quant( int cpu_ref, int cpu_new )
{
    x264_quant_function_t qf_c;
    x264_quant_function_t qf_ref;
    x264_quant_function_t qf_a;
    ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
    ALIGNED_ARRAY_64( dctcoef, dct2,[64] );
    ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
    ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
    ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
    int ret = 0, ok, used_asm;
    int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
    x264_t h_buf;
    x264_t *h = &h_buf;
    memset( h, 0, sizeof(*h) );
    h->sps->i_chroma_format_idc = 1;
    x264_param_default( &h->param );
    h->chroma_qp_table = i_chroma_qp_table + 12;
    h->param.analyse.b_transform_8x8 = 1;

    for( int i_cqm = 0; i_cqm < 4; i_cqm++ )
    {
        if( i_cqm == 0 )
        {
            for( int i = 0; i < 6; i++ )
                h->sps->scaling_list[i] = x264_cqm_flat16;
            h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_FLAT;
        }
        else if( i_cqm == 1 )
        {
            for( int i = 0; i < 6; i++ )
                h->sps->scaling_list[i] = x264_cqm_jvt[i];
            h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_JVT;
        }
        else
        {
            int max_scale = BIT_DEPTH < 10 ? 255 : 228;
            if( i_cqm == 2 )
                for( int i = 0; i < 64; i++ )
                    cqm_buf[i] = 10 + rand() % (max_scale - 9);
            else
                for( int i = 0; i < 64; i++ )
                    cqm_buf[i] = 1;
            for( int i = 0; i < 6; i++ )
                h->sps->scaling_list[i] = cqm_buf;
            h->param.i_cqm_preset = h->sps->i_cqm_preset = X264_CQM_CUSTOM;
        }

        h->param.rc.i_qp_min = 0;
        h->param.rc.i_qp_max = QP_MAX_SPEC;
        x264_cqm_init( h );
        x264_quant_init( h, 0, &qf_c );
        x264_quant_init( h, cpu_ref, &qf_ref );
        x264_quant_init( h, cpu_new, &qf_a );
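
        /* INIT_QUANT* fill dct1/dct2 with coefficients bounded by (roughly)
         * the largest magnitude each position can take after the forward
         * transform; the scale1d tables appear to encode per-position 1D
         * transform gains. */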
#define INIT_QUANT8(j,max) \
        { \
            static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
            for( int i = 0; i < max; i++ ) \
            { \
                unsigned int scale = (255*scale1d[(i>>3)&7]*scale1d[i&7])/16; \
                dct1[i] = dct2[i] = (j>>(i>>6))&1 ? (rand()%(2*scale+1))-scale : 0; \
            } \
        }
#define INIT_QUANT4(j,max) \
        { \
            static const int scale1d[4] = {4,6,4,6}; \
            for( int i = 0; i < max; i++ ) \
            { \
                unsigned int scale = 255*scale1d[(i>>2)&3]*scale1d[i&3]; \
                dct1[i] = dct2[i] = (j>>(i>>4))&1 ? (rand()%(2*scale+1))-scale : 0; \
            } \
        }

#define TEST_QUANT_DC( name, cqm ) \
        if( qf_a.name != qf_ref.name ) \
        { \
            set_func_name( #name ); \
            used_asms[0] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                for( int j = 0; j < 2; j++ ) \
                { \
                    int result_c, result_a; \
                    for( int i = 0; i < 16; i++ ) \
                        dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
                    result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                    result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                    if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \
                    { \
                        oks[0] = 0; \
                        fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
                        break; \
                    } \
                    call_c2( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                    call_a2( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                } \
            } \
        }

#define TEST_QUANT( qname, block, type, w, maxj ) \
        if( qf_a.qname != qf_ref.qname ) \
        { \
            set_func_name( #qname ); \
            used_asms[0] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                for( int j = 0; j < maxj; j++ ) \
                { \
                    INIT_QUANT##type(j, w*w) \
                    int result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
                    int result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
                    if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
                    { \
                        oks[0] = 0; \
                        fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                        break; \
                    } \
                    call_c2( qf_c.qname, (void*)dct1, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
                    call_a2( qf_a.qname, (void*)dct2, h->quant##type##_mf[block][qp], h->quant##type##_bias[block][qp] ); \
                } \
            } \
        }

        TEST_QUANT( quant_8x8, CQM_8IY, 8, 8, 2 );
        TEST_QUANT( quant_8x8, CQM_8PY, 8, 8, 2 );
        TEST_QUANT( quant_4x4, CQM_4IY, 4, 4, 2 );
        TEST_QUANT( quant_4x4, CQM_4PY, 4, 4, 2 );
        TEST_QUANT( quant_4x4x4, CQM_4IY, 4, 8, 16 );
        TEST_QUANT( quant_4x4x4, CQM_4PY, 4, 8, 16 );
        TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
        TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );

#define TEST_DEQUANT( qname, dqname, block, w ) \
        if( qf_a.dqname != qf_ref.dqname ) \
        { \
            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
            used_asms[1] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                INIT_QUANT##w(1, w*w) \
                qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                { \
                    oks[1] = 0; \
                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                    break; \
                } \
                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
            } \
        }

        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );

#define TEST_DEQUANT_DC( qname, dqname, block, w ) \
        if( qf_a.dqname != qf_ref.dqname ) \
        { \
            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
            used_asms[1] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                for( int i = 0; i < 16; i++ ) \
                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
                qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                { \
                    oks[1] = 0; \
                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
                } \
                call_c2( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                call_a2( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
            } \
        }

        TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );

        if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc )
        {
            set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
            used_asms[1] = 1;
            for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- )
            {
                for( int i = 0; i < 8; i++ )
                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 );
                call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 );
                for( int i = 0; i < 8; i++ )
                    if( dct3[i][0] != dct4[i][0] )
                    {
                        oks[1] = 0;
                        fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
                        break;
                    }
            }
        }

        if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly )
        {
            set_func_name( "idct_dequant_2x4_dconly_%s", i_cqm?"cqm":"flat" );
            used_asms[1] = 1;
            for( int qp = h->chroma_qp_table[h->param.rc.i_qp_max]; qp >= h->chroma_qp_table[h->param.rc.i_qp_min]; qp-- )
            {
                for( int i = 0; i < 8; i++ )
                    dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
                qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
                memcpy( dct2, dct1, 8*sizeof(dctcoef) );
                call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
                call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
                if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) )
                {
                    oks[1] = 0;
                    fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
                    break;
                }
                call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
                call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
            }
        }

#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \
        if( qf_a.optname != qf_ref.optname ) \
        { \
            set_func_name( #optname ); \
            used_asms[2] = 1; \
            for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
            { \
                int qpdc = qp + (size == 8 ? 3 : 0); \
                int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \
                if( dmf > 32*64 ) \
                    continue; \
                for( int i = 16;; i <<= 1 ) \
                { \
                    int res_c, res_asm; \
                    int max = X264_MIN( i, PIXEL_MAX*16 ); \
                    for( int j = 0; j < size; j++ ) \
                        dct1[j] = rand()%(max*2+1) - max; \
                    for( int j = 0; j <= size; j += 4 ) \
                        qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
                    memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
                    res_c = call_c1( qf_c.optname, dct1, dmf ); \
                    res_asm = call_a1( qf_a.optname, dct2, dmf ); \
                    if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \
                    { \
                        oks[2] = 0; \
                        fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
                    } \
                    call_c2( qf_c.optname, dct1, dmf ); \
                    call_a2( qf_a.optname, dct2, dmf ); \
                    if( i >= PIXEL_MAX*16 ) \
                        break; \
                } \
            } \
        }

        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 );
        TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 );

        x264_cqm_delete( h );
    }

    ok = oks[0]; used_asm = used_asms[0];
    report( "quant :" );

    ok = oks[1]; used_asm = used_asms[1];
    report( "dequant :" );

    ok = oks[2]; used_asm = used_asms[2];
    report( "optimize chroma dc :" );

    ok = 1; used_asm = 0;
    if( qf_a.denoise_dct != qf_ref.denoise_dct )
    {
        used_asm = 1;
        for( int size = 16; size <= 64; size += 48 )
        {
            set_func_name( "denoise_dct" );
            memcpy( dct1, buf1, size*sizeof(dctcoef) );
            memcpy( dct2, buf1, size*sizeof(dctcoef) );
            memcpy( buf3+256, buf3, 256 );
            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
            call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
            if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                ok = 0;
            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
            call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
        }
    }
    report( "denoise dct :" );
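
/* decimate_score returns a cost that callers only ever compare against a
 * fixed threshold, so the C and asm results need only agree after clamping
 * at that threshold. */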
#define TEST_DECIMATE( decname, w, ac, thresh ) \
    if( qf_a.decname != qf_ref.decname ) \
    { \
        set_func_name( #decname ); \
        used_asm = 1; \
        for( int i = 0; i < 100; i++ ) \
        { \
            static const int distrib[16] = {1,1,1,1,1,1,1,1,1,1,1,1,2,3,4};\
            static const int zerorate_lut[4] = {3,7,15,31};\
            int zero_rate = zerorate_lut[i&3];\
            for( int idx = 0; idx < w*w; idx++ ) \
            { \
                int sign = (rand()&1) ? -1 : 1; \
                int abs_level = distrib[rand()&15]; \
                if( abs_level == 4 ) abs_level = rand()&0x3fff; \
                int zero = !(rand()&zero_rate); \
                dct1[idx] = zero * abs_level * sign; \
            } \
            if( ac ) \
                dct1[0] = 0; \
            int result_c = call_c( qf_c.decname, dct1 ); \
            int result_a = call_a( qf_a.decname, dct1 ); \
            if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
            { \
                ok = 0; \
                fprintf( stderr, #decname ": [FAILED]\n" ); \
                break; \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_DECIMATE( decimate_score64, 8, 0, 6 );
    TEST_DECIMATE( decimate_score16, 4, 0, 6 );
    TEST_DECIMATE( decimate_score15, 4, 1, 7 );
    report( "decimate_score :" );

#define TEST_LAST( last, lastname, size, ac ) \
    if( qf_a.last != qf_ref.last ) \
    { \
        set_func_name( #lastname ); \
        used_asm = 1; \
        for( int i = 0; i < 100; i++ ) \
        { \
            int nnz = 0; \
            int max = rand() & (size-1); \
            memset( dct1, 0, size*sizeof(dctcoef) ); \
            for( int idx = ac; idx < max; idx++ ) \
                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
            if( !nnz ) \
                dct1[ac] = 1; \
            int result_c = call_c( qf_c.last, dct1+ac ); \
            int result_a = call_a( qf_a.last, dct1+ac ); \
            if( result_c != result_a ) \
            { \
                ok = 0; \
                fprintf( stderr, #lastname ": [FAILED]\n" ); \
                break; \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_LAST( coeff_last4 , coeff_last4, 4, 0 );
    TEST_LAST( coeff_last8 , coeff_last8, 8, 0 );
    TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 16, 1 );
    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
    report( "coeff_last :" );
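
/* coeff_level_run combines coeff_last with run-level extraction; besides the
 * return value, the last/mask fields and the extracted levels must all match. */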
#define TEST_LEVELRUN( lastname, name, size, ac ) \
    if( qf_a.lastname != qf_ref.lastname ) \
    { \
        set_func_name( #name ); \
        used_asm = 1; \
        for( int i = 0; i < 100; i++ ) \
        { \
            x264_run_level_t runlevel_c, runlevel_a; \
            int nnz = 0; \
            int max = rand() & (size-1); \
            memset( dct1, 0, size*sizeof(dctcoef) ); \
            memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
            memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
            for( int idx = ac; idx < max; idx++ ) \
                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
            if( !nnz ) \
                dct1[ac] = 1; \
            int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
            int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
            if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
                runlevel_c.mask != runlevel_a.mask || \
                memcmp( runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name ": [FAILED]\n" ); \
                break; \
            } \
        } \
    }

    ok = 1; used_asm = 0;
    TEST_LEVELRUN( coeff_level_run4 , coeff_level_run4, 4, 0 );
    TEST_LEVELRUN( coeff_level_run8 , coeff_level_run8, 8, 0 );
    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 16, 1 );
    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
    report( "coeff_level_run :" );

    return ret;
}
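
/* Intra prediction: each predictor writes into a copy of the same reference
 * fdec block; on mismatch the test dumps both the asm and C outputs (plus
 * the edge array for the 8x8 modes) for inspection. */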
static int check_intra( int cpu_ref, int cpu_new )
{
    int ret = 0, ok = 1, used_asm = 0;
    ALIGNED_ARRAY_32( pixel, edge,[36] );
    ALIGNED_ARRAY_32( pixel, edge2,[36] );
    ALIGNED_ARRAY_32( pixel, fdec,[FDEC_STRIDE*20] );
    struct
    {
        x264_predict_t predict_16x16[4+3];
        x264_predict_t predict_8x8c[4+3];
        x264_predict_t predict_8x16c[4+3];
        x264_predict8x8_t predict_8x8[9+3];
        x264_predict_t predict_4x4[9+3];
        x264_predict_8x8_filter_t predict_8x8_filter;
    } ip_c, ip_ref, ip_a;

    x264_predict_16x16_init( 0, ip_c.predict_16x16 );
    x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
    x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
    x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
    x264_predict_4x4_init( 0, ip_c.predict_4x4 );

    x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
    x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
    x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
    x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
    x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );

    x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
    x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
    x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
    x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
    x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );

    memcpy( fdec, pbuf1, 32*20 * sizeof(pixel) );

    ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
    if( ip_a.name[dir] != ip_ref.name[dir] )\
    {\
        set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
        used_asm = 1;\
        memcpy( pbuf3, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
        memcpy( pbuf4, fdec, FDEC_STRIDE*20 * sizeof(pixel) );\
        for( int a = 0; a < (do_bench ? 64/sizeof(pixel) : 1); a += align )\
        {\
            call_c##bench( ip_c.name[dir], pbuf3+48+a, ##__VA_ARGS__ );\
            call_a##bench( ip_a.name[dir], pbuf4+48+a, ##__VA_ARGS__ );\
            if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*20 * sizeof(pixel) ) )\
            {\
                fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\
                ok = 0;\
                if( ip_c.name == (void *)ip_c.predict_8x8 )\
                {\
                    for( int k = -1; k < 16; k++ )\
                        printf( "%2x ", edge[16+k] );\
                    printf( "\n" );\
                }\
                for( int j = 0; j < h; j++ )\
                {\
                    if( ip_c.name == (void *)ip_c.predict_8x8 )\
                        printf( "%2x ", edge[14-j] );\
                    for( int k = 0; k < w; k++ )\
                        printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\
                    printf( "\n" );\
                }\
                printf( "\n" );\
                for( int j = 0; j < h; j++ )\
                {\
                    if( ip_c.name == (void *)ip_c.predict_8x8 )\
                        printf( " " );\
                    for( int k = 0; k < w; k++ )\
                        printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\
                    printf( "\n" );\
                }\
                break;\
            }\
        }\
    }

    for( int i = 0; i < 12; i++ )
        INTRA_TEST( predict_4x4, i, 4, 4, 4, );
    for( int i = 0; i < 7; i++ )
        INTRA_TEST( predict_8x8c, i, 8, 8, 16, );
    for( int i = 0; i < 7; i++ )
        INTRA_TEST( predict_8x16c, i, 8, 16, 16, );
    for( int i = 0; i < 7; i++ )
        INTRA_TEST( predict_16x16, i, 16, 16, 16, );
    for( int i = 0; i < 12; i++ )
        INTRA_TEST( predict_8x8, i, 8, 8, 8, , edge );

    set_func_name( "intra_predict_8x8_filter" );
    if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
    {
        used_asm = 1;
        for( int i = 0; i < 32; i++ )
        {
            if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) )
                continue;
            int neighbor = (i&24)>>1;
            memset( edge,  0, 36*sizeof(pixel) );
            memset( edge2, 0, 36*sizeof(pixel) );
            call_c( ip_c.predict_8x8_filter, pbuf1+48, edge,  neighbor, i&7 );
            call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 );
            if( !(neighbor&MB_TOPLEFT) )
                edge[15] = edge2[15] = 0;
            if( memcmp( edge+7, edge2+7, (i&MB_TOPRIGHT ? 26 : i&MB_TOP ? 17 : 8) * sizeof(pixel) ) )
            {
                fprintf( stderr, "predict_8x8_filter : [FAILED] %d %d\n", (i&24)>>1, i&7 );
                ok = 0;
            }
        }
    }

#define EXTREMAL_PLANE( w, h ) \
    { \
        int max[7]; \
        for( int j = 0; j < 7; j++ ) \
            max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
        fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
        for( int j = 0; j < w/2; j++ ) \
            fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
        for( int j = w/2; j < w-1; j++ ) \
            fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
        fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
        for( int j = 0; j < h/2; j++ ) \
            fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
        for( int j = h/2; j < h-1; j++ ) \
            fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
        fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
    }
    /* Extremal test case for planar prediction. */
    for( int test = 0; test < 100 && ok; test++ )
        for( int i = 0; i < 128 && ok; i++ )
        {
            EXTREMAL_PLANE( 8, 8 );
            INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 8, 64, 1 );
            EXTREMAL_PLANE( 8, 16 );
            INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P, 8, 16, 64, 1 );
            EXTREMAL_PLANE( 16, 16 );
            INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 16, 64, 1 );
        }
    report( "intra pred :" );
    return ret;
}
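
/* CABAC: DECL_CABAC stamps out driver loops that feed the same fixed
 * pseudo-random input through the decision/bypass/terminal encoders, so the
 * C and asm entropy coder cores can be compared as whole output buffers. */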
#define DECL_CABAC(cpu) \
static void run_cabac_decision_##cpu( x264_t *h, uint8_t *dst )\
{\
    x264_cabac_t cb;\
    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
    for( int i = 0; i < 0x1000; i++ )\
        x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
}\
static void run_cabac_bypass_##cpu( x264_t *h, uint8_t *dst )\
{\
    x264_cabac_t cb;\
    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
    for( int i = 0; i < 0x1000; i++ )\
        x264_cabac_encode_bypass_##cpu( &cb, buf1[i]&1 );\
}\
static void run_cabac_terminal_##cpu( x264_t *h, uint8_t *dst )\
{\
    x264_cabac_t cb;\
    x264_cabac_context_init( h, &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
    for( int i = 0; i < 0x1000; i++ )\
        x264_cabac_encode_terminal_##cpu( &cb );\
}
DECL_CABAC(c)
#if HAVE_MMX
DECL_CABAC(asm)
#elif defined(ARCH_AARCH64)
DECL_CABAC(asm)
#else
#define run_cabac_decision_asm run_cabac_decision_c
#define run_cabac_bypass_asm run_cabac_bypass_c
#define run_cabac_terminal_asm run_cabac_terminal_c
#endif

extern const uint8_t x264_count_cat_m1[14];
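
/* The cabac_block_residual tests compare the complete encoder state
 * (context states and bit count) after coding randomly generated residual
 * blocks for every ctx_block_cat, in both progressive and interlaced modes. */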
  2441. static int check_cabac( int cpu_ref, int cpu_new )
  2442. {
  2443. int ret = 0, ok = 1, used_asm = 0;
  2444. x264_t h;
  2445. h.sps->i_chroma_format_idc = 3;
  2446. x264_bitstream_function_t bs_ref;
  2447. x264_bitstream_function_t bs_a;
  2448. x264_bitstream_init( cpu_ref, &bs_ref );
  2449. x264_bitstream_init( cpu_new, &bs_a );
  2450. x264_quant_init( &h, cpu_new, &h.quantf );
  2451. h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
  2452. /* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
  2453. #define GET_CB( i ) (\
  2454. x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
  2455. cb[i].f8_bits_encoded = 0, &cb[i] )
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
    if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
    {\
        used_asm = 1;\
        set_func_name( #name );\
        for( int i = 0; i < 2; i++ )\
        {\
            for( intptr_t ctx_block_cat = start; ctx_block_cat <= end; ctx_block_cat++ )\
            {\
                for( int j = 0; j < 256; j++ )\
                {\
                    ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
                    uint8_t bitstream[2][1<<16];\
                    static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
                    int ac = ctx_ac[ctx_block_cat];\
                    int nz = 0;\
                    while( !nz )\
                    {\
                        for( int k = 0; k <= x264_count_cat_m1[ctx_block_cat]; k++ )\
                        {\
                            /* Very rough distribution that covers possible inputs */\
                            int rnd = rand();\
                            int coef = !(rnd&3);\
                            coef += !(rnd&  15) * (rand()&0x0006);\
                            coef += !(rnd&  63) * (rand()&0x0008);\
                            coef += !(rnd& 255) * (rand()&0x00F0);\
                            coef += !(rnd&1023) * (rand()&0x7F00);\
                            nz |= dct[0][ac+k] = dct[1][ac+k] = coef * ((rand()&1) ? 1 : -1);\
                        }\
                    }\
                    h.mb.b_interlaced = i;\
                    x264_cabac_t cb[2];\
                    x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
                    x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
                    if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
                    call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
                    call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
                    ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
                    if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
                    if( !ok )\
                    {\
                        fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\
                        if( rd && cb[0].f8_bits_encoded != cb[1].f8_bits_encoded )\
                            fprintf( stderr, " (%d != %d)", cb[0].f8_bits_encoded, cb[1].f8_bits_encoded );\
                        fprintf( stderr, "\n" );\
                        goto name##fail;\
                    }\
                    if( (j&15) == 0 )\
                    {\
                        call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
                        call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
                    }\
                }\
            }\
        }\
    }\
}\
name##fail:
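
    /* The bitstream-writing coder handles every block category up to 8x8 in
     * one function, while the RD (bit-counting) coders are split into a
     * generic path and a dedicated 8x8 path, hence the separate invocations. */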
    CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 )
    report( "cabac residual:" );

    ok = 1; used_asm = 0;
    CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 )
    CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 )
    report( "cabac residual rd:" );

    if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
        return ret;

    ok = 1; used_asm = 0;
    x264_cabac_init( &h );

    set_func_name( "cabac_encode_decision" );
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_decision_c, &h, buf3 );
    call_a( run_cabac_decision_asm, &h, buf4 );
    ok = !memcmp( buf3, buf4, 0x1000 );
    report( "cabac decision:" );

    set_func_name( "cabac_encode_bypass" );
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_bypass_c, &h, buf3 );
    call_a( run_cabac_bypass_asm, &h, buf4 );
    ok = !memcmp( buf3, buf4, 0x1000 );
    report( "cabac bypass:" );

    set_func_name( "cabac_encode_terminal" );
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_terminal_c, &h, buf3 );
    call_a( run_cabac_terminal_asm, &h, buf4 );
    ok = !memcmp( buf3, buf4, 0x1000 );
    report( "cabac terminal:" );

    return ret;
}
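
/* nal_escape performs H.264 emulation prevention: whenever two consecutive
 * zero bytes are followed by a byte <= 0x03, a 0x03 byte is inserted (e.g.
 * the payload 00 00 01 becomes 00 00 03 01) so the output cannot emulate a
 * start code.  The test feeds it corner-case sizes plus inputs with eight
 * different densities of zero bytes, since zero runs are what trigger the
 * escaping. */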
static int check_bitstream( int cpu_ref, int cpu_new )
{
    x264_bitstream_function_t bs_c;
    x264_bitstream_function_t bs_ref;
    x264_bitstream_function_t bs_a;

    int ret = 0, ok = 1, used_asm = 0;

    x264_bitstream_init( 0, &bs_c );
    x264_bitstream_init( cpu_ref, &bs_ref );
    x264_bitstream_init( cpu_new, &bs_a );
    if( bs_a.nal_escape != bs_ref.nal_escape )
    {
        int size = 0x4000;
        uint8_t *input = malloc(size+100);
        uint8_t *output1 = malloc(size*2);
        uint8_t *output2 = malloc(size*2);
        used_asm = 1;
        set_func_name( "nal_escape" );
        for( int i = 0; i < 100; i++ )
        {
            /* Test corner-case sizes */
            int test_size = i < 10 ? i+1 : rand() & 0x3fff;
            /* Test 8 different probability distributions of zeros */
            for( int j = 0; j < test_size+32; j++ )
                input[j] = (rand()&((1 << ((i&7)+1)) - 1)) * rand();
            uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
            uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
            int size_c = end_c-output1;
            int size_a = end_a-output2;
            if( size_c != size_a || memcmp( output1, output2, size_c ) )
            {
                fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
                ok = 0;
                break;
            }
        }
        for( int j = 0; j < size+32; j++ )
            input[j] = rand();
        call_c2( bs_c.nal_escape, output1, input, input+size );
        call_a2( bs_a.nal_escape, output2, input, input+size );
        free(input);
        free(output1);
        free(output2);
    }
    report( "nal escape:" );

    return ret;
}
static int check_all_funcs( int cpu_ref, int cpu_new )
{
    return check_pixel( cpu_ref, cpu_new )
         + check_dct( cpu_ref, cpu_new )
         + check_mc( cpu_ref, cpu_new )
         + check_intra( cpu_ref, cpu_new )
         + check_deblock( cpu_ref, cpu_new )
         + check_quant( cpu_ref, cpu_new )
         + check_cabac( cpu_ref, cpu_new )
         + check_bitstream( cpu_ref, cpu_new );
}
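
/* Flags are tested incrementally: the previous cpu_new becomes cpu_ref, so
 * each round only has to validate functions that the newly added flags
 * override (the checks compare the reference and new function pointers and
 * skip the ones that are identical). */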
static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
    *cpu_ref = *cpu_new;
    *cpu_new |= flags;
#if STACK_ALIGNMENT < 16
    *cpu_new |= X264_CPU_STACK_MOD4;
#endif
    if( *cpu_new & X264_CPU_SSE2_IS_FAST )
        *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
    if( !quiet )
        fprintf( stderr, "x264: %s\n", name );
    return check_all_funcs( *cpu_ref, *cpu_new );
}
static int check_all_flags( void )
{
    int ret = 0;
    int cpu0 = 0, cpu1 = 0;
    uint32_t cpu_detect = x264_cpu_detect();
#if ARCH_X86 || ARCH_X86_64
    if( cpu_detect & X264_CPU_AVX512 )
        simd_warmup_func = x264_checkasm_warmup_avx512;
    else if( cpu_detect & X264_CPU_AVX )
        simd_warmup_func = x264_checkasm_warmup_avx;
#endif
    simd_warmup();

#if HAVE_MMX
    if( cpu_detect & X264_CPU_MMX2 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMX2, "MMX" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
#if ARCH_X86
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
        cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
    }
    if( cpu_detect & X264_CPU_SSE )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
    if( cpu_detect & X264_CPU_SSE2 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
        cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
    }
    if( cpu_detect & X264_CPU_LZCNT )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" );
        cpu1 &= ~X264_CPU_LZCNT;
    }
    if( cpu_detect & X264_CPU_SSE3 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
    }
    if( cpu_detect & X264_CPU_SSSE3 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
        cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
        cpu1 &= ~X264_CPU_CACHELINE_64;
        cpu1 &= ~X264_CPU_SLOW_ATOM;
        if( cpu_detect & X264_CPU_LZCNT )
        {
            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" );
            cpu1 &= ~X264_CPU_LZCNT;
        }
    }
    if( cpu_detect & X264_CPU_SSE4 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
    if( cpu_detect & X264_CPU_SSE42 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" );
    if( cpu_detect & X264_CPU_AVX )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
    if( cpu_detect & X264_CPU_XOP )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_XOP, "XOP" );
    if( cpu_detect & X264_CPU_FMA4 )
    {
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
        cpu1 &= ~X264_CPU_FMA4;
    }
    if( cpu_detect & X264_CPU_FMA3 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
    if( cpu_detect & X264_CPU_BMI1 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
    if( cpu_detect & X264_CPU_BMI2 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
    if( cpu_detect & X264_CPU_AVX2 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
    if( cpu_detect & X264_CPU_AVX512 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
#elif ARCH_PPC
    if( cpu_detect & X264_CPU_ALTIVEC )
    {
        fprintf( stderr, "x264: ALTIVEC against C\n" );
        ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
    }
#elif ARCH_ARM
    if( cpu_detect & X264_CPU_NEON )
        x264_checkasm_call = x264_checkasm_call_neon;
    if( cpu_detect & X264_CPU_ARMV6 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
    if( cpu_detect & X264_CPU_NEON )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
    if( cpu_detect & X264_CPU_FAST_NEON_MRC )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#elif ARCH_AARCH64
    if( cpu_detect & X264_CPU_ARMV8 )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" );
    if( cpu_detect & X264_CPU_NEON )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
#elif ARCH_MIPS
    if( cpu_detect & X264_CPU_MSA )
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MSA, "MSA" );
#endif
    return ret;
}
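
/* Harness entry point.  "--bench" enables benchmarking; "--bench=<pattern>"
 * additionally restricts it to functions matching the pattern (bench_pattern
 * and bench_pattern_len suggest a prefix comparison).  A remaining argument
 * seeds the RNG so that failures can be reproduced. */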
static int main_internal( int argc, char **argv )
{
#ifdef _WIN32
    /* Disable the Windows Error Reporting dialog */
    SetErrorMode( SEM_NOGPFAULTERRORBOX );
#endif

    if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
    {
#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS
        fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
        return 1;
#endif
        do_bench = 1;
        if( argv[1][7] == '=' )
        {
            bench_pattern = argv[1]+8;
            bench_pattern_len = strlen(bench_pattern);
        }
        argc--;
        argv++;
    }

    unsigned int seed = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
    fprintf( stderr, "x264: using random seed %u\n", seed );
    srand( seed );

    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
    if( !buf1 || !pbuf1 )
    {
        fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
        return -1;
    }
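
    /* buf2-buf4 are carved out of the single buf1 allocation; buf3 and buf4
     * are pixel-sized regions aliased by pbuf3/pbuf4, so the same buffers
     * serve both the 8-bit and the high-bit-depth code paths. */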
#define INIT_POINTER_OFFSETS\
    buf2 = buf1 + 0xf00;\
    buf3 = buf2 + 0xf00;\
    buf4 = buf3 + 0x1000*sizeof(pixel);\
    pbuf2 = pbuf1 + 0xf00;\
    pbuf3 = (pixel*)buf3;\
    pbuf4 = (pixel*)buf4;
    INIT_POINTER_OFFSETS;
    for( int i = 0; i < 0x1e00; i++ )
    {
        buf1[i] = rand() & 0xFF;
        pbuf1[i] = rand() & PIXEL_MAX;
    }
    memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );

    if( x264_stack_pagealign( check_all_flags, 0 ) )
    {
        fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
        return -1;
    }
    fprintf( stderr, "x264: All tests passed Yeah :)\n" );
    if( do_bench )
        print_bench();
    return 0;
}
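
/* Run everything through x264_stack_align so the tests start from the same
 * aligned-stack state as the library's own entry points, rather than
 * whatever alignment the host ABI happens to provide. */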
int main( int argc, char **argv )
{
    return x264_stack_align( main_internal, argc, argv );
}