/*****************************************************************************
 * analyse.c: macroblock analysis
 *****************************************************************************
 * Copyright (C) 2003-2018 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macroblock.h"
#include "me.h"
#include "ratecontrol.h"
#include "analyse.h"
#include "rdo.c"

typedef struct
{
    x264_me_t me16x16;
    x264_me_t bi16x16;      /* for b16x16 BI mode, since MVs can differ from l0/l1 */
    x264_me_t me8x8[4];
    x264_me_t me4x4[4][4];
    x264_me_t me8x4[4][2];
    x264_me_t me4x8[4][2];
    x264_me_t me16x8[2];
    x264_me_t me8x16[2];

    int i_rd16x16;
    int i_cost8x8;
    int i_cost4x4[4]; /* cost per 8x8 partition */
    int i_cost8x4[4]; /* cost per 8x8 partition */
    int i_cost4x8[4]; /* cost per 8x8 partition */
    int i_cost16x8;
    int i_cost8x16;
    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
    ALIGNED_4( int16_t mvc[32][5][2] );
} x264_mb_analysis_list_t;

typedef struct
{
    /* conduct the analysis using this lambda and QP */
    int i_lambda;
    int i_lambda2;
    int i_qp;
    uint16_t *p_cost_mv;
    uint16_t *p_cost_ref[2];
    int i_mbrd;

    /* I: Intra part */
    /* Take some shortcuts in intra search if intra is deemed unlikely */
    int b_fast_intra;
    int b_force_intra; /* For Periodic Intra Refresh. Only supported in P-frames. */
    int b_avoid_topright; /* For Periodic Intra Refresh: don't predict from top-right pixels. */
    int b_try_skip;

    /* Luma part */
    int i_satd_i16x16;
    int i_satd_i16x16_dir[7];
    int i_predict16x16;

    int i_satd_i8x8;
    int i_cbp_i8x8_luma;
    ALIGNED_16( uint16_t i_satd_i8x8_dir[4][16] );
    int i_predict8x8[4];

    int i_satd_i4x4;
    int i_predict4x4[16];

    int i_satd_pcm;

    /* Chroma part */
    int i_satd_chroma;
    int i_satd_chroma_dir[7];
    int i_predict8x8chroma;

    /* II: Inter part P/B frame */
    x264_mb_analysis_list_t l0;
    x264_mb_analysis_list_t l1;
    int i_cost16x16bi; /* uses the same ref and mv as l0 and l1 (at least for now) */
    int i_cost16x16direct;
    int i_cost8x8bi;
    int i_cost8x8direct[4];
    int i_satd8x8[3][4]; /* [L0,L1,BI][8x8 0..3] SATD only */
    int i_cost_est16x8[2]; /* Per-partition estimated cost */
    int i_cost_est8x16[2];
    int i_cost16x8bi;
    int i_cost8x16bi;
    int i_rd16x16bi;
    int i_rd16x16direct;
    int i_rd16x8bi;
    int i_rd8x16bi;
    int i_rd8x8bi;

    int i_mb_partition16x8[2]; /* mb_partition_e */
    int i_mb_partition8x16[2];
    int i_mb_type16x8; /* mb_class_e */
    int i_mb_type8x16;

    int b_direct_available;
    int b_early_terminate;
} x264_mb_analysis_t;

/* TODO: calculate CABAC costs */
static const uint8_t i_mb_b_cost_table[X264_MBTYPE_MAX] =
{
    9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0
};
static const uint8_t i_mb_b16x8_cost_table[17] =
{
    0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9
};
static const uint8_t i_sub_mb_b_cost_table[13] =
{
    7, 5, 5, 3, 7, 5, 7, 3, 7, 7, 7, 5, 1
};
static const uint8_t i_sub_mb_p_cost_table[4] =
{
    5, 3, 3, 1
};

static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

static int init_costs( x264_t *h, float *logs, int qp )
{
    if( h->cost_mv[qp] )
        return 0;

    int mv_range = h->param.analyse.i_mv_range;
    int lambda = x264_lambda_tab[qp];
    /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
    CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) );
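    /* center the table so negative mv components can index it directly */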
    h->cost_mv[qp] += 2*4*mv_range;
    for( int i = 0; i <= 2*4*mv_range; i++ )
    {
        h->cost_mv[qp][-i] =
        h->cost_mv[qp][i]  = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
    }
    for( int i = 0; i < 3; i++ )
        for( int j = 0; j < 33; j++ )
            h->cost_table->ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
    if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
    {
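        /* One table per quarter-pel phase: cost_mv_fpel[qp][j][i] caches the cost of
         * qpel mv 4*i+j, letting exhaustive search look up mv costs at full-pel resolution. */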
        for( int j = 0; j < 4; j++ )
        {
            CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) );
            h->cost_mv_fpel[qp][j] += 2*mv_range;
            for( int i = -2*mv_range; i < 2*mv_range; i++ )
                h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
        }
    }
    uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[qp];
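    /* A non-predicted i4x4 mode costs ~3 bits more than the predicted one; the table
     * is used offset by +8, so index 8 corresponds to i_mode == i_pred_mode. */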
    for( int i = 0; i < 17; i++ )
        cost_i4x4_mode[i] = 3*lambda*(i!=8);
    return 0;
fail:
    return -1;
}

int x264_analyse_init_costs( x264_t *h )
{
    int mv_range = h->param.analyse.i_mv_range;
    float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) );
    if( !logs )
        return -1;
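
    /* logs[i] roughly models the bits needed to code an mv delta of magnitude i
     * (about 2*log2(i+1)+1 for an Exp-Golomb-style code, plus a tuning constant). */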
    logs[0] = 0.718f;
    for( int i = 1; i <= 2*4*mv_range; i++ )
        logs[i] = log2f( i+1 ) * 2.0f + 1.718f;

    for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
        if( init_costs( h, logs, qp ) )
            goto fail;
    if( init_costs( h, logs, X264_LOOKAHEAD_QP ) )
        goto fail;

    x264_free( logs );
    return 0;
fail:
    x264_free( logs );
    return -1;
}

void x264_analyse_free_costs( x264_t *h )
{
    int mv_range = h->param.analyse.i_mv_range;
    for( int i = 0; i < QP_MAX+1; i++ )
    {
        if( h->cost_mv[i] )
            x264_free( h->cost_mv[i] - 2*4*mv_range );
        for( int j = 0; j < 4; j++ )
        {
            if( h->cost_mv_fpel[i][j] )
                x264_free( h->cost_mv_fpel[i][j] - 2*mv_range );
        }
    }
}

void x264_analyse_weight_frame( x264_t *h, int end )
{
    for( int j = 0; j < h->i_ref[0]; j++ )
    {
        if( h->sh.weight[j][0].weightfn )
        {
            x264_frame_t *frame = h->fref[0][j];
            int width = frame->i_width[0] + 2*PADH;
            int i_padv = PADV << PARAM_INTERLACED;
            int offset, height;
            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
            height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
            offset = h->fenc->i_lines_weighted*frame->i_stride[0];
            h->fenc->i_lines_weighted += height;
            if( height )
                for( int k = j; k < h->i_ref[0]; k++ )
                    if( h->sh.weight[k][0].weightfn )
                    {
                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
                        x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                 src + offset, frame->i_stride[0],
                                                 width, height, &h->sh.weight[k][0] );
                    }
            break;
        }
    }
}

/* initialize an array of lambda*nbits for all possible mvs */
static void mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
    a->p_cost_mv = h->cost_mv[a->i_qp];
    a->p_cost_ref[0] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
    a->p_cost_ref[1] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int effective_chroma_qp = h->chroma_qp_table[SPEC_QP(qp)] + X264_MAX( qp - QP_MAX_SPEC, 0 );
    a->i_lambda = x264_lambda_tab[qp];
    a->i_lambda2 = x264_lambda2_tab[qp];

    h->mb.b_trellis = h->param.analyse.i_trellis > 1 && a->i_mbrd;
    if( h->param.analyse.i_trellis )
    {
        h->mb.i_trellis_lambda2[0][0] = x264_trellis_lambda2_tab[0][qp];
        h->mb.i_trellis_lambda2[0][1] = x264_trellis_lambda2_tab[1][qp];
        h->mb.i_trellis_lambda2[1][0] = x264_trellis_lambda2_tab[0][effective_chroma_qp];
        h->mb.i_trellis_lambda2[1][1] = x264_trellis_lambda2_tab[1][effective_chroma_qp];
    }
    h->mb.i_psy_rd_lambda = a->i_lambda;
    /* Adjusting chroma lambda based on QP offset hurts PSNR but improves visual quality. */
    int chroma_offset_idx = X264_MIN( qp-effective_chroma_qp+12, MAX_CHROMA_LAMBDA_OFFSET );
    h->mb.i_chroma_lambda2_offset = h->param.analyse.b_psy ? x264_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;

    if( qp > QP_MAX_SPEC )
    {
        h->nr_offset = h->nr_offset_emergency[qp-QP_MAX_SPEC-1];
        h->nr_residual_sum = h->nr_residual_sum_buf[1];
        h->nr_count = h->nr_count_buf[1];
        h->mb.b_noise_reduction = 1;
        qp = QP_MAX_SPEC; /* Out-of-spec QPs are just used for calculating lambda values. */
    }
    else
    {
        h->nr_offset = h->nr_offset_denoise;
        h->nr_residual_sum = h->nr_residual_sum_buf[0];
        h->nr_count = h->nr_count_buf[0];
        h->mb.b_noise_reduction = 0;
    }

    a->i_qp = h->mb.i_qp = qp;
    h->mb.i_chroma_qp = h->chroma_qp_table[qp];
}

static void mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
    int subme = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);

    /* mbrd == 1 -> RD mode decision */
    /* mbrd == 2 -> RD refinement */
    /* mbrd == 3 -> QPRD */
    a->i_mbrd = (subme>=6) + (subme>=8) + (h->param.analyse.i_subpel_refine>=10);
    h->mb.b_deblock_rdo = h->param.analyse.i_subpel_refine >= 9 && h->sh.i_disable_deblocking_filter_idc != 1;
    a->b_early_terminate = h->param.analyse.i_subpel_refine < 11;

    mb_analyse_init_qp( h, a, qp );

    h->mb.b_transform_8x8 = 0;

    /* I: Intra part */
    a->i_satd_i16x16 =
    a->i_satd_i8x8   =
    a->i_satd_i4x4   = COST_MAX;
    a->i_satd_chroma = CHROMA_FORMAT ? COST_MAX : 0;

    /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it.
     * PCM cost can overflow with high lambda2, so cap it at COST_MAX. */
    uint64_t pcm_cost = ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8;
    a->i_satd_pcm = !h->param.i_avcintra_class && !h->mb.i_psy_rd && a->i_mbrd && pcm_cost < COST_MAX ? pcm_cost : COST_MAX;

    a->b_fast_intra = 0;
    a->b_avoid_topright = 0;
    h->mb.i_skip_intra =
        h->mb.b_lossless ? 0 :
        a->i_mbrd ? 2 :
        !h->param.analyse.i_trellis && !h->param.analyse.i_noise_reduction;

    /* II: Inter part P/B frame */
    if( h->sh.i_type != SLICE_TYPE_I )
    {
        int i_fmv_range = 4 * h->param.analyse.i_mv_range;
        // limit motion search to a slightly smaller range than the theoretical limit,
        // since the search may go a few iterations past its given range
        int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel

        /* Calculate max allowed MV range */
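        /* mv components are in quarter-pel units; the 24 allows motion to reach
         * up to 24 pixels into the padded border beyond the frame edge. */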
        h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
        h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
        h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range );
        h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 );
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
        {
            int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
            int max_mv = max_x - 4*16*h->mb.i_mb_x;
            /* If we're left of the refresh bar, don't reference right of it. */
            if( max_mv > 0 && h->mb.i_mb_x < h->fdec->i_pir_start_col )
                h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max_spel[0], max_mv );
        }
        h->mb.mv_limit_fpel[0][0] = (h->mb.mv_min_spel[0]>>2) + i_fpel_border;
        h->mb.mv_limit_fpel[1][0] = (h->mb.mv_max_spel[0]>>2) - i_fpel_border;
        if( h->mb.i_mb_x == 0 && !(h->mb.i_mb_y & PARAM_INTERLACED) )
        {
            int mb_y = h->mb.i_mb_y >> SLICE_MBAFF;
            int thread_mvy_range = i_fmv_range;
            if( h->i_thread_frames > 1 )
            {
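                /* With frame threading, reference frames may still be mid-reconstruction:
                 * wait for enough rows to be ready and clamp the vertical mv range to
                 * what is guaranteed to be available. */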
                int pix_y = (h->mb.i_mb_y | PARAM_INTERLACED) * 16;
                int thresh = pix_y + h->param.analyse.i_mv_range_thread;
                for( int i = (h->sh.i_type == SLICE_TYPE_B); i >= 0; i-- )
                    for( int j = 0; j < h->i_ref[i]; j++ )
                    {
                        x264_frame_cond_wait( h->fref[i][j]->orig, thresh );
                        thread_mvy_range = X264_MIN( thread_mvy_range, h->fref[i][j]->orig->i_lines_completed - pix_y );
                    }

                if( h->param.b_deterministic )
                    thread_mvy_range = h->param.analyse.i_mv_range_thread;
                if( PARAM_INTERLACED )
                    thread_mvy_range >>= 1;

                x264_analyse_weight_frame( h, pix_y + thread_mvy_range );
            }

            if( PARAM_INTERLACED )
            {
                /* 0 == top progressive, 1 == bot progressive, 2 == interlaced */
                for( int i = 0; i < 3; i++ )
                {
                    int j = i == 2;
                    mb_y = (h->mb.i_mb_y >> j) + (i == 1);
                    h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
                    h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
                    h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range );
                    h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range );
                    h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
                    h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
                }
            }
            else
            {
                h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
                h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
                h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range );
                h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range );
                h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
                h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
            }
        }
        if( PARAM_INTERLACED )
        {
            int i = MB_INTERLACED ? 2 : h->mb.i_mb_y&1;
            h->mb.mv_min[1] = h->mb.mv_miny_row[i];
            h->mb.mv_max[1] = h->mb.mv_maxy_row[i];
            h->mb.mv_min_spel[1] = h->mb.mv_miny_spel_row[i];
            h->mb.mv_max_spel[1] = h->mb.mv_maxy_spel_row[i];
            h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
            h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
        }

        a->l0.me16x16.cost =
        a->l0.i_rd16x16    =
        a->l0.i_cost8x8    =
        a->l0.i_cost16x8   =
        a->l0.i_cost8x16   = COST_MAX;
        if( h->sh.i_type == SLICE_TYPE_B )
        {
            a->l1.me16x16.cost =
            a->l1.i_rd16x16    =
            a->l1.i_cost8x8    =
            a->i_cost8x8direct[0] =
            a->i_cost8x8direct[1] =
            a->i_cost8x8direct[2] =
            a->i_cost8x8direct[3] =
            a->l1.i_cost16x8   =
            a->l1.i_cost8x16   =
            a->i_rd16x16bi     =
            a->i_rd16x16direct =
            a->i_rd8x8bi       =
            a->i_rd16x8bi      =
            a->i_rd8x16bi      =
            a->i_cost16x16bi   =
            a->i_cost16x16direct =
            a->i_cost8x8bi     =
            a->i_cost16x8bi    =
            a->i_cost8x16bi    = COST_MAX;
        }
        else if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
            for( int i = 0; i < 4; i++ )
            {
                a->l0.i_cost4x4[i] =
                a->l0.i_cost8x4[i] =
                a->l0.i_cost4x8[i] = COST_MAX;
            }

        /* Fast intra decision */
        if( a->b_early_terminate && h->mb.i_mb_xy - h->sh.i_first_mb > 4 )
        {
            if( IS_INTRA( h->mb.i_mb_type_left[0] ) ||
                IS_INTRA( h->mb.i_mb_type_top ) ||
                IS_INTRA( h->mb.i_mb_type_topleft ) ||
                IS_INTRA( h->mb.i_mb_type_topright ) ||
                (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref[0][0]->mb_type[h->mb.i_mb_xy] )) ||
                (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) )
            { /* intra is likely */ }
            else
            {
                a->b_fast_intra = 1;
            }
        }
        h->mb.b_skip_mc = 0;
        if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P &&
            h->mb.i_mb_x >= h->fdec->i_pir_start_col && h->mb.i_mb_x <= h->fdec->i_pir_end_col )
        {
            a->b_force_intra = 1;
            a->b_fast_intra = 0;
            a->b_avoid_topright = h->mb.i_mb_x == h->fdec->i_pir_end_col;
        }
        else
            a->b_force_intra = 0;
    }
}

/* Prediction modes allowed for various combinations of neighbors. */
/* Terminated by a -1. */
/* In order, no neighbors, left, top, top/left, top/left/topleft */
static const int8_t i16x16_mode_available[5][5] =
{
    {I_PRED_16x16_DC_128, -1, -1, -1, -1},
    {I_PRED_16x16_DC_LEFT, I_PRED_16x16_H, -1, -1, -1},
    {I_PRED_16x16_DC_TOP, I_PRED_16x16_V, -1, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, -1, -1},
    {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};

static const int8_t chroma_mode_available[5][5] =
{
    {I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
    {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
    {I_PRED_CHROMA_DC_TOP, I_PRED_CHROMA_V, -1, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, -1, -1},
    {I_PRED_CHROMA_V, I_PRED_CHROMA_H, I_PRED_CHROMA_DC, I_PRED_CHROMA_P, -1},
};

static const int8_t i8x8_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_H, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
    }
};

static const int8_t i4x4_mode_available[2][5][10] =
{
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_VL, I_PRED_4x4_HU, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, -1},
    },
    {
        {I_PRED_4x4_DC_128, -1, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_LEFT, I_PRED_4x4_H, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC_TOP, I_PRED_4x4_V, -1, -1, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_HU, -1, -1, -1, -1, -1, -1},
        {I_PRED_4x4_DC, I_PRED_4x4_H, I_PRED_4x4_V, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1, -1},
    }
};

static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour )
{
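    /* MB_LEFT and MB_TOP are the two low bits of the neighbour flags, so the masked
     * value indexes the table rows directly; the full top+left+topleft case maps to row 4. */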
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i16x16_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
{
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return chroma_mode_available[idx];
}

static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
{
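    /* During Periodic Intra Refresh, right-hand 8x8 blocks (odd i) must not predict
     * from top-right pixels, which could lie beyond the refresh boundary. */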
    int avoid_topright = force_intra && (i&1);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i8x8_mode_available[avoid_topright][idx];
}

static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra, int i_neighbour, int i )
{
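    /* (i&5) == 5 selects blocks 5, 7, 13 and 15: the right-column 4x4 blocks in
     * z-scan order, whose top-right samples could lie beyond the refresh boundary. */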
    int avoid_topright = force_intra && ((i&5) == 5);
    int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
    idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
    return i4x4_mode_available[avoid_topright][idx];
}

/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static inline void psy_trellis_init( x264_t *h, int do_both_dct )
{
    if( do_both_dct || h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
    if( do_both_dct || !h->mb.b_transform_8x8 )
        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
}

/* Reset fenc satd scores cache for psy RD */
static inline void mb_init_fenc_cache( x264_t *h, int b_satd )
{
    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
        psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
    if( !h->mb.i_psy_rd )
        return;

    M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
    M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
    M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
    M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
    h->mb.pic.fenc_hadamard_cache[8] = 0;
    if( b_satd )
        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}

static void mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
    if( a->i_satd_chroma < COST_MAX )
        return;

    if( CHROMA444 )
    {
        if( !h->mb.b_chroma_me )
        {
            a->i_satd_chroma = 0;
            return;
        }

        /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
        if( h->mb.b_lossless )
        {
            x264_predict_lossless_16x16( h, 1, a->i_predict16x16 );
            x264_predict_lossless_16x16( h, 2, a->i_predict16x16 );
        }
        else
        {
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
            h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
        }
        a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                         + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        return;
    }

    const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];

    /* Prediction selection for chroma */
    if( predict_mode[3] >= 0 && !h->mb.b_lossless )
    {
        int satdu[4], satdv[4];
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
        h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
        h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
        satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
        satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );

        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }
    else
    {
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_satd;
            int i_mode = *predict_mode;

            /* we do the prediction */
            if( h->mb.b_lossless )
                x264_predict_lossless_chroma( h, i_mode );
            else
            {
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
            }

            /* we calculate the cost */
            i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) +
                     h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) +
                     a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );

            a->i_satd_chroma_dir[i_mode] = i_satd;
            COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
        }
    }

    h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
}

/* FIXME: should we do any sort of merged chroma analysis with 4:4:4? */
static void mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
    pixel *p_src = h->mb.pic.p_fenc[0];
    pixel *p_dst = h->mb.pic.p_fdec[0];
    static const int8_t intra_analysis_shortcut[2][2][2][5] =
    {
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_VL, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_VL, -1}}},
        {{{I_PRED_4x4_HU, -1, -1, -1, -1},
          {-1, -1, -1, -1, -1}},
         {{I_PRED_4x4_DDR, I_PRED_4x4_HD, I_PRED_4x4_HU, -1, -1},
          {I_PRED_4x4_DDR, I_PRED_4x4_VR, -1, -1, -1}}},
    };

    int idx;
    int lambda = a->i_lambda;

    /*---------------- Try all mode and calculate their score ---------------*/
    /* Disabled i16x16 for AVC-Intra compat */
    if( !h->param.i_avcintra_class )
    {
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );

        /* Not heavily tuned */
        static const uint8_t i16x16_thresh_lut[11] = { 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4 };
        int i16x16_thresh = a->b_fast_intra ? (i16x16_thresh_lut[h->mb.i_subpel_refine]*i_satd_inter)>>1 : COST_MAX;

        if( !h->mb.b_lossless && predict_mode[3] >= 0 )
        {
            h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
            a->i_satd_i16x16_dir[0] += lambda * bs_size_ue(0);
            a->i_satd_i16x16_dir[1] += lambda * bs_size_ue(1);
            a->i_satd_i16x16_dir[2] += lambda * bs_size_ue(2);
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[0], a->i_predict16x16, 0 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[1], a->i_predict16x16, 1 );
            COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[2], a->i_predict16x16, 2 );

            /* Plane is expensive, so don't check it unless one of the previous modes was useful. */
            if( a->i_satd_i16x16 <= i16x16_thresh )
            {
                h->predict_16x16[I_PRED_16x16_P]( p_dst );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
                a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
                COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
            }
        }
        else
        {
            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_satd;
                int i_mode = *predict_mode;

                if( h->mb.b_lossless )
                    x264_predict_lossless_16x16( h, 0, i_mode );
                else
                    h->predict_16x16[i_mode]( p_dst );

                i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) +
                         lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
                COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
                a->i_satd_i16x16_dir[i_mode] = i_satd;
            }
        }

        if( h->sh.i_type == SLICE_TYPE_B )
            /* cavlc mb type prefix */
            a->i_satd_i16x16 += lambda * i_mb_b_cost_table[I_16x16];

        if( a->i_satd_i16x16 > i16x16_thresh )
            return;
    }
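
    /* +8 centers the table: it is indexed relative to the predicted mode below,
     * so the predicted mode itself (entry 8) carries no extra bit cost. */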
    uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[a->i_qp] + 8;
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[36] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );

        // FIXME some bias like in i4x4?
        int i_cost = lambda * 4; /* base predmode costs */
        h->mb.i_cbp_luma = 0;

        if( h->sh.i_type == SLICE_TYPE_B )
            i_cost += lambda * i_mb_b_cost_table[I_8x8];

        for( idx = 0;; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
            pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
            int i_best = COST_MAX;
            int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            h->predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            if( h->pixf.intra_mbcmp_x9_8x8 && predict_mode[8] >= 0 )
            {
                /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
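                /* Packed return value: best mode in the high 16 bits, its SATD cost
                 * in the low 16 bits. */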
                i_best = h->pixf.intra_mbcmp_x9_8x8( p_src_by, p_dst_by, edge, cost_i4x4_mode-i_pred_mode, a->i_satd_i8x8_dir[idx] );
                i_cost += i_best & 0xffff;
                i_best >>= 16;
                a->i_predict8x8[idx] = i_best;
                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, i_best );
            }
            else
            {
                if( !h->mb.b_lossless && predict_mode[5] >= 0 )
                {
                    ALIGNED_ARRAY_16( int32_t, satd,[9] );
                    h->pixf.intra_mbcmp_x3_8x8( p_src_by, edge, satd );
                    int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
                    satd[i_pred_mode] -= 3 * lambda;
                    for( int i = 2; i >= 0; i-- )
                    {
                        int cost = satd[i];
                        a->i_satd_i8x8_dir[idx][i] = cost + 4 * lambda;
                        COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
                    }

                    /* Take analysis shortcuts: don't analyse modes that are too
                     * far away direction-wise from the favored mode. */
                    if( a->i_mbrd < 1 + a->b_fast_intra )
                        predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
                    else
                        predict_mode += 3;
                }

                for( ; *predict_mode >= 0 && (i_best >= 0 || a->i_mbrd >= 2); predict_mode++ )
                {
                    int i_satd;
                    int i_mode = *predict_mode;

                    if( h->mb.b_lossless )
                        x264_predict_lossless_8x8( h, p_dst_by, 0, idx, i_mode, edge );
                    else
                        h->predict_8x8[i_mode]( p_dst_by, edge );

                    i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
                    if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
                        i_satd -= 3 * lambda;

                    COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
                    a->i_satd_i8x8_dir[idx][i_mode] = i_satd + 4 * lambda;
                }
                i_cost += i_best + 3*lambda;

                if( idx == 3 || i_cost > i_satd_thresh )
                    break;
                if( h->mb.b_lossless )
                    x264_predict_lossless_8x8( h, p_dst_by, 0, idx, a->i_predict8x8[idx], edge );
                else
                    h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
                x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
            }

            /* we need to encode this block now (for next ones) */
            x264_mb_encode_i8x8( h, 0, idx, a->i_qp, a->i_predict8x8[idx], edge, 0 );
        }

        if( idx == 3 )
        {
            a->i_satd_i8x8 = i_cost;
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
                h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
                h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
                h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
                h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
                h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
                if( h->mb.i_skip_intra == 2 )
                    h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
            }
        }
        else
        {
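            /* Early termination: only idx+1 of the 4 blocks were analysed, so
             * extrapolate the total cost by scaling with 4/(idx+1) in .8 fixed point. */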
            static const uint16_t cost_div_fix8[3] = {1024,512,341};
            a->i_satd_i8x8 = COST_MAX;
            i_cost = (i_cost * cost_div_fix8[idx]) >> 8;
        }
        /* Not heavily tuned */
        static const uint8_t i8x8_thresh[11] = { 4, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6 };
        if( a->b_early_terminate && X264_MIN(i_cost, a->i_satd_i16x16) > (i_satd_inter*i8x8_thresh[h->mb.i_subpel_refine])>>2 )
            return;
    }

    /* 4x4 prediction selection */
    if( flags & X264_ANALYSE_I4x4 )
    {
        int i_cost = lambda * (24+16); /* 24 from JVT (SATD0), 16 from base predmode costs */
  775. int i_satd_thresh = a->b_early_terminate ? X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ) : COST_MAX;
  776. h->mb.i_cbp_luma = 0;
  777. if( a->b_early_terminate && a->i_mbrd )
  778. i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
  779. if( h->sh.i_type == SLICE_TYPE_B )
  780. i_cost += lambda * i_mb_b_cost_table[I_4x4];
  781. for( idx = 0;; idx++ )
  782. {
  783. pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
  784. pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
  785. int i_best = COST_MAX;
  786. int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
  787. const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );
  788. if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
  789. /* emulate missing topright samples */
  790. MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
  791. if( h->pixf.intra_mbcmp_x9_4x4 && predict_mode[8] >= 0 )
  792. {
  793. /* No shortcuts here. The SSSE3 implementation of intra_mbcmp_x9 is fast enough. */
  794. i_best = h->pixf.intra_mbcmp_x9_4x4( p_src_by, p_dst_by, cost_i4x4_mode-i_pred_mode );
  795. i_cost += i_best & 0xffff;
  796. i_best >>= 16;
  797. a->i_predict4x4[idx] = i_best;
  798. if( i_cost > i_satd_thresh || idx == 15 )
  799. break;
  800. h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = i_best;
  801. }
  802. else
  803. {
  804. if( !h->mb.b_lossless && predict_mode[5] >= 0 )
  805. {
  806. ALIGNED_ARRAY_16( int32_t, satd,[9] );
  807. h->pixf.intra_mbcmp_x3_4x4( p_src_by, p_dst_by, satd );
  808. int favor_vertical = satd[I_PRED_4x4_H] > satd[I_PRED_4x4_V];
  809. satd[i_pred_mode] -= 3 * lambda;
  810. i_best = satd[I_PRED_4x4_DC]; a->i_predict4x4[idx] = I_PRED_4x4_DC;
  811. COPY2_IF_LT( i_best, satd[I_PRED_4x4_H], a->i_predict4x4[idx], I_PRED_4x4_H );
  812. COPY2_IF_LT( i_best, satd[I_PRED_4x4_V], a->i_predict4x4[idx], I_PRED_4x4_V );
  813. /* Take analysis shortcuts: don't analyse modes that are too
  814. * far away direction-wise from the favored mode. */
  815. if( a->i_mbrd < 1 + a->b_fast_intra )
  816. predict_mode = intra_analysis_shortcut[a->b_avoid_topright][predict_mode[8] >= 0][favor_vertical];
  817. else
  818. predict_mode += 3;
  819. }
  820. if( i_best > 0 )
  821. {
  822. for( ; *predict_mode >= 0; predict_mode++ )
  823. {
  824. int i_satd;
  825. int i_mode = *predict_mode;
  826. if( h->mb.b_lossless )
  827. x264_predict_lossless_4x4( h, p_dst_by, 0, idx, i_mode );
  828. else
  829. h->predict_4x4[i_mode]( p_dst_by );
  830. i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE );
  831. if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
  832. {
  833. i_satd -= lambda * 3;
  834. if( i_satd <= 0 )
  835. {
  836. i_best = i_satd;
  837. a->i_predict4x4[idx] = i_mode;
  838. break;
  839. }
  840. }
  841. COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
  842. }
  843. }
  844. i_cost += i_best + 3 * lambda;
  845. if( i_cost > i_satd_thresh || idx == 15 )
  846. break;
  847. if( h->mb.b_lossless )
  848. x264_predict_lossless_4x4( h, p_dst_by, 0, idx, a->i_predict4x4[idx] );
  849. else
  850. h->predict_4x4[a->i_predict4x4[idx]]( p_dst_by );
  851. h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
  852. }
  853. /* we need to encode this block now (for next ones) */
  854. x264_mb_encode_i4x4( h, 0, idx, a->i_qp, a->i_predict4x4[idx], 0 );
  855. }
  856. if( idx == 15 )
  857. {
  858. a->i_satd_i4x4 = i_cost;
  859. if( h->mb.i_skip_intra )
  860. {
  861. h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
  862. h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
  863. h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
  864. h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
  865. h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
  866. h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
  867. if( h->mb.i_skip_intra == 2 )
  868. h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
  869. }
  870. }
  871. else
  872. a->i_satd_i4x4 = COST_MAX;
  873. }
  874. }
  875. static void intra_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_thresh )
  876. {
  877. if( !a->b_early_terminate )
  878. i_satd_thresh = COST_MAX;
  879. if( a->i_satd_i16x16 < i_satd_thresh )
  880. {
  881. h->mb.i_type = I_16x16;
  882. analyse_update_cache( h, a );
  883. a->i_satd_i16x16 = rd_cost_mb( h, a->i_lambda2 );
  884. }
  885. else
  886. a->i_satd_i16x16 = COST_MAX;
  887. if( a->i_satd_i4x4 < i_satd_thresh )
  888. {
  889. h->mb.i_type = I_4x4;
  890. analyse_update_cache( h, a );
  891. a->i_satd_i4x4 = rd_cost_mb( h, a->i_lambda2 );
  892. }
  893. else
  894. a->i_satd_i4x4 = COST_MAX;
  895. if( a->i_satd_i8x8 < i_satd_thresh )
  896. {
  897. h->mb.i_type = I_8x8;
  898. analyse_update_cache( h, a );
  899. a->i_satd_i8x8 = rd_cost_mb( h, a->i_lambda2 );
  900. a->i_cbp_i8x8_luma = h->mb.i_cbp_luma;
  901. }
  902. else
  903. a->i_satd_i8x8 = COST_MAX;
  904. }

static void intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
    uint64_t i_satd, i_best;
    int plane_count = CHROMA444 ? 3 : 1;
    h->mb.i_skip_intra = 0;

    if( h->mb.i_type == I_16x16 )
    {
        int old_pred_mode = a->i_predict16x16;
        const int8_t *predict_mode = predict_16x16_mode_available( h->mb.i_neighbour_intra );
        int i_thresh = a->b_early_terminate ? a->i_satd_i16x16_dir[old_pred_mode] * 9/8 : COST_MAX;
        i_best = a->i_satd_i16x16;
        for( ; *predict_mode >= 0; predict_mode++ )
        {
            int i_mode = *predict_mode;
            if( i_mode == old_pred_mode || a->i_satd_i16x16_dir[i_mode] > i_thresh )
                continue;
            h->mb.i_intra16x16_pred_mode = i_mode;
            i_satd = rd_cost_mb( h, a->i_lambda2 );
            COPY2_IF_LT( i_best, i_satd, a->i_predict16x16, i_mode );
        }
    }

    /* RD selection for chroma prediction */
    if( CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422 )
    {
        const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
        if( predict_mode[1] >= 0 )
        {
            int8_t predict_mode_sorted[4];
            int i_max;
            int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;

            for( i_max = 0; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
                    predict_mode_sorted[i_max++] = i_mode;
            }

            if( i_max > 0 )
            {
                int i_cbp_chroma_best = h->mb.i_cbp_chroma;
                int i_chroma_lambda = x264_lambda2_tab[h->mb.i_chroma_qp];
                /* the previous thing encoded was intra_rd(), so the pixels and
                 * coefs for the current chroma mode are still around, so we only
                 * have to recount the bits. */
                i_best = rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
                for( int i = 0; i < i_max; i++ )
                {
                    int i_mode = predict_mode_sorted[i];
                    if( h->mb.b_lossless )
                        x264_predict_lossless_chroma( h, i_mode );
                    else
                    {
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
                        h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
                    }
                    /* if we've already found a mode that needs no residual, then
                     * probably any mode with a residual will be worse.
                     * so avoid dct on the remaining modes to improve speed. */
                    i_satd = rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
                    COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
                }
                h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
                h->mb.i_cbp_chroma = i_cbp_chroma_best;
            }
        }
    }

    if( h->mb.i_type == I_4x4 )
    {
        pixel4 pels[3][4] = {{0}}; // doesn't need initting, just shuts up a gcc warning
        int nnz[3] = {0};
        for( int idx = 0; idx < 16; idx++ )
        {
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[1] + block_idx_xy_fdec[idx],
                             h->mb.pic.p_fdec[2] + block_idx_xy_fdec[idx]};
            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_4x4_mode_available( a->b_avoid_topright, h->mb.i_neighbour4[idx], idx );

            if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                for( int p = 0; p < plane_count; p++ )
                    /* emulate missing topright samples */
                    MPIXEL_X4( dst[p]+4-FDEC_STRIDE ) = PIXEL_SPLAT_X4( dst[p][3-FDEC_STRIDE] );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                i_satd = rd_cost_i4x4( h, a->i_lambda2, idx, i_mode );

                if( i_best > i_satd )
                {
                    a->i_predict4x4[idx] = i_mode;
                    i_best = i_satd;
                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels[p][0] = MPIXEL_X4( dst[p]+0*FDEC_STRIDE );
                        pels[p][1] = MPIXEL_X4( dst[p]+1*FDEC_STRIDE );
                        pels[p][2] = MPIXEL_X4( dst[p]+2*FDEC_STRIDE );
                        pels[p][3] = MPIXEL_X4( dst[p]+3*FDEC_STRIDE );
                        nnz[p] = h->mb.cache.non_zero_count[x264_scan8[idx+p*16]];
                    }
                }
            }

            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+0*FDEC_STRIDE ) = pels[p][0];
                MPIXEL_X4( dst[p]+1*FDEC_STRIDE ) = pels[p][1];
                MPIXEL_X4( dst[p]+2*FDEC_STRIDE ) = pels[p][2];
                MPIXEL_X4( dst[p]+3*FDEC_STRIDE ) = pels[p][3];
                h->mb.cache.non_zero_count[x264_scan8[idx+p*16]] = nnz[p];
            }

            h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
        }
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
        for( int idx = 0; idx < 4; idx++ )
        {
            int x = idx&1;
            int y = idx>>1;
            int s8 = X264_SCAN8_0 + 2*x + 16*y;
            pixel *dst[3] = {h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[1] + 8*x + 8*y*FDEC_STRIDE,
                             h->mb.pic.p_fdec[2] + 8*x + 8*y*FDEC_STRIDE};
            int cbp_luma_new = 0;
            int i_thresh = a->b_early_terminate ? a->i_satd_i8x8_dir[idx][a->i_predict8x8[idx]] * 11/8 : COST_MAX;

            i_best = COST_MAX64;

            const int8_t *predict_mode = predict_8x8_mode_available( a->b_avoid_topright, h->mb.i_neighbour8[idx], idx );
            for( int p = 0; p < plane_count; p++ )
                h->predict_8x8_filter( dst[p], edge[p], h->mb.i_neighbour8[idx], ALL_NEIGHBORS );

            for( ; *predict_mode >= 0; predict_mode++ )
            {
                int i_mode = *predict_mode;
                if( a->i_satd_i8x8_dir[idx][i_mode] > i_thresh )
                    continue;

                h->mb.i_cbp_luma = a->i_cbp_i8x8_luma;
                i_satd = rd_cost_i8x8( h, a->i_lambda2, idx, i_mode, edge );

                if( i_best > i_satd )
                {
                    a->i_predict8x8[idx] = i_mode;
                    cbp_luma_new = h->mb.i_cbp_luma;
                    i_best = i_satd;

                    for( int p = 0; p < plane_count; p++ )
                    {
                        pels_h[p][0] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 );
                        pels_h[p][1] = MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 );
                        if( !(idx&1) )
                            for( int j = 0; j < 7; j++ )
                                pels_v[p][j] = dst[p][7+j*FDEC_STRIDE];
                        nnz[p][0] = M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] );
                        nnz[p][1] = M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] );
                    }
                }
            }
            a->i_cbp_i8x8_luma = cbp_luma_new;
            for( int p = 0; p < plane_count; p++ )
            {
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+0 ) = pels_h[p][0];
                MPIXEL_X4( dst[p]+7*FDEC_STRIDE+4 ) = pels_h[p][1];
                if( !(idx&1) )
                    for( int j = 0; j < 7; j++ )
                        dst[p][7+j*FDEC_STRIDE] = pels_v[p][j];
                M16( &h->mb.cache.non_zero_count[s8 + 0*8 + p*16] ) = nnz[p][0];
                M16( &h->mb.cache.non_zero_count[s8 + 1*8 + p*16] ) = nnz[p][1];
            }

            x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
        }
    }
}
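
/* Setup helpers for x264_me_t. LOAD_FENC points it at the source macroblock
 * planes; LOAD_HPELS at the half-pel-interpolated reference planes for a
 * given list/ref, plus the integral plane used by the exhaustive search
 * methods; LOAD_WPELS swaps in the weighted luma reference when weighted
 * prediction is in use. REF_COST is the lambda-scaled cost of coding the
 * reference index. */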

#define LOAD_FENC(m, src, xoff, yoff) \
{ \
    (m)->p_cost_mv = a->p_cost_mv; \
    (m)->i_stride[0] = h->mb.pic.i_stride[0]; \
    (m)->i_stride[1] = h->mb.pic.i_stride[1]; \
    (m)->i_stride[2] = h->mb.pic.i_stride[2]; \
    (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
    (m)->p_fenc[1] = &(src)[1][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
    (m)->p_fenc[2] = &(src)[2][((xoff)>>CHROMA_H_SHIFT)+((yoff)>>CHROMA_V_SHIFT)*FENC_STRIDE]; \
}

#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
{ \
    (m)->p_fref_w = (m)->p_fref[0] = &(src)[0][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
    if( CHROMA444 ) \
    { \
        (m)->p_fref[ 4] = &(src)[ 4][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 5] = &(src)[ 5][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 6] = &(src)[ 6][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 7] = &(src)[ 7][(xoff)+(yoff)*(m)->i_stride[1]]; \
        (m)->p_fref[ 8] = &(src)[ 8][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[ 9] = &(src)[ 9][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[10] = &(src)[10][(xoff)+(yoff)*(m)->i_stride[2]]; \
        (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
    } \
    else \
        (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>CHROMA_V_SHIFT)*(m)->i_stride[1]]; \
    (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = x264_weight_none; \
    (m)->i_ref = ref; \
}

#define LOAD_WPELS(m, src, list, ref, xoff, yoff) \
    (m)->p_fref_w = &(src)[(xoff)+(yoff)*(m)->i_stride[0]]; \
    (m)->weight = h->sh.weight[i_ref];

#define REF_COST(list, ref) \
    (a->p_cost_ref[list][ref])
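
/* P16x16: motion search against every active L0 reference, keeping the
 * cheapest result in a->l0.me16x16. Roughly, for each ref:
 *     cost = mbcmp( fenc, pred ) + lambda*bits( mv - mvp ) + REF_COST( 0, ref )
 * The halfpel threshold lets x264_me_search_ref skip subpel refinement for
 * refs that can't beat the best so far, and ref_blind_dupe (a ref that
 * duplicates ref0) reuses ref0's vector with a cheap qpel refinement
 * instead of a full search. */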

static void mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    int i_mvc;
    ALIGNED_4( int16_t mvc[8][2] );
    int i_halfpel_thresh = INT_MAX;
    int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;

    /* 16x16 Search on all ref frame */
    m.i_pixel = PIXEL_16x16;
    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    a->l0.me16x16.cost = INT_MAX;
    for( int i_ref = 0; i_ref < h->mb.pic.i_fref[0]; i_ref++ )
    {
        m.i_ref_cost = REF_COST( 0, i_ref );
        i_halfpel_thresh -= m.i_ref_cost;

        /* search with ref */
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 0 );

        x264_mb_predict_mv_16x16( h, 0, i_ref, m.mvp );

        if( h->mb.ref_blind_dupe == i_ref )
        {
            CP32( m.mv, a->l0.mvc[0][0] );
            x264_me_refine_qpel_refdupe( h, &m, p_halfpel_thresh );
        }
        else
        {
            x264_mb_predict_mv_ref16x16( h, 0, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh );
        }

        /* save mv for predicting neighbors */
        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
        CP32( a->l0.mvc[i_ref][0], m.mv );

        /* early termination
         * SSD threshold would probably be better than SATD */
        if( i_ref == 0
            && a->b_try_skip
            && m.cost-m.cost_mv < 300*a->i_lambda
            && abs(m.mv[0]-h->mb.cache.pskip_mv[0])
             + abs(m.mv[1]-h->mb.cache.pskip_mv[1]) <= 1
            && x264_macroblock_probe_pskip( h ) )
        {
            h->mb.i_type = P_SKIP;
            analyse_update_cache( h, a );
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
            return;
        }

        m.cost += m.i_ref_cost;
        i_halfpel_thresh += m.i_ref_cost;

        if( m.cost < a->l0.me16x16.cost )
            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
    }

    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
    assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );

    h->mb.i_type = P_L0;
    if( a->i_mbrd )
    {
        mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
            a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
            if( !(h->mb.i_cbp_luma|h->mb.i_cbp_chroma) )
                h->mb.i_type = P_SKIP;
        }
    }
}
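
/* P8x8 with per-partition reference frames. Each 8x8 block may choose its
 * own ref in 0..i_maxref; when 16x16 chose ref0 and the top/left neighbours
 * are inter, i_maxref is clamped to the largest ref the neighbours actually
 * used, on the assumption that distant refs are unlikely to win here. */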

static void mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_maxref = h->mb.pic.i_fref[0]-1;

    h->mb.i_partition = D_8x8;

#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[0][X264_SCAN8_0+i];\
    if( ref > i_maxref && ref != h->mb.ref_blind_dupe )\
        i_maxref = ref;\
}

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
    if( a->b_early_terminate && (i_maxref > 0 && (a->l0.me16x16.i_ref == 0 || a->l0.me16x16.i_ref == h->mb.ref_blind_dupe) &&
        h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0) )
    {
        i_maxref = 0;
        CHECK_NEIGHBOUR( -8 - 1 );
        CHECK_NEIGHBOUR( -8 + 0 );
        CHECK_NEIGHBOUR( -8 + 2 );
        CHECK_NEIGHBOUR( -8 + 4 );
        CHECK_NEIGHBOUR(  0 - 1 );
        CHECK_NEIGHBOUR( 2*8 - 1 );
    }
#undef CHECK_NEIGHBOUR

    for( int i_ref = 0; i_ref <= i_maxref; i_ref++ )
        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m.i_pixel = PIXEL_8x8;

        LOAD_FENC( &m, p_fenc, 8*x8, 8*y8 );
        l0m->cost = INT_MAX;
        for( int i_ref = 0; i_ref <= i_maxref || i_ref == h->mb.ref_blind_dupe; )
        {
            m.i_ref_cost = REF_COST( 0, i_ref );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            if( h->mb.ref_blind_dupe == i_ref )
            {
                CP32( m.mv, a->l0.mvc[0][i+1] );
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, a->l0.mvc[i_ref], i+1 );

            m.cost += m.i_ref_cost;

            CP32( a->l0.mvc[i_ref][i+1], m.mv );

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
            if( i_ref == i_maxref && i_maxref < h->mb.ref_blind_dupe )
                i_ref = h->mb.ref_blind_dupe;
            else
                i_ref++;
        }

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );

        a->i_satd8x8[0][i] = l0m->cost - ( l0m->cost_mv + l0m->i_ref_cost );

        /* If CABAC is on and we're not doing sub-8x8 analysis, the costs
           are effectively zero. */
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            l0m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* P_8x8 ref0 has no ref cost */
    if( !h->param.b_cabac && !(a->l0.me8x8[0].i_ref | a->l0.me8x8[1].i_ref |
                               a->l0.me8x8[2].i_ref | a->l0.me8x8[3].i_ref) )
        a->l0.i_cost8x8 -= REF_COST( 0, 0 ) * 4;
    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
}
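
/* P8x8 restricted to the single reference picked by the 16x16 search: the
 * fast path when mixed refs are disabled. Each block's search is seeded
 * with the 16x16 mv plus the mvs of the blocks already searched. */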

static void mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
{
    /* Duplicate refs are rarely useful in p8x8 due to the high cost of the
     * reference frame flags. Thus, if we're not doing mixedrefs, just
     * don't bother analysing the dupes. */
    const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
    const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
    pixel **p_fenc = h->mb.pic.p_fenc;
    int i_mvc;
    int16_t (*mvc)[2] = a->l0.mvc[i_ref];

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    i_mvc = 1;
    CP32( mvc[0], a->l0.me16x16.mv );

    for( int i = 0; i < 4; i++ )
    {
        x264_me_t *m = &a->l0.me8x8[i];
        int x8 = i&1;
        int y8 = i>>1;

        m->i_pixel = PIXEL_8x8;
        m->i_ref_cost = i_ref_cost;

        LOAD_FENC( m, p_fenc, 8*x8, 8*y8 );
        LOAD_HPELS( m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*x8, 8*y8 );

        x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
        x264_me_search( h, m, mvc, i_mvc );

        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );

        CP32( mvc[i_mvc], m->mv );
        i_mvc++;

        a->i_satd8x8[0][i] = m->cost - m->cost_mv;

        /* mb type cost */
        m->cost += i_ref_cost;
        if( !h->param.b_cabac || (h->param.analyse.inter & X264_ANALYSE_PSUB8x8) )
            m->cost += a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x8];
    }

    a->l0.i_cost8x8 = a->l0.me8x8[0].cost + a->l0.me8x8[1].cost +
                      a->l0.me8x8[2].cost + a->l0.me8x8[3].cost;
    /* theoretically this should include 4*ref_cost,
     * but 3 seems a better approximation of cabac. */
    if( h->param.b_cabac )
        a->l0.i_cost8x8 -= i_ref_cost;
    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
}
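
/* 16x8: each half only tries the refs chosen by its pair of 8x8 partitions
 * (one or two candidates), seeded with their mvs; 8x16 below is the
 * transposed equivalent. If the first half plus an estimate of the second
 * already exceeds the best partitioning so far, the mode is abandoned with
 * cost COST_MAX. */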

static void mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_16x8;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me16x8[i];
        const int minref = X264_MIN( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[2*i].i_ref, a->l0.me8x8[2*i+1].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_16x8;

        LOAD_FENC( &m, p_fenc, 0, 8*i );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );

            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est16x8[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost16x8 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
    }

    a->l0.i_cost16x8 = a->l0.me16x8[0].cost + a->l0.me16x8[1].cost;
}

static void mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    x264_me_t m;
    pixel **p_fenc = h->mb.pic.p_fenc;
    ALIGNED_4( int16_t mvc[3][2] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x16;

    for( int i = 0; i < 2; i++ )
    {
        x264_me_t *l0m = &a->l0.me8x16[i];
        const int minref = X264_MIN( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int maxref = X264_MAX( a->l0.me8x8[i].i_ref, a->l0.me8x8[i+2].i_ref );
        const int ref8[2] = { minref, maxref };
        const int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

        m.i_pixel = PIXEL_8x16;

        LOAD_FENC( &m, p_fenc, 8*i, 0 );
        l0m->cost = INT_MAX;
        for( int j = 0; j < i_ref8s; j++ )
        {
            const int i_ref = ref8[j];
            m.i_ref_cost = REF_COST( 0, i_ref );

            CP32( mvc[0], a->l0.mvc[i_ref][0] );
            CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
            CP32( mvc[2], a->l0.mvc[i_ref][i+3] );

            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
            LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );

            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
            /* We can only take this shortcut if the first search was performed on ref0. */
            if( h->mb.ref_blind_dupe == i_ref && !ref8[0] )
            {
                /* We can just leave the MV from the previous ref search. */
                x264_me_refine_qpel_refdupe( h, &m, NULL );
            }
            else
                x264_me_search( h, &m, mvc, 3 );

            m.cost += m.i_ref_cost;

            if( m.cost < l0m->cost )
                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
        }

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && l0m->cost + a->i_cost_est8x16[1] > i_best_satd * (4 + !!a->i_mbrd) / 4) )
        {
            a->l0.i_cost8x16 = COST_MAX;
            return;
        }

        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
        x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
    }

    a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}
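
/* Chroma cost of sub-8x8 partitions when chroma ME is enabled: each luma
 * sub-block's mv is reused on the chroma planes (mc_chroma for 4:2:0/4:2:2,
 * mc_luma on the second and third planes for 4:4:4), then scored with mbcmp
 * against the source chroma. The _internal/wrapper split turns the chroma
 * format into a compile-time constant in each inlined copy. */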

static ALWAYS_INLINE int mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                pixel **p_fref, int i8x8, int size, int chroma )
{
    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
    pixel *pix2 = pix1+8;
    int i_stride = h->mb.pic.i_stride[1];
    int chroma_h_shift = chroma <= CHROMA_422;
    int chroma_v_shift = chroma == CHROMA_420;
    int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
    int i_ref = a->l0.me8x8[i8x8].i_ref;
    int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
    x264_weight_t *weight = h->sh.weight[i_ref];

    // FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
    if( chroma == CHROMA_444 ) \
    { \
        int mvx = (me).mv[0] + 4*2*x; \
        int mvy = (me).mv[1] + 4*2*y; \
        h->mc.mc_luma( &pix1[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][4], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][1] ); \
        h->mc.mc_luma( &pix2[2*x+2*y*16], 16, &h->mb.pic.p_fref[0][i_ref][8], i_stride, \
                       mvx, mvy, 2*width, 2*height, &h->sh.weight[i_ref][2] ); \
    } \
    else \
    { \
        int offset = x + (2>>chroma_v_shift)*16*y; \
        int chroma_height = (2>>chroma_v_shift)*height; \
        h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
                         (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
        if( weight[1].weightfn ) \
            weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
        if( weight[2].weightfn ) \
            weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
    }

    if( size == PIXEL_4x4 )
    {
        x264_me_t *m = a->l0.me4x4[i8x8];
        CHROMA4x4MC( 2,2, m[0], 0,0 );
        CHROMA4x4MC( 2,2, m[1], 2,0 );
        CHROMA4x4MC( 2,2, m[2], 0,2 );
        CHROMA4x4MC( 2,2, m[3], 2,2 );
    }
    else if( size == PIXEL_8x4 )
    {
        x264_me_t *m = a->l0.me8x4[i8x8];
        CHROMA4x4MC( 4,2, m[0], 0,0 );
        CHROMA4x4MC( 4,2, m[1], 0,2 );
    }
    else
    {
        x264_me_t *m = a->l0.me4x8[i8x8];
        CHROMA4x4MC( 2,4, m[0], 0,0 );
        CHROMA4x4MC( 2,4, m[1], 2,0 );
    }
#undef CHROMA4x4MC

    int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
    int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
    return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
         + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}

static int mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
{
    if( CHROMA_FORMAT == CHROMA_444 )
        return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
    else if( CHROMA_FORMAT == CHROMA_422 )
        return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
    else
        return mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
}

static void mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x4 = 0; i4x4 < 4; i4x4++ )
    {
        const int idx = 4*i8x8 + i4x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x4 == 0);

        x264_me_t *m = &a->l0.me4x4[i8x8][i4x4];

        m->i_pixel = PIXEL_4x4;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
    }
    a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                            a->l0.me4x4[i8x8][1].cost +
                            a->l0.me4x4[i8x8][2].cost +
                            a->l0.me4x4[i8x8][3].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x4];
    if( h->mb.b_chroma_me && !CHROMA444 )
        a->l0.i_cost4x4[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x4 );
}

static void mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i8x4 = 0; i8x4 < 2; i8x4++ )
    {
        const int idx = 4*i8x8 + 2*i8x4;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i8x4 == 0);

        x264_me_t *m = &a->l0.me8x4[i8x8][i8x4];

        m->i_pixel = PIXEL_8x4;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
    }
    a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_8x4];
    if( h->mb.b_chroma_me && !CHROMA444 )
        a->l0.i_cost8x4[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_8x4 );
}

static void mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
    pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
    pixel **p_fenc = h->mb.pic.p_fenc;
    const int i_ref = a->l0.me8x8[i8x8].i_ref;

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    for( int i4x8 = 0; i4x8 < 2; i4x8++ )
    {
        const int idx = 4*i8x8 + i4x8;
        const int x4 = block_idx_x[idx];
        const int y4 = block_idx_y[idx];
        const int i_mvc = (i4x8 == 0);

        x264_me_t *m = &a->l0.me4x8[i8x8][i4x8];

        m->i_pixel = PIXEL_4x8;
        LOAD_FENC( m, p_fenc, 4*x4, 4*y4 );
        LOAD_HPELS( m, p_fref, 0, i_ref, 4*x4, 4*y4 );
        LOAD_WPELS( m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 4*x4, 4*y4 );

        x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
        x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );

        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
    }
    a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                            REF_COST( 0, i_ref ) +
                            a->i_lambda * i_sub_mb_p_cost_table[D_L0_4x8];
    if( h->mb.b_chroma_me && !CHROMA444 )
        a->l0.i_cost4x8[i8x8] += mb_analyse_inter_p4x4_chroma( h, a, p_fref, i8x8, PIXEL_4x8 );
}
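
/* Bipred chroma cost: interpolate the chroma prediction for both lists,
 * average them with the appropriate bipred weights, and return the mbcmp
 * cost against the source planes. Added to the BI candidates below when
 * b_chroma_me is set. */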

static ALWAYS_INLINE int analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
    ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
    ALIGNED_ARRAY_32( pixel,  bi, [2],[16*16] );
    int i_chroma_cost = 0;
    int chromapix = h->luma2chroma_pixel[i_pixel];

#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
    if( CHROMA444 ) \
    { \
        h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
                       m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
        h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
                       m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
    } \
    else \
    { \
        int v_shift = CHROMA_V_SHIFT; \
        int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
        h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
                         m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
        h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
                         m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
    } \
    h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
    i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
                  + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
}

    if( i_pixel == PIXEL_16x16 )
        COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
    else if( i_pixel == PIXEL_16x8 )
        COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
    else if( i_pixel == PIXEL_8x16 )
        COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
    else
        COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )

    return i_chroma_cost;
}
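
/* B_DIRECT cost, measured on the reconstruction that direct-mode MC left in
 * fdec. With BSUB16x16 enabled the four 8x8 costs are recorded separately
 * so that direct can compete per-partition inside the B8x8 modes. */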

static void mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
{
    /* Assumes that fdec still contains the results of
     * x264_mb_predict_mv_direct16x16 and x264_mb_mc */

    pixel *p_fenc = h->mb.pic.p_fenc[0];
    pixel *p_fdec = h->mb.pic.p_fdec[0];

    a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
    if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
    {
        int chromapix = h->luma2chroma_pixel[PIXEL_8x8];

        for( int i = 0; i < 4; i++ )
        {
            const int x = (i&1)*8;
            const int y = (i>>1)*8;
            a->i_cost8x8direct[i] = h->pixf.mbcmp[PIXEL_8x8]( &p_fenc[x+y*FENC_STRIDE], FENC_STRIDE,
                                                              &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
            if( h->mb.b_chroma_me )
            {
                int fenc_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FENC_STRIDE;
                int fdec_offset = (x>>CHROMA_H_SHIFT) + (y>>CHROMA_V_SHIFT)*FDEC_STRIDE;
                a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
                                       + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
                                                                   &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
            }
            a->i_cost16x16direct += a->i_cost8x8direct[i];

            /* mb type cost */
            a->i_cost8x8direct[i] += a->i_lambda * i_sub_mb_b_cost_table[D_DIRECT_8x8];
        }
    }
    else
    {
        a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
        if( h->mb.b_chroma_me )
        {
            int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
            a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
                                  + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
        }
    }
}
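
/* B16x16: L0, L1 and bidirectional candidates. The ref loop is deliberately
 * reordered (list1 ref0, list0 ref0, skip probe, then the remaining refs)
 * so that B_SKIP can be detected before paying for the full searches; see
 * the comment inside the loop. A 0,0/0,0 BI candidate is always evaluated
 * at the end, since fades tend to produce errant vectors otherwise. */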

static void mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
    pixel *src0, *src1;
    intptr_t stride0 = 16, stride1 = 16;
    int i_ref, i_mvc;
    ALIGNED_4( int16_t mvc[9][2] );
    int try_skip = a->b_try_skip;
    int list1_skipped = 0;
    int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
    int *p_halfpel_thresh[2] = {(a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh[0] : NULL,
                                (a->b_early_terminate && h->mb.pic.i_fref[1]>1) ? &i_halfpel_thresh[1] : NULL};

    x264_me_t m;
    m.i_pixel = PIXEL_16x16;

    LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 0 );

    /* 16x16 Search on list 0 and list 1 */
    a->l0.me16x16.cost = INT_MAX;
    a->l1.me16x16.cost = INT_MAX;

    for( int l = 1; l >= 0; )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

        /* This loop is extremely munged in order to facilitate the following order of operations,
         * necessary for an efficient fast skip.
         * 1.  Search list1 ref0.
         * 2.  Search list0 ref0.
         * 3.  Try skip.
         * 4.  Search the rest of list0.
         * 5.  Go back and finish list1.
         */
        for( i_ref = (list1_skipped && l == 1) ? 1 : 0; i_ref < h->mb.pic.i_fref[l]; i_ref++ )
        {
            if( try_skip && l == 1 && i_ref > 0 )
            {
                list1_skipped = 1;
                break;
            }

            m.i_ref_cost = REF_COST( l, i_ref );

            /* search with ref */
            LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 0 );
            x264_mb_predict_mv_16x16( h, l, i_ref, m.mvp );
            x264_mb_predict_mv_ref16x16( h, l, i_ref, mvc, &i_mvc );
            x264_me_search_ref( h, &m, mvc, i_mvc, p_halfpel_thresh[l] );

            /* add ref cost */
            m.cost += m.i_ref_cost;

            if( m.cost < lX->me16x16.cost )
                h->mc.memcpy_aligned( &lX->me16x16, &m, sizeof(x264_me_t) );

            /* save mv for predicting neighbors */
            CP32( lX->mvc[i_ref][0], m.mv );
            CP32( h->mb.mvr[l][i_ref][h->mb.i_mb_xy], m.mv );

            /* Fast skip detection. */
            if( i_ref == 0 && try_skip )
            {
                if( abs(lX->me16x16.mv[0]-h->mb.cache.direct_mv[l][0][0]) +
                    abs(lX->me16x16.mv[1]-h->mb.cache.direct_mv[l][0][1]) > 1 )
                {
                    try_skip = 0;
                }
                else if( !l )
                {
                    /* We already tested skip */
                    h->mb.i_type = B_SKIP;
                    analyse_update_cache( h, a );
                    return;
                }
            }
        }
        if( list1_skipped && l == 1 && i_ref == h->mb.pic.i_fref[1] )
            break;
        if( list1_skipped && l == 0 )
            l = 1;
        else
            l--;
    }

    /* get cost of BI mode */
    h->mc.memcpy_aligned( &a->l0.bi16x16, &a->l0.me16x16, sizeof(x264_me_t) );
    h->mc.memcpy_aligned( &a->l1.bi16x16, &a->l1.me16x16, sizeof(x264_me_t) );
    int ref_costs = REF_COST( 0, a->l0.bi16x16.i_ref ) + REF_COST( 1, a->l1.bi16x16.i_ref );
    src0 = h->mc.get_ref( pix0, &stride0,
                          h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l0.bi16x16.mv[0], a->l0.bi16x16.mv[1], 16, 16, x264_weight_none );
    src1 = h->mc.get_ref( pix1, &stride1,
                          h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref], h->mb.pic.i_stride[0],
                          a->l1.bi16x16.mv[0], a->l1.bi16x16.mv[1], 16, 16, x264_weight_none );

    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                     + ref_costs
                     + a->l0.bi16x16.cost_mv
                     + a->l1.bi16x16.cost_mv;

    if( h->mb.b_chroma_me )
        a->i_cost16x16bi += analyse_bi_chroma( h, a, 0, PIXEL_16x16 );

    /* Always try the 0,0,0,0 vector; helps avoid errant motion vectors in fades */
    if( M32( a->l0.bi16x16.mv ) | M32( a->l1.bi16x16.mv ) )
    {
        int l0_mv_cost = a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[0]]
                       + a->l0.bi16x16.p_cost_mv[-a->l0.bi16x16.mvp[1]];
        int l1_mv_cost = a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[0]]
                       + a->l1.bi16x16.p_cost_mv[-a->l1.bi16x16.mvp[1]];
        h->mc.avg[PIXEL_16x16]( pix0, 16, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][0], h->mb.pic.i_stride[0],
                                h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
        int cost00 = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                   + ref_costs + l0_mv_cost + l1_mv_cost;

        if( h->mb.b_chroma_me && cost00 < a->i_cost16x16bi )
        {
            ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );

            if( CHROMA444 )
            {
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE );
                h->mc.avg[PIXEL_16x16]( bi, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][8], h->mb.pic.i_stride[2],
                                        h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                cost00 += h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi, FENC_STRIDE );
            }
            else
            {
                ALIGNED_ARRAY_64( pixel, pixuv, [2],[16*FENC_STRIDE] );
                int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
                int v_shift = CHROMA_V_SHIFT;

                if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
                {
                    int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
                {
                    int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
                    h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                     h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
                }
                else
                    h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
                                                         h->mb.pic.i_stride[1], 16>>v_shift );

                h->mc.avg[chromapix]( bi,   FENC_STRIDE, pixuv[0],   FENC_STRIDE, pixuv[1],   FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
                h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
                                      h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );

                cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi,   FENC_STRIDE )
                        + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
            }
        }

        if( cost00 < a->i_cost16x16bi )
        {
            M32( a->l0.bi16x16.mv ) = 0;
            M32( a->l1.bi16x16.mv ) = 0;
            a->l0.bi16x16.cost_mv = l0_mv_cost;
            a->l1.bi16x16.cost_mv = l1_mv_cost;
            a->i_cost16x16bi = cost00;
        }
    }

    /* mb type cost */
    a->i_cost16x16bi   += a->i_lambda * i_mb_b_cost_table[B_BI_BI];
    a->l0.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L0_L0];
    a->l1.me16x16.cost += a->i_lambda * i_mb_b_cost_table[B_L1_L1];
}
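
/* Cache writeback helpers: store the winning mvs/refs for one partition
 * back into the mb cache, as x264_mb_predict_mv and the final encode expect.
 * The B-frame variants also clear whichever list a partition doesn't use
 * (ref -1, mv 0) and, when b_mvd is set, the stored mv deltas. */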

static inline void mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int i )
{
    int x = 2*(i&1);
    int y = i&2;

    switch( h->mb.i_sub_partition[i] )
    {
        case D_L0_8x8:
            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
            break;
        case D_L0_8x4:
            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
            break;
        case D_L0_4x8:
            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
            break;
        case D_L0_4x4:
            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
            break;
        default:
            x264_log( h, X264_LOG_ERROR, "internal error\n" );
            break;
    }
}

static void mb_load_mv_direct8x8( x264_t *h, int idx )
{
    int x = 2*(idx&1);
    int y = idx&2;
    x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
    x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, h->mb.cache.direct_mv[0][idx] );
    x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 1, h->mb.cache.direct_mv[1][idx] );
}

#define CACHE_MV_BI(x,y,dx,dy,me0,me1,part) \
    if( x264_mb_partition_listX_table[0][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, me0.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
    } \
    if( x264_mb_partition_listX_table[1][part] ) \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, me1.i_ref ); \
        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
    } \
    else \
    { \
        x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
        if( b_mvd ) \
            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
    }

static inline void mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    int x = 2*(i&1);
    int y = i&2;
    if( h->mb.i_sub_partition[i] == D_DIRECT_8x8 )
    {
        mb_load_mv_direct8x8( h, i );

        if( b_mvd )
        {
            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
            x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
        }
    }
    else
    {
        CACHE_MV_BI( x, y, 2, 2, a->l0.me8x8[i], a->l1.me8x8[i], h->mb.i_sub_partition[i] );
    }
}

static inline void mb_cache_mv_b16x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 0, 2*i, 4, 2, a->l0.me16x8[i], a->l1.me16x8[i], a->i_mb_partition16x8[i] );
}

static inline void mb_cache_mv_b8x16( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
{
    CACHE_MV_BI( 2*i, 0, 2, 4, a->l0.me8x16[i], a->l1.me8x16[i], a->i_mb_partition8x16[i] );
}
#undef CACHE_MV_BI
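
/* B8x8 analysis: for each 8x8 block, search both lists (all permitted refs
 * here, only the 16x16 refs in the plain variant below), build the BI
 * average, then pick the cheapest of L0/L1/BI/direct per block. */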

static void mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );
    int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};

    /* early termination: if 16x16 chose ref 0, then evaluate no refs older
     * than those used by the neighbors */
#define CHECK_NEIGHBOUR(i)\
{\
    int ref = h->mb.cache.ref[l][X264_SCAN8_0+i];\
    if( ref > i_maxref[l] )\
        i_maxref[l] = ref;\
}

    for( int l = 0; l < 2; l++ )
    {
        x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
        if( i_maxref[l] > 0 && lX->me16x16.i_ref == 0 &&
            h->mb.i_mb_type_top > 0 && h->mb.i_mb_type_left[0] > 0 )
        {
            i_maxref[l] = 0;
            CHECK_NEIGHBOUR( -8 - 1 );
            CHECK_NEIGHBOUR( -8 + 0 );
            CHECK_NEIGHBOUR( -8 + 2 );
            CHECK_NEIGHBOUR( -8 + 4 );
            CHECK_NEIGHBOUR(  0 - 1 );
            CHECK_NEIGHBOUR( 2*8 - 1 );
        }
    }

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi;
        intptr_t stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;

            lX->me8x8[i].cost = INT_MAX;
            for( int i_ref = 0; i_ref <= i_maxref[l]; i_ref++ )
            {
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*x8, 8*y8 );

                x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, lX->mvc[i_ref], i+1 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x8[i].cost )
                {
                    h->mc.memcpy_aligned( &lX->me8x8[i], &m, sizeof(x264_me_t) );
                    a->i_satd8x8[l][i] = m.cost - ( m.cost_mv + m.i_ref_cost );
                }

                /* save mv for predicting other partitions within this MB */
                CP32( lX->mvc[i_ref][i+1], m.mv );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x8[i].p_fref, a->l0.me8x8[i].i_stride[0],
                                a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1], 8, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x8[i].p_fref, a->l1.me8x8[i].i_stride[0],
                                a->l1.me8x8[i].mv[0], a->l1.me8x8[i].mv[1], 8, 8, x264_weight_none );
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1],
                              h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref] );

        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi = a->i_satd8x8[2][i] + a->l0.me8x8[i].cost_mv + a->l1.me8x8[i].cost_mv
                       + a->l0.me8x8[i].i_ref_cost + a->l1.me8x8[i].i_ref_cost
                       + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;

        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
    pixel **p_fref[2] =
        { h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
          h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*8] );

    /* XXX Needed for x264_mb_predict_mv */
    h->mb.i_partition = D_8x8;

    a->i_cost8x8bi = 0;

    for( int i = 0; i < 4; i++ )
    {
        int x8 = i&1;
        int y8 = i>>1;
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {8,8};
        pixel *src[2];

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            x264_me_t *m = &lX->me8x8[i];
            m->i_pixel = PIXEL_8x8;
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*x8, 8*y8 );

            m->i_ref_cost = REF_COST( l, lX->me16x16.i_ref );
            m->i_ref = lX->me16x16.i_ref;

            LOAD_HPELS( m, p_fref[l], l, lX->me16x16.i_ref, 8*x8, 8*y8 );

            x264_macroblock_cache_ref( h, x8*2, y8*2, 2, 2, l, lX->me16x16.i_ref );
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, &lX->me16x16.mv, 1 );
            a->i_satd8x8[l][i] = m->cost - m->cost_mv;
            m->cost += m->i_ref_cost;

            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );

            /* save mv for predicting other partitions within this MB */
            CP32( lX->mvc[lX->me16x16.i_ref][i+1], m->mv );

            /* BI mode */
            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
                                    m->mv[0], m->mv[1], 8, 8, x264_weight_none );
            i_part_cost_bi += m->cost_mv + m->i_ref_cost;
        }
        h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me16x16.i_ref][a->l1.me16x16.i_ref] );
        a->i_satd8x8[2][i] = h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
        i_part_cost_bi += a->i_satd8x8[2][i] + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
        a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
        a->l1.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];

        if( h->mb.b_chroma_me )
        {
            int i_chroma_cost = analyse_bi_chroma( h, a, i, PIXEL_8x8 );
            i_part_cost_bi += i_chroma_cost;
            a->i_satd8x8[2][i] += i_chroma_cost;
        }

        i_part_cost = a->l0.me8x8[i].cost;
        h->mb.i_sub_partition[i] = D_L0_8x8;
        COPY2_IF_LT( i_part_cost, a->l1.me8x8[i].cost, h->mb.i_sub_partition[i], D_L1_8x8 );
        COPY2_IF_LT( i_part_cost, i_part_cost_bi, h->mb.i_sub_partition[i], D_BI_8x8 );
        COPY2_IF_LT( i_part_cost, a->i_cost8x8direct[i], h->mb.i_sub_partition[i], D_DIRECT_8x8 );
        a->i_cost8x8bi += i_part_cost;

        /* XXX Needed for x264_mb_predict_mv */
        mb_cache_mv_b8x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_cost8x8bi += a->i_lambda * i_mb_b_cost_table[B_8x8];
}

static void mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_16x8;
    a->i_cost16x8bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {16,16};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_16x8;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[2*i].i_ref, lX->me8x8[2*i+1].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

            lX->me16x8[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 0, 8*i );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][2*i+1] );
                CP32( mvc[2], lX->mvc[i_ref][2*i+2] );

                x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, l, i_ref );
                x264_mb_predict_mv( h, l, 8*i, 4, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me16x8[i].cost )
                    h->mc.memcpy_aligned( &lX->me16x8[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me16x8[i].p_fref, a->l0.me16x8[i].i_stride[0],
                                a->l0.me16x8[i].mv[0], a->l0.me16x8[i].mv[1], 16, 8, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me16x8[i].p_fref, a->l1.me16x8[i].i_stride[0],
                                a->l1.me16x8[i].mv[0], a->l1.me16x8[i].mv[1], 16, 8, x264_weight_none );
        h->mc.avg[PIXEL_16x8]( pix[0], 16, src[0], stride[0], src[1], stride[1],
                               h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 )
                       + a->l0.me16x8[i].cost_mv + a->l1.me16x8[i].cost_mv + a->l0.me16x8[i].i_ref_cost
                       + a->l1.me16x8[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += analyse_bi_chroma( h, a, i, PIXEL_16x8 );

        i_part_cost = a->l0.me16x8[i].cost;
        a->i_mb_partition16x8[i] = D_L0_8x8; /* not actually 8x8, only the L0 matters */

        if( a->l1.me16x8[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me16x8[i].cost;
            a->i_mb_partition16x8[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition16x8[i] = D_BI_8x8;
        }
        a->i_cost16x8bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est16x8[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost16x8bi = COST_MAX;
            return;
        }

        mb_cache_mv_b16x8( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type16x8 = B_L0_L0
        + (a->i_mb_partition16x8[0]>>2) * 3
        + (a->i_mb_partition16x8[1]>>2);
    a->i_cost16x8bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type16x8];
}

static void mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
    ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
    ALIGNED_4( int16_t mvc[3][2] );

    h->mb.i_partition = D_8x16;
    a->i_cost8x16bi = 0;

    for( int i = 0; i < 2; i++ )
    {
        int i_part_cost;
        int i_part_cost_bi = 0;
        intptr_t stride[2] = {8,8};
        pixel *src[2];
        x264_me_t m;
        m.i_pixel = PIXEL_8x16;
        LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );

        for( int l = 0; l < 2; l++ )
        {
            x264_mb_analysis_list_t *lX = l ? &a->l1 : &a->l0;
            int ref8[2] = { lX->me8x8[i].i_ref, lX->me8x8[i+2].i_ref };
            int i_ref8s = ( ref8[0] == ref8[1] ) ? 1 : 2;

            lX->me8x16[i].cost = INT_MAX;
            for( int j = 0; j < i_ref8s; j++ )
            {
                int i_ref = ref8[j];
                m.i_ref_cost = REF_COST( l, i_ref );

                LOAD_HPELS( &m, h->mb.pic.p_fref[l][i_ref], l, i_ref, 8*i, 0 );

                CP32( mvc[0], lX->mvc[i_ref][0] );
                CP32( mvc[1], lX->mvc[i_ref][i+1] );
                CP32( mvc[2], lX->mvc[i_ref][i+3] );

                x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, l, i_ref );
                x264_mb_predict_mv( h, l, 4*i, 2, m.mvp );
                x264_me_search( h, &m, mvc, 3 );
                m.cost += m.i_ref_cost;

                if( m.cost < lX->me8x16[i].cost )
                    h->mc.memcpy_aligned( &lX->me8x16[i], &m, sizeof(x264_me_t) );
            }
        }

        /* BI mode */
        src[0] = h->mc.get_ref( pix[0], &stride[0], a->l0.me8x16[i].p_fref, a->l0.me8x16[i].i_stride[0],
                                a->l0.me8x16[i].mv[0], a->l0.me8x16[i].mv[1], 8, 16, x264_weight_none );
        src[1] = h->mc.get_ref( pix[1], &stride[1], a->l1.me8x16[i].p_fref, a->l1.me8x16[i].i_stride[0],
                                a->l1.me8x16[i].mv[0], a->l1.me8x16[i].mv[1], 8, 16, x264_weight_none );
        h->mc.avg[PIXEL_8x16]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref] );

        i_part_cost_bi = h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                       + a->l0.me8x16[i].cost_mv + a->l1.me8x16[i].cost_mv + a->l0.me8x16[i].i_ref_cost
                       + a->l1.me8x16[i].i_ref_cost;

        if( h->mb.b_chroma_me )
            i_part_cost_bi += analyse_bi_chroma( h, a, i, PIXEL_8x16 );

        i_part_cost = a->l0.me8x16[i].cost;
        a->i_mb_partition8x16[i] = D_L0_8x8;

        if( a->l1.me8x16[i].cost < i_part_cost )
        {
            i_part_cost = a->l1.me8x16[i].cost;
            a->i_mb_partition8x16[i] = D_L1_8x8;
        }
        if( i_part_cost_bi + a->i_lambda * 1 < i_part_cost )
        {
            i_part_cost = i_part_cost_bi;
            a->i_mb_partition8x16[i] = D_BI_8x8;
        }
        a->i_cost8x16bi += i_part_cost;

        /* Early termination based on the current SATD score of partition[0]
           plus the estimated SATD score of partition[1] */
        if( a->b_early_terminate && (!i && i_part_cost + a->i_cost_est8x16[1] > i_best_satd
            * (16 + (!!a->i_mbrd + !!h->mb.i_psy_rd))/16) )
        {
            a->i_cost8x16bi = COST_MAX;
            return;
        }

        mb_cache_mv_b8x16( h, a, i, 0 );
    }

    /* mb type cost */
    a->i_mb_type8x16 = B_L0_L0
        + (a->i_mb_partition8x16[0]>>2) * 3
        + (a->i_mb_partition8x16[1]>>2);
    a->i_cost8x16bi += a->i_lambda * i_mb_b16x8_cost_table[a->i_mb_type8x16];
}
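
/* Re-score the surviving P partitionings with full RD. Candidates whose
 * SATD-based cost is within 5/4 of the best (3/2 for 16x16) get re-encoded
 * through rd_cost_mb; the rest are discarded as COST_MAX, i.e. the pattern:
 *     if( satd_cost < thresh ) rd = rd_cost_mb( h, lambda2 ); else rd = COST_MAX;
 * With sub-8x8 partitions enabled, each 8x8 block additionally RD-compares
 * its 4x4/8x4/4x8/8x8 sub-types via x264_rd_cost_part. */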
static void mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd )
{
    int thresh = a->b_early_terminate ? i_satd * 5/4 + 1 : COST_MAX;

    h->mb.i_type = P_L0;
    if( a->l0.i_rd16x16 == COST_MAX && (!a->b_early_terminate || a->l0.me16x16.cost <= i_satd * 3/2) )
    {
        h->mb.i_partition = D_16x16;
        analyse_update_cache( h, a );
        a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
    }

    if( a->l0.i_cost16x8 < thresh )
    {
        h->mb.i_partition = D_16x8;
        analyse_update_cache( h, a );
        a->l0.i_cost16x8 = rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost16x8 = COST_MAX;

    if( a->l0.i_cost8x16 < thresh )
    {
        h->mb.i_partition = D_8x16;
        analyse_update_cache( h, a );
        a->l0.i_cost8x16 = rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x16 = COST_MAX;

    if( a->l0.i_cost8x8 < thresh )
    {
        h->mb.i_type = P_8x8;
        h->mb.i_partition = D_8x8;
        if( h->param.analyse.inter & X264_ANALYSE_PSUB8x8 )
        {
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for
             * context selection for future blocks are those left over from previous
             * RDO calls. */
            for( int i = 0; i < 4; i++ )
            {
                int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost};
                int sub8x8_thresh = a->b_early_terminate ? X264_MIN4( costs[0], costs[1], costs[2], costs[3] ) * 5 / 4 : COST_MAX;
                int subtype, btype = D_L0_8x8;
                uint64_t bcost = COST_MAX64;
                for( subtype = D_L0_4x4; subtype <= D_L0_8x8; subtype++ )
                {
                    uint64_t cost;
                    if( costs[subtype] > sub8x8_thresh )
                        continue;
                    h->mb.i_sub_partition[i] = subtype;
                    mb_cache_mv_p8x8( h, a, i );
                    if( subtype == btype )
                        continue;
                    cost = x264_rd_cost_part( h, a->i_lambda2, i<<2, PIXEL_8x8 );
                    COPY2_IF_LT( bcost, cost, btype, subtype );
                }
                if( h->mb.i_sub_partition[i] != btype )
                {
                    h->mb.i_sub_partition[i] = btype;
                    mb_cache_mv_p8x8( h, a, i );
                }
            }
        }
        else
            analyse_update_cache( h, a );
        a->l0.i_cost8x8 = rd_cost_mb( h, a->i_lambda2 );
    }
    else
        a->l0.i_cost8x8 = COST_MAX;
}
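
/* Same idea for B-frames: run full RD only on candidates whose SATD cost is
 * within roughly (17 + psy)/16 of the best inter SATD. Each result is cached
 * in its i_rd* field; COST_MAX means "not evaluated yet". */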
static void mb_analyse_b_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
    int thresh = a->b_early_terminate ? i_satd_inter * (17 + (!!h->mb.i_psy_rd))/16 + 1 : COST_MAX;

    if( a->b_direct_available && a->i_rd16x16direct == COST_MAX )
    {
        h->mb.i_type = B_DIRECT;
        /* Assumes direct/skip MC is still in fdec */
        /* Requires b-rdo to be done before intra analysis */
        h->mb.b_skip_mc = 1;
        analyse_update_cache( h, a );
        a->i_rd16x16direct = rd_cost_mb( h, a->i_lambda2 );
        h->mb.b_skip_mc = 0;
    }

    //FIXME not all the update_cache calls are needed
    h->mb.i_partition = D_16x16;
    /* L0 */
    if( a->l0.me16x16.cost < thresh && a->l0.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L0_L0;
        analyse_update_cache( h, a );
        a->l0.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
    }

    /* L1 */
    if( a->l1.me16x16.cost < thresh && a->l1.i_rd16x16 == COST_MAX )
    {
        h->mb.i_type = B_L1_L1;
        analyse_update_cache( h, a );
        a->l1.i_rd16x16 = rd_cost_mb( h, a->i_lambda2 );
    }

    /* BI */
    if( a->i_cost16x16bi < thresh && a->i_rd16x16bi == COST_MAX )
    {
        h->mb.i_type = B_BI_BI;
        analyse_update_cache( h, a );
        a->i_rd16x16bi = rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x8 */
    if( a->i_cost8x8bi < thresh && a->i_rd8x8bi == COST_MAX )
    {
        h->mb.i_type = B_8x8;
        h->mb.i_partition = D_8x8;
        analyse_update_cache( h, a );
        a->i_rd8x8bi = rd_cost_mb( h, a->i_lambda2 );
        x264_macroblock_cache_skip( h, 0, 0, 4, 4, 0 );
    }

    /* 16x8 */
    if( a->i_cost16x8bi < thresh && a->i_rd16x8bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type16x8;
        h->mb.i_partition = D_16x8;
        analyse_update_cache( h, a );
        a->i_rd16x8bi = rd_cost_mb( h, a->i_lambda2 );
    }

    /* 8x16 */
    if( a->i_cost8x16bi < thresh && a->i_rd8x16bi == COST_MAX )
    {
        h->mb.i_type = a->i_mb_type8x16;
        h->mb.i_partition = D_8x16;
        analyse_update_cache( h, a );
        a->i_rd8x16bi = rd_cost_mb( h, a->i_lambda2 );
    }
}
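
/* Subpel-refine the list-0/list-1 motion vector pairs of whichever partitions
 * ended up bidirectional, using the SATD-based bidir refinement. */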
static void refine_bidir( x264_t *h, x264_mb_analysis_t *a )
{
    int i_biweight;

    if( IS_INTRA(h->mb.i_type) )
        return;

    switch( h->mb.i_partition )
    {
        case D_16x16:
            if( h->mb.i_type == B_BI_BI )
            {
                i_biweight = h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref];
                x264_me_refine_bidir_satd( h, &a->l0.bi16x16, &a->l1.bi16x16, i_biweight );
            }
            break;
        case D_16x8:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition16x8[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me16x8[i].i_ref][a->l1.me16x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me16x8[i], &a->l1.me16x8[i], i_biweight );
                }
            break;
        case D_8x16:
            for( int i = 0; i < 2; i++ )
                if( a->i_mb_partition8x16[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x16[i].i_ref][a->l1.me8x16[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x16[i], &a->l1.me8x16[i], i_biweight );
                }
            break;
        case D_8x8:
            for( int i = 0; i < 4; i++ )
                if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                {
                    i_biweight = h->mb.bipred_weight[a->l0.me8x8[i].i_ref][a->l1.me8x8[i].i_ref];
                    x264_me_refine_bidir_satd( h, &a->l0.me8x8[i], &a->l1.me8x8[i], i_biweight );
                }
            break;
    }
}
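
/* Choose between the 8x8 and 4x4 transform by comparing SA8D (a proxy for the
 * 8x8 transform) against SATD (a proxy for the 4x4 transform) on the motion-
 * compensated prediction. Where available, the merged sa8d_satd primitive
 * computes both in one pass; as the unpacking below implies, it packs SA8D
 * into the low 32 bits of its 64-bit return value and SATD into the high 32. */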
static inline void mb_analyse_transform( x264_t *h )
{
    if( x264_mb_transform_8x8_allowed( h ) && h->param.analyse.b_transform_8x8 && !h->mb.b_lossless )
    {
        /* Only luma MC is really needed for 4:2:0, but the full MC is re-used in macroblock_encode. */
        x264_mb_mc( h );

        int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
        int i_cost8 = 0, i_cost4 = 0;
        /* Not all platforms have a merged SATD function */
        if( h->pixf.sa8d_satd[PIXEL_16x16] )
        {
            uint64_t cost = 0;
            for( int p = 0; p < plane_count; p++ )
            {
                cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                        h->mb.pic.p_fdec[p], FDEC_STRIDE );
            }
            i_cost8 = (uint32_t)cost;
            i_cost4 = (uint32_t)(cost >> 32);
        }
        else
        {
            for( int p = 0; p < plane_count; p++ )
            {
                i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
                i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
                                                      h->mb.pic.p_fdec[p], FDEC_STRIDE );
            }
        }

        h->mb.b_transform_8x8 = i_cost8 < i_cost4;
        h->mb.b_skip_mc = 1;
    }
}
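
/* RD variant of the transform decision: flip the transform size, re-run full
 * RD, and keep the flip only if it wins. On success, *i_satd is rescaled by
 * i_rd8/i_rd so that later SATD-derived thresholds stay proportional to the
 * new RD score. */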
static inline void mb_analyse_transform_rd( x264_t *h, x264_mb_analysis_t *a, int *i_satd, int *i_rd )
{
    if( h->param.analyse.b_transform_8x8 && h->pps->b_transform_8x8_mode )
    {
        uint32_t subpart_bak = M32( h->mb.i_sub_partition );
        /* Try switching the subpartitions to 8x8 so that we can use 8x8 transform mode */
        if( h->mb.i_type == P_8x8 )
            M32( h->mb.i_sub_partition ) = D_L0_8x8*0x01010101;
        else if( !x264_transform_allowed[h->mb.i_type] )
            return;

        analyse_update_cache( h, a );
        h->mb.b_transform_8x8 ^= 1;
        /* FIXME only luma is needed for 4:2:0, but the score for comparison already includes chroma */
        int i_rd8 = rd_cost_mb( h, a->i_lambda2 );

        if( *i_rd >= i_rd8 )
        {
            if( *i_rd > 0 )
                *i_satd = (int64_t)(*i_satd) * i_rd8 / *i_rd;
            *i_rd = i_rd8;
        }
        else
        {
            h->mb.b_transform_8x8 ^= 1;
            M32( h->mb.i_sub_partition ) = subpart_bak;
        }
    }
}

/* Rate-distortion optimal QP selection.
 * FIXME: More than half of the benefit of this function seems to be
 * in the way it improves the coding of chroma DC (by decimating or
 * finding a better way to code a single DC coefficient.)
 * There must be a more efficient way to get that portion of the benefit
 * without doing full QP-RD, but RD-decimation doesn't seem to do the
 * trick. */
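/* The search walks outward from the starting QP, first raising it (only if
 * the block currently codes any coefficients) and then lowering it; each
 * direction stops once the number of consecutive non-improving QPs exceeds
 * the failure threshold computed inside the loop. */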
static inline void mb_analyse_qp_rd( x264_t *h, x264_mb_analysis_t *a )
{
    int bcost, cost, failures, prevcost, origcost;
    int orig_qp = h->mb.i_qp, bqp = h->mb.i_qp;
    int last_qp_tried = 0;
    origcost = bcost = rd_cost_mb( h, a->i_lambda2 );
    int origcbp = h->mb.cbp[h->mb.i_mb_xy];

    /* If CBP is already zero, don't raise the quantizer any higher. */
    for( int direction = origcbp ? 1 : -1; direction >= -1; direction-=2 )
    {
        /* Without psy-RD, require monotonicity when moving quant away from previous
         * macroblock's quant; allow 1 failure when moving quant towards previous quant.
         * With psy-RD, allow 1 failure when moving quant away from previous quant,
         * allow 2 failures when moving quant towards previous quant.
         * Psy-RD generally seems to result in more chaotic RD score-vs-quantizer curves. */
        int threshold = (!!h->mb.i_psy_rd);
        /* Raise the threshold for failures if we're moving towards the last QP. */
        if( ( h->mb.i_last_qp < orig_qp && direction == -1 ) ||
            ( h->mb.i_last_qp > orig_qp && direction == 1 ) )
            threshold++;
        h->mb.i_qp = orig_qp;
        failures = 0;
        prevcost = origcost;

        /* If the current QP results in an empty CBP, it's highly likely that lower QPs
         * (up to a point) will too. So, jump down to where the threshold will kick in
         * and check the QP there. If the CBP is still empty, skip the main loop.
         * If it isn't empty, we would have ended up having to check this QP anyways,
         * so as long as we store it for later lookup, we lose nothing. */
        int already_checked_qp = -1;
        int already_checked_cost = COST_MAX;
        if( direction == -1 )
        {
            if( !origcbp )
            {
                h->mb.i_qp = X264_MAX( h->mb.i_qp - threshold - 1, SPEC_QP( h->param.rc.i_qp_min ) );
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                already_checked_cost = rd_cost_mb( h, a->i_lambda2 );
                if( !h->mb.cbp[h->mb.i_mb_xy] )
                {
                    /* If our empty-CBP block is lower QP than the last QP,
                     * the last QP almost surely doesn't have a CBP either. */
                    if( h->mb.i_last_qp > h->mb.i_qp )
                        last_qp_tried = 1;
                    break;
                }
                already_checked_qp = h->mb.i_qp;
                h->mb.i_qp = orig_qp;
            }
        }

        h->mb.i_qp += direction;
        while( h->mb.i_qp >= h->param.rc.i_qp_min && h->mb.i_qp <= SPEC_QP( h->param.rc.i_qp_max ) )
        {
            if( h->mb.i_last_qp == h->mb.i_qp )
                last_qp_tried = 1;
            if( h->mb.i_qp == already_checked_qp )
                cost = already_checked_cost;
            else
            {
                h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
                cost = rd_cost_mb( h, a->i_lambda2 );
                COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
            }

            /* We can't assume that the costs are monotonic over QPs.
             * Tie case-as-failure seems to give better results. */
            if( cost < prevcost )
                failures = 0;
            else
                failures++;
            prevcost = cost;

            if( failures > threshold )
                break;
            if( direction == 1 && !h->mb.cbp[h->mb.i_mb_xy] )
                break;
            h->mb.i_qp += direction;
        }
    }

    /* Always try the last block's QP. */
    if( !last_qp_tried )
    {
        h->mb.i_qp = h->mb.i_last_qp;
        h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];
        cost = rd_cost_mb( h, a->i_lambda2 );
        COPY2_IF_LT( bcost, cost, bqp, h->mb.i_qp );
    }

    h->mb.i_qp = bqp;
    h->mb.i_chroma_qp = h->chroma_qp_table[h->mb.i_qp];

    /* Check transform again; decision from before may no longer be optimal. */
    if( h->mb.i_qp != orig_qp && h->param.analyse.b_transform_8x8 &&
        x264_mb_transform_8x8_allowed( h ) )
    {
        h->mb.b_transform_8x8 ^= 1;
        cost = rd_cost_mb( h, a->i_lambda2 );
        if( cost > bcost )
            h->mb.b_transform_8x8 ^= 1;
    }
}

/*****************************************************************************
 * x264_macroblock_analyse:
 *****************************************************************************/
void x264_macroblock_analyse( x264_t *h )
{
    x264_mb_analysis_t analysis;
    int i_cost = COST_MAX;

    h->mb.i_qp = x264_ratecontrol_mb_qp( h );
    /* If the QP of this MB is within 1 of the previous MB, code the same QP as the previous MB,
     * to lower the bit cost of the qp_delta. Don't do this if QPRD is enabled. */
    if( h->param.rc.i_aq_mode && h->param.analyse.i_subpel_refine < 10 )
        h->mb.i_qp = abs(h->mb.i_qp - h->mb.i_last_qp) == 1 ? h->mb.i_last_qp : h->mb.i_qp;

    if( h->param.analyse.b_mb_info )
        h->fdec->effective_qp[h->mb.i_mb_xy] = h->mb.i_qp; /* Store the real analysis QP. */
    mb_analyse_init( h, &analysis, h->mb.i_qp );

    /*--------------------------- Do the analysis ---------------------------*/
    if( h->sh.i_type == SLICE_TYPE_I )
    {
intra_analysis:
        if( analysis.i_mbrd )
            mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
        mb_analyse_intra( h, &analysis, COST_MAX );
        if( analysis.i_mbrd )
            intra_rd( h, &analysis, COST_MAX );

        i_cost = analysis.i_satd_i16x16;
        h->mb.i_type = I_16x16;
        COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, h->mb.i_type, I_4x4 );
        COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, h->mb.i_type, I_8x8 );
        if( analysis.i_satd_pcm < i_cost )
            h->mb.i_type = I_PCM;
        else if( analysis.i_mbrd >= 2 )
            intra_rd_refine( h, &analysis );
    }
    else if( h->sh.i_type == SLICE_TYPE_P )
    {
        int b_skip = 0;

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 0 );

        analysis.b_try_skip = 0;
        if( analysis.b_force_intra )
        {
            if( !h->param.analyse.b_psy )
            {
                mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }
        }
        else
        {
            /* Special fast-skip logic using information from mb_info. */
            if( h->fdec->mb_info && (h->fdec->mb_info[h->mb.i_mb_xy]&X264_MBINFO_CONSTANT) )
            {
                if( !SLICE_MBAFF && (h->fdec->i_frame - h->fref[0][0]->i_frame) == 1 && !h->sh.b_weighted_pred &&
                    h->fref[0][0]->effective_qp[h->mb.i_mb_xy] <= h->mb.i_qp )
                {
                    h->mb.i_partition = D_16x16;
                    /* Use the P-SKIP MV if we can... */
                    if( !M32(h->mb.cache.pskip_mv) )
                    {
                        b_skip = 1;
                        h->mb.i_type = P_SKIP;
                    }
                    /* Otherwise, just force a 16x16 block. */
                    else
                    {
                        h->mb.i_type = P_L0;
                        analysis.l0.me16x16.i_ref = 0;
                        M32( analysis.l0.me16x16.mv ) = 0;
                    }
                    goto skip_analysis;
                }
                /* Reset the information accordingly */
                else if( h->param.analyse.b_mb_info_update )
                    h->fdec->mb_info[h->mb.i_mb_xy] &= ~X264_MBINFO_CONSTANT;
            }

            int skip_invalid = h->i_thread_frames > 1 && h->mb.cache.pskip_mv[1] > h->mb.mv_max_spel[1];
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height && !skip_invalid )
                b_skip = 1;
            /* Fast P_SKIP detection */
            else if( h->param.analyse.b_fast_pskip )
            {
                if( skip_invalid )
                    // FIXME don't need to check this if the reference frame is done
                    {}
                else if( h->param.analyse.i_subpel_refine >= 3 )
                    analysis.b_try_skip = 1;
                else if( h->mb.i_mb_type_left[0] == P_SKIP ||
                         h->mb.i_mb_type_top == P_SKIP ||
                         h->mb.i_mb_type_topleft == P_SKIP ||
                         h->mb.i_mb_type_topright == P_SKIP )
                    b_skip = x264_macroblock_probe_pskip( h );
            }
        }

        h->mc.prefetch_ref( h->mb.pic.p_fref[0][0][h->mb.i_mb_x&3], h->mb.pic.i_stride[0], 1 );

        if( b_skip )
        {
            h->mb.i_type = P_SKIP;
            h->mb.i_partition = D_16x16;
            assert( h->mb.cache.pskip_mv[1] <= h->mb.mv_max_spel[1] || h->i_thread_frames == 1 );
skip_analysis:
            /* Set up MVs for future predictors */
            for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
        }
        else
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter, i_satd_intra;

            mb_analyse_load_costs( h, &analysis );

            mb_analyse_inter_p16x16( h, &analysis );

            if( h->mb.i_type == P_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            if( flags & X264_ANALYSE_PSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    mb_analyse_inter_p8x8_mixed_ref( h, &analysis );
                else
                    mb_analyse_inter_p8x8( h, &analysis );
            }

            /* Select best inter mode */
            i_type = P_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;

            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost) )
            {
                i_type = P_8x8;
                i_partition = D_8x8;
                i_cost = analysis.l0.i_cost8x8;

                /* Do sub 8x8 */
                if( flags & X264_ANALYSE_PSUB8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        mb_analyse_inter_p4x4( h, &analysis, i );
                        int i_thresh8x4 = analysis.l0.me4x4[i][1].cost_mv + analysis.l0.me4x4[i][2].cost_mv;
                        if( !analysis.b_early_terminate || analysis.l0.i_cost4x4[i] < analysis.l0.me8x8[i].cost + i_thresh8x4 )
                        {
                            int i_cost8x8 = analysis.l0.i_cost4x4[i];
                            h->mb.i_sub_partition[i] = D_L0_4x4;

                            mb_analyse_inter_p8x4( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost8x4[i],
                                         h->mb.i_sub_partition[i], D_L0_8x4 );

                            mb_analyse_inter_p4x8( h, &analysis, i );
                            COPY2_IF_LT( i_cost8x8, analysis.l0.i_cost4x8[i],
                                         h->mb.i_sub_partition[i], D_L0_4x8 );

                            i_cost += i_cost8x8 - analysis.l0.me8x8[i].cost;
                        }
                        mb_cache_mv_p8x8( h, &analysis, i );
                    }
                    analysis.l0.i_cost8x8 = i_cost;
                }
            }
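            /* Rectangular partitions are only searched when the 8x8 analysis came
             * close enough to 16x16: the slack term below reuses the MV costs of
             * two of the 8x8 halves, and i_cost_est* pre-estimates each half's
             * score from the 8x8 SATDs plus an averaged MV/ref cost so the
             * partition search can bail out early. */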
            /* Now do 16x8/8x16 */
            int i_thresh16x8 = analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[2].cost_mv;
            if( ( flags & X264_ANALYSE_PSUB16x16 ) && (!analysis.b_early_terminate ||
                analysis.l0.i_cost8x8 < analysis.l0.me16x16.cost + i_thresh16x8) )
            {
                int i_avg_mv_ref_cost = (analysis.l0.me8x8[2].cost_mv + analysis.l0.me8x8[2].i_ref_cost
                                      + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est16x8[1] = analysis.i_satd8x8[0][2] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                mb_analyse_inter_p16x8( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost16x8, i_type, P_L0, i_partition, D_16x8 );

                i_avg_mv_ref_cost = (analysis.l0.me8x8[1].cost_mv + analysis.l0.me8x8[1].i_ref_cost
                                  + analysis.l0.me8x8[3].cost_mv + analysis.l0.me8x8[3].i_ref_cost + 1) >> 1;
                analysis.i_cost_est8x16[1] = analysis.i_satd8x8[0][1] + analysis.i_satd8x8[0][3] + i_avg_mv_ref_cost;

                mb_analyse_inter_p8x16( h, &analysis, i_cost );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x16, i_type, P_L0, i_partition, D_8x16 );
            }

            h->mb.i_partition = i_partition;

            /* refine qpel */
            //FIXME mb_type costs?
            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            else if( i_partition == D_16x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                i_cost = analysis.l0.me16x16.cost;
            }
            else if( i_partition == D_16x8 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me16x8[0] );
                x264_me_refine_qpel( h, &analysis.l0.me16x8[1] );
                i_cost = analysis.l0.me16x8[0].cost + analysis.l0.me16x8[1].cost;
            }
            else if( i_partition == D_8x16 )
            {
                x264_me_refine_qpel( h, &analysis.l0.me8x16[0] );
                x264_me_refine_qpel( h, &analysis.l0.me8x16[1] );
                i_cost = analysis.l0.me8x16[0].cost + analysis.l0.me8x16[1].cost;
            }
            else if( i_partition == D_8x8 )
            {
                i_cost = 0;
                for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                {
                    switch( h->mb.i_sub_partition[i8x8] )
                    {
                        case D_L0_8x8:
                            x264_me_refine_qpel( h, &analysis.l0.me8x8[i8x8] );
                            i_cost += analysis.l0.me8x8[i8x8].cost;
                            break;
                        case D_L0_8x4:
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me8x4[i8x8][1] );
                            i_cost += analysis.l0.me8x4[i8x8][0].cost +
                                      analysis.l0.me8x4[i8x8][1].cost;
                            break;
                        case D_L0_4x8:
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x8[i8x8][1] );
                            i_cost += analysis.l0.me4x8[i8x8][0].cost +
                                      analysis.l0.me4x8[i8x8][1].cost;
                            break;
                        case D_L0_4x4:
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][0] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][1] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][2] );
                            x264_me_refine_qpel( h, &analysis.l0.me4x4[i8x8][3] );
                            i_cost += analysis.l0.me4x4[i8x8][0].cost +
                                      analysis.l0.me4x4[i8x8][1].cost +
                                      analysis.l0.me4x4[i8x8][2].cost +
                                      analysis.l0.me4x4[i8x8][3].cost;
                            break;
                        default:
                            x264_log( h, X264_LOG_ERROR, "internal error (!8x8 && !4x4)\n" );
                            break;
                    }
                }
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    mb_analyse_intra( h, &analysis, i_cost );
                    mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    mb_analyse_intra_chroma( h, &analysis );
                    mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                mb_analyse_intra( h, &analysis, i_cost );

            i_satd_inter = i_cost;
            i_satd_intra = X264_MIN3( analysis.i_satd_i16x16,
                                      analysis.i_satd_i8x8,
                                      analysis.i_satd_i4x4 );

            if( analysis.i_mbrd )
            {
                mb_analyse_p_rd( h, &analysis, X264_MIN(i_satd_inter, i_satd_intra) );
                i_type = P_L0;
                i_partition = D_16x16;
                i_cost = analysis.l0.i_rd16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_cost16x8, i_partition, D_16x8 );
                COPY2_IF_LT( i_cost, analysis.l0.i_cost8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.l0.i_cost8x8, i_partition, D_8x8, i_type, P_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
                if( i_cost < COST_MAX )
                    mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                intra_rd( h, &analysis, i_satd_inter * 5/4 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;

            if( analysis.b_force_intra && !IS_INTRA(i_type) )
            {
                /* Intra masking: copy fdec to fenc and re-encode the block as intra in order to
                 * make it appear as if it was an inter block. */
                analyse_update_cache( h, &analysis );
                x264_macroblock_encode( h );
                for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
                    h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
                if( !CHROMA444 )
                {
                    int height = 16 >> CHROMA_V_SHIFT;
                    h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
                    h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
                }
                mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
                goto intra_analysis;
            }

            if( analysis.i_mbrd >= 2 && h->mb.i_type != I_PCM )
            {
                if( IS_INTRA( h->mb.i_type ) )
                {
                    intra_rd_refine( h, &analysis );
                }
                else if( i_partition == D_16x16 )
                {
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, analysis.l0.me16x16.i_ref );
                    analysis.l0.me16x16.cost = i_cost;
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                }
                else if( i_partition == D_16x8 )
                {
                    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, analysis.l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, analysis.l0.me16x8[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[1], analysis.i_lambda2, 8, 0 );
                }
                else if( i_partition == D_8x16 )
                {
                    M32( h->mb.i_sub_partition ) = D_L0_8x8 * 0x01010101;
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, analysis.l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, analysis.l0.me8x16[1].i_ref );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[0], analysis.i_lambda2, 0, 0 );
                    x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[1], analysis.i_lambda2, 4, 0 );
                }
                else if( i_partition == D_8x8 )
                {
                    analyse_update_cache( h, &analysis );
                    for( int i8x8 = 0; i8x8 < 4; i8x8++ )
                    {
                        if( h->mb.i_sub_partition[i8x8] == D_L0_8x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i8x8], analysis.i_lambda2, i8x8*4, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_8x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x4[i8x8][1], analysis.i_lambda2, i8x8*4+2, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x8 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x8[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                        }
                        else if( h->mb.i_sub_partition[i8x8] == D_L0_4x4 )
                        {
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][0], analysis.i_lambda2, i8x8*4+0, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][1], analysis.i_lambda2, i8x8*4+1, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][2], analysis.i_lambda2, i8x8*4+2, 0 );
                            x264_me_refine_qpel_rd( h, &analysis.l0.me4x4[i8x8][3], analysis.i_lambda2, i8x8*4+3, 0 );
                        }
                    }
                }
            }
        }
    }
    else if( h->sh.i_type == SLICE_TYPE_B )
    {
        int i_bskip_cost = COST_MAX;
        int b_skip = 0;

        if( analysis.i_mbrd )
            mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );

        h->mb.i_type = B_SKIP;
        if( h->mb.b_direct_auto_write )
        {
            /* direct=auto heuristic: prefer whichever mode allows more Skip macroblocks */
            for( int i = 0; i < 2; i++ )
            {
                int b_changed = 1;
                h->sh.b_direct_spatial_mv_pred ^= 1;
                analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, i && analysis.b_direct_available ? &b_changed : NULL );
                if( analysis.b_direct_available )
                {
                    if( b_changed )
                    {
                        x264_mb_mc( h );
                        b_skip = x264_macroblock_probe_bskip( h );
                    }
                    h->stat.frame.i_direct_score[ h->sh.b_direct_spatial_mv_pred ] += b_skip;
                }
                else
                    b_skip = 0;
            }
        }
        else
            analysis.b_direct_available = x264_mb_predict_mv_direct16x16( h, NULL );

        analysis.b_try_skip = 0;
        if( analysis.b_direct_available )
        {
            if( !h->mb.b_direct_auto_write )
                x264_mb_mc( h );
            /* If the current macroblock is off the frame, just skip it. */
            if( HAVE_INTERLACED && !MB_INTERLACED && h->mb.i_mb_y * 16 >= h->param.i_height )
                b_skip = 1;
            else if( analysis.i_mbrd )
            {
                i_bskip_cost = ssd_mb( h );
                /* 6 = minimum cavlc cost of a non-skipped MB */
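                /* (The 6-bit rate cost is presumably scaled by lambda2's 8
                 * fractional bits, with +128 for rounding and >>8, so that it is
                 * comparable against the raw SSD from ssd_mb() above.) */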
                b_skip = h->mb.b_skip_mc = i_bskip_cost <= ((6 * analysis.i_lambda2 + 128) >> 8);
            }
            else if( !h->mb.b_direct_auto_write )
            {
                /* Conditioning the probe on neighboring block types
                 * doesn't seem to help speed or quality. */
                analysis.b_try_skip = x264_macroblock_probe_bskip( h );
                if( h->param.analyse.i_subpel_refine < 3 )
                    b_skip = analysis.b_try_skip;
            }
            /* Set up MVs for future predictors */
            if( b_skip )
            {
                for( int i = 0; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 0; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
            }
        }

        if( !b_skip )
        {
            const unsigned int flags = h->param.analyse.inter;
            int i_type;
            int i_partition;
            int i_satd_inter;
            h->mb.b_skip_mc = 0;
            h->mb.i_type = B_DIRECT;

            mb_analyse_load_costs( h, &analysis );

            /* select best inter mode */
            /* direct must be first */
            if( analysis.b_direct_available )
                mb_analyse_inter_direct( h, &analysis );

            mb_analyse_inter_b16x16( h, &analysis );

            if( h->mb.i_type == B_SKIP )
            {
                for( int i = 1; i < h->mb.pic.i_fref[0]; i++ )
                    M32( h->mb.mvr[0][i][h->mb.i_mb_xy] ) = 0;
                for( int i = 1; i < h->mb.pic.i_fref[1]; i++ )
                    M32( h->mb.mvr[1][i][h->mb.i_mb_xy] ) = 0;
                return;
            }

            i_type = B_L0_L0;
            i_partition = D_16x16;
            i_cost = analysis.l0.me16x16.cost;
            COPY2_IF_LT( i_cost, analysis.l1.me16x16.cost, i_type, B_L1_L1 );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16bi, i_type, B_BI_BI );
            COPY2_IF_LT( i_cost, analysis.i_cost16x16direct, i_type, B_DIRECT );

            if( analysis.i_mbrd && analysis.b_early_terminate && analysis.i_cost16x16direct <= i_cost * 33/32 )
            {
                mb_analyse_b_rd( h, &analysis, i_cost );
                if( i_bskip_cost < analysis.i_rd16x16direct &&
                    i_bskip_cost < analysis.i_rd16x16bi &&
                    i_bskip_cost < analysis.l0.i_rd16x16 &&
                    i_bskip_cost < analysis.l1.i_rd16x16 )
                {
                    h->mb.i_type = B_SKIP;
                    analyse_update_cache( h, &analysis );
                    return;
                }
            }

            if( flags & X264_ANALYSE_BSUB16x16 )
            {
                if( h->param.analyse.b_mixed_references )
                    mb_analyse_inter_b8x8_mixed_ref( h, &analysis );
                else
                    mb_analyse_inter_b8x8( h, &analysis );

                COPY3_IF_LT( i_cost, analysis.i_cost8x8bi, i_type, B_8x8, i_partition, D_8x8 );

                /* Try to estimate the cost of b16x8/b8x16 based on the satd scores of the b8x8 modes */
                int i_cost_est16x8bi_total = 0, i_cost_est8x16bi_total = 0;
                int i_mb_type, i_partition16x8[2], i_partition8x16[2];
                for( int i = 0; i < 2; i++ )
                {
                    int avg_l0_mv_ref_cost, avg_l1_mv_ref_cost;
                    int i_l0_satd, i_l1_satd, i_bi_satd, i_best_cost;
                    // 16x8
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i*2] + analysis.i_satd8x8[0][i*2+1];
                    i_l1_satd = analysis.i_satd8x8[1][i*2] + analysis.i_satd8x8[1][i*2+1];
                    i_bi_satd = analysis.i_satd8x8[2][i*2] + analysis.i_satd8x8[2][i*2+1];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i*2].cost_mv + analysis.l0.me8x8[i*2].i_ref_cost
                                         + analysis.l0.me8x8[i*2+1].cost_mv + analysis.l0.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i*2].cost_mv + analysis.l1.me8x8[i*2].i_ref_cost
                                         + analysis.l1.me8x8[i*2+1].cost_mv + analysis.l1.me8x8[i*2+1].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition16x8[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition16x8[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition16x8[i], D_BI_8x8 );
                    analysis.i_cost_est16x8[i] = i_best_cost;
                    // 8x16
                    i_best_cost = COST_MAX;
                    i_l0_satd = analysis.i_satd8x8[0][i] + analysis.i_satd8x8[0][i+2];
                    i_l1_satd = analysis.i_satd8x8[1][i] + analysis.i_satd8x8[1][i+2];
                    i_bi_satd = analysis.i_satd8x8[2][i] + analysis.i_satd8x8[2][i+2];
                    avg_l0_mv_ref_cost = ( analysis.l0.me8x8[i].cost_mv + analysis.l0.me8x8[i].i_ref_cost
                                         + analysis.l0.me8x8[i+2].cost_mv + analysis.l0.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    avg_l1_mv_ref_cost = ( analysis.l1.me8x8[i].cost_mv + analysis.l1.me8x8[i].i_ref_cost
                                         + analysis.l1.me8x8[i+2].cost_mv + analysis.l1.me8x8[i+2].i_ref_cost + 1 ) >> 1;
                    COPY2_IF_LT( i_best_cost, i_l0_satd + avg_l0_mv_ref_cost, i_partition8x16[i], D_L0_8x8 );
                    COPY2_IF_LT( i_best_cost, i_l1_satd + avg_l1_mv_ref_cost, i_partition8x16[i], D_L1_8x8 );
                    COPY2_IF_LT( i_best_cost, i_bi_satd + avg_l0_mv_ref_cost + avg_l1_mv_ref_cost, i_partition8x16[i], D_BI_8x8 );
                    analysis.i_cost_est8x16[i] = i_best_cost;
                }
                i_mb_type = B_L0_L0 + (i_partition16x8[0]>>2) * 3 + (i_partition16x8[1]>>2);
                analysis.i_cost_est16x8[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est16x8bi_total = analysis.i_cost_est16x8[0] + analysis.i_cost_est16x8[1];
                i_mb_type = B_L0_L0 + (i_partition8x16[0]>>2) * 3 + (i_partition8x16[1]>>2);
                analysis.i_cost_est8x16[1] += analysis.i_lambda * i_mb_b16x8_cost_table[i_mb_type];
                i_cost_est8x16bi_total = analysis.i_cost_est8x16[0] + analysis.i_cost_est8x16[1];

                /* We can gain a little speed by checking the mode with the lowest estimated cost first */
                int try_16x8_first = i_cost_est16x8bi_total < i_cost_est8x16bi_total;
                if( try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
                if( !analysis.b_early_terminate || i_cost_est8x16bi_total < i_cost )
                {
                    mb_analyse_inter_b8x16( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                }
                if( !try_16x8_first && (!analysis.b_early_terminate || i_cost_est16x8bi_total < i_cost) )
                {
                    mb_analyse_inter_b16x8( h, &analysis, i_cost );
                    COPY3_IF_LT( i_cost, analysis.i_cost16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                }
            }

            if( analysis.i_mbrd || !h->mb.i_subpel_refine )
            {
                /* refine later */
            }
            /* refine qpel */
            else if( i_partition == D_16x16 )
            {
                analysis.l0.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                analysis.l1.me16x16.cost -= analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                if( i_type == B_L0_L0 )
                {
                    x264_me_refine_qpel( h, &analysis.l0.me16x16 );
                    i_cost = analysis.l0.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L0_L0];
                }
                else if( i_type == B_L1_L1 )
                {
                    x264_me_refine_qpel( h, &analysis.l1.me16x16 );
                    i_cost = analysis.l1.me16x16.cost
                           + analysis.i_lambda * i_mb_b_cost_table[B_L1_L1];
                }
                else if( i_type == B_BI_BI )
                {
                    x264_me_refine_qpel( h, &analysis.l0.bi16x16 );
                    x264_me_refine_qpel( h, &analysis.l1.bi16x16 );
                }
            }
            else if( i_partition == D_16x8 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition16x8[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me16x8[i] );
                    if( analysis.i_mb_partition16x8[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me16x8[i] );
                }
            }
            else if( i_partition == D_8x16 )
            {
                for( int i = 0; i < 2; i++ )
                {
                    if( analysis.i_mb_partition8x16[i] != D_L1_8x8 )
                        x264_me_refine_qpel( h, &analysis.l0.me8x16[i] );
                    if( analysis.i_mb_partition8x16[i] != D_L0_8x8 )
                        x264_me_refine_qpel( h, &analysis.l1.me8x16[i] );
                }
            }
            else if( i_partition == D_8x8 )
            {
                for( int i = 0; i < 4; i++ )
                {
                    x264_me_t *m;
                    int i_part_cost_old;
                    int i_type_cost;
                    int i_part_type = h->mb.i_sub_partition[i];
                    int b_bidir = (i_part_type == D_BI_8x8);

                    if( i_part_type == D_DIRECT_8x8 )
                        continue;
                    if( x264_mb_partition_listX_table[0][i_part_type] )
                    {
                        m = &analysis.l0.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    if( x264_mb_partition_listX_table[1][i_part_type] )
                    {
                        m = &analysis.l1.me8x8[i];
                        i_part_cost_old = m->cost;
                        i_type_cost = analysis.i_lambda * i_sub_mb_b_cost_table[D_L1_8x8];
                        m->cost -= i_type_cost;
                        x264_me_refine_qpel( h, m );
                        if( !b_bidir )
                            analysis.i_cost8x8bi += m->cost + i_type_cost - i_part_cost_old;
                    }
                    /* TODO: update mvp? */
                }
            }

            i_satd_inter = i_cost;

            if( analysis.i_mbrd )
            {
                mb_analyse_b_rd( h, &analysis, i_satd_inter );
                i_type = B_SKIP;
                i_cost = i_bskip_cost;
                i_partition = D_16x16;
                COPY2_IF_LT( i_cost, analysis.l0.i_rd16x16, i_type, B_L0_L0 );
                COPY2_IF_LT( i_cost, analysis.l1.i_rd16x16, i_type, B_L1_L1 );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16bi, i_type, B_BI_BI );
                COPY2_IF_LT( i_cost, analysis.i_rd16x16direct, i_type, B_DIRECT );
                COPY3_IF_LT( i_cost, analysis.i_rd16x8bi, i_type, analysis.i_mb_type16x8, i_partition, D_16x8 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x16bi, i_type, analysis.i_mb_type8x16, i_partition, D_8x16 );
                COPY3_IF_LT( i_cost, analysis.i_rd8x8bi, i_type, B_8x8, i_partition, D_8x8 );
                h->mb.i_type = i_type;
                h->mb.i_partition = i_partition;
            }

            if( h->mb.b_chroma_me )
            {
                if( CHROMA444 )
                {
                    mb_analyse_intra( h, &analysis, i_satd_inter );
                    mb_analyse_intra_chroma( h, &analysis );
                }
                else
                {
                    mb_analyse_intra_chroma( h, &analysis );
                    mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
                }
                analysis.i_satd_i16x16 += analysis.i_satd_chroma;
                analysis.i_satd_i8x8   += analysis.i_satd_chroma;
                analysis.i_satd_i4x4   += analysis.i_satd_chroma;
            }
            else
                mb_analyse_intra( h, &analysis, i_satd_inter );

            if( analysis.i_mbrd )
            {
                mb_analyse_transform_rd( h, &analysis, &i_satd_inter, &i_cost );
                intra_rd( h, &analysis, i_satd_inter * 17/16 + 1 );
            }

            COPY2_IF_LT( i_cost, analysis.i_satd_i16x16, i_type, I_16x16 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i8x8, i_type, I_8x8 );
            COPY2_IF_LT( i_cost, analysis.i_satd_i4x4, i_type, I_4x4 );
            COPY2_IF_LT( i_cost, analysis.i_satd_pcm, i_type, I_PCM );

            h->mb.i_type = i_type;
            h->mb.i_partition = i_partition;

            if( analysis.i_mbrd >= 2 && IS_INTRA( i_type ) && i_type != I_PCM )
                intra_rd_refine( h, &analysis );
            if( h->mb.i_subpel_refine >= 5 )
                refine_bidir( h, &analysis );

            if( analysis.i_mbrd >= 2 && i_type > B_DIRECT && i_type < B_SKIP )
            {
                int i_biweight;
                analyse_update_cache( h, &analysis );

                if( i_partition == D_16x16 )
                {
                    if( i_type == B_L0_L0 )
                    {
                        analysis.l0.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l0.me16x16, analysis.i_lambda2, 0, 0 );
                    }
                    else if( i_type == B_L1_L1 )
                    {
                        analysis.l1.me16x16.cost = i_cost;
                        x264_me_refine_qpel_rd( h, &analysis.l1.me16x16, analysis.i_lambda2, 0, 1 );
                    }
                    else if( i_type == B_BI_BI )
                    {
                        i_biweight = h->mb.bipred_weight[analysis.l0.bi16x16.i_ref][analysis.l1.bi16x16.i_ref];
                        x264_me_refine_bidir_rd( h, &analysis.l0.bi16x16, &analysis.l1.bi16x16, i_biweight, 0, analysis.i_lambda2 );
                    }
                }
                else if( i_partition == D_16x8 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i*2] = h->mb.i_sub_partition[i*2+1] = analysis.i_mb_partition16x8[i];
                        if( analysis.i_mb_partition16x8[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me16x8[i], analysis.i_lambda2, i*8, 0 );
                        else if( analysis.i_mb_partition16x8[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me16x8[i], analysis.i_lambda2, i*8, 1 );
                        else if( analysis.i_mb_partition16x8[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me16x8[i].i_ref][analysis.l1.me16x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me16x8[i], &analysis.l1.me16x8[i], i_biweight, i*2, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x16 )
                {
                    for( int i = 0; i < 2; i++ )
                    {
                        h->mb.i_sub_partition[i] = h->mb.i_sub_partition[i+2] = analysis.i_mb_partition8x16[i];
                        if( analysis.i_mb_partition8x16[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x16[i], analysis.i_lambda2, i*4, 0 );
                        else if( analysis.i_mb_partition8x16[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x16[i], analysis.i_lambda2, i*4, 1 );
                        else if( analysis.i_mb_partition8x16[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x16[i].i_ref][analysis.l1.me8x16[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x16[i], &analysis.l1.me8x16[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
                else if( i_partition == D_8x8 )
                {
                    for( int i = 0; i < 4; i++ )
                    {
                        if( h->mb.i_sub_partition[i] == D_L0_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l0.me8x8[i], analysis.i_lambda2, i*4, 0 );
                        else if( h->mb.i_sub_partition[i] == D_L1_8x8 )
                            x264_me_refine_qpel_rd( h, &analysis.l1.me8x8[i], analysis.i_lambda2, i*4, 1 );
                        else if( h->mb.i_sub_partition[i] == D_BI_8x8 )
                        {
                            i_biweight = h->mb.bipred_weight[analysis.l0.me8x8[i].i_ref][analysis.l1.me8x8[i].i_ref];
                            x264_me_refine_bidir_rd( h, &analysis.l0.me8x8[i], &analysis.l1.me8x8[i], i_biweight, i, analysis.i_lambda2 );
                        }
                    }
                }
            }
        }
    }

    analyse_update_cache( h, &analysis );

    /* In rare cases we can end up qpel-RDing our way back to a larger partition size
     * without realizing it. Check for this and account for it if necessary. */
    if( analysis.i_mbrd >= 2 )
    {
        /* Don't bother with bipred or 8x8-and-below, the odds are incredibly low. */
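        /* (Designated initializers: every mb type not listed defaults to 0, so
         * check_mv_lists[type]-1 is -1, i.e. "don't check", for all other types.) */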
        static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
        int list = check_mv_lists[h->mb.i_type] - 1;
        if( list >= 0 && h->mb.i_partition != D_16x16 &&
            M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
            h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
            h->mb.i_partition = D_16x16;
    }

    if( !analysis.i_mbrd )
        mb_analyse_transform( h );

    if( analysis.i_mbrd == 3 && !IS_SKIP(h->mb.i_type) )
        mb_analyse_qp_rd( h, &analysis );

    h->mb.b_trellis = h->param.analyse.i_trellis;
    h->mb.b_noise_reduction = h->mb.b_noise_reduction || (!!h->param.analyse.i_noise_reduction && !IS_INTRA( h->mb.i_type ));

    if( !IS_SKIP(h->mb.i_type) && h->mb.i_psy_trellis && h->param.analyse.i_trellis == 1 )
        psy_trellis_init( h, 0 );
    if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
        h->mb.i_skip_intra = 0;
}

/*-------------------- Update MB from the analysis ----------------------*/
static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
{
    switch( h->mb.i_type )
    {
        case I_4x4:
            for( int i = 0; i < 16; i++ )
                h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = a->i_predict4x4[i];
            mb_analyse_intra_chroma( h, a );
            break;
        case I_8x8:
            for( int i = 0; i < 4; i++ )
                x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), a->i_predict8x8[i] );
            mb_analyse_intra_chroma( h, a );
            break;
        case I_16x16:
            h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
            mb_analyse_intra_chroma( h, a );
            break;
        case I_PCM:
            break;

        case P_L0:
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                    break;
                case D_16x8:
                    x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                    x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                    break;
                case D_8x16:
                    x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                    x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error P_L0 and partition=%d\n", h->mb.i_partition );
                    break;
            }
            break;

        case P_8x8:
            x264_macroblock_cache_ref( h, 0, 0, 2, 2, 0, a->l0.me8x8[0].i_ref );
            x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref );
            x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref );
            x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref );
            for( int i = 0; i < 4; i++ )
                mb_cache_mv_p8x8( h, a, i );
            break;

        case P_SKIP:
        {
            h->mb.i_partition = D_16x16;
            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
            break;
        }

        case B_SKIP:
        case B_DIRECT:
            h->mb.i_partition = h->mb.cache.direct_partition;
            mb_load_mv_direct8x8( h, 0 );
            mb_load_mv_direct8x8( h, 1 );
            mb_load_mv_direct8x8( h, 2 );
            mb_load_mv_direct8x8( h, 3 );
            break;

        case B_8x8:
            /* optimize: cache might not need to be rewritten */
            for( int i = 0; i < 4; i++ )
                mb_cache_mv_b8x8( h, a, i, 1 );
            break;

        default: /* the rest of the B types */
            switch( h->mb.i_partition )
            {
                case D_16x16:
                    switch( h->mb.i_type )
                    {
                        case B_L0_L0:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                            break;
                        case B_L1_L1:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
                            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.me16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                            break;
                        case B_BI_BI:
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.bi16x16.mv );
                            x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.bi16x16.i_ref );
                            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.bi16x16.mv );
                            break;
                    }
                    break;
                case D_16x8:
                    mb_cache_mv_b16x8( h, a, 0, 1 );
                    mb_cache_mv_b16x8( h, a, 1, 1 );
                    break;
                case D_8x16:
                    mb_cache_mv_b8x16( h, a, 0, 1 );
                    mb_cache_mv_b8x16( h, a, 1, 1 );
                    break;
                default:
                    x264_log( h, X264_LOG_ERROR, "internal error (invalid MB type)\n" );
                    break;
            }
    }

#ifndef NDEBUG
    if( h->i_thread_frames > 1 && !IS_INTRA(h->mb.i_type) )
    {
        for( int l = 0; l <= (h->sh.i_type == SLICE_TYPE_B); l++ )
        {
            int completed;
            int ref = h->mb.cache.ref[l][x264_scan8[0]];
            if( ref < 0 )
                continue;
            completed = h->fref[l][ ref >> MB_INTERLACED ]->orig->i_lines_completed;
            if( (h->mb.cache.mv[l][x264_scan8[15]][1] >> (2 - MB_INTERLACED)) + h->mb.i_mb_y*16 > completed )
            {
                x264_log( h, X264_LOG_WARNING, "internal error (MV out of thread range)\n");
                x264_log( h, X264_LOG_DEBUG, "mb type: %d \n", h->mb.i_type);
                x264_log( h, X264_LOG_DEBUG, "mv: l%dr%d (%d,%d) \n", l, ref,
                          h->mb.cache.mv[l][x264_scan8[15]][0],
                          h->mb.cache.mv[l][x264_scan8[15]][1] );
                x264_log( h, X264_LOG_DEBUG, "limit: %d \n", h->mb.mv_max_spel[1]);
                x264_log( h, X264_LOG_DEBUG, "mb_xy: %d,%d \n", h->mb.i_mb_x, h->mb.i_mb_y);
                x264_log( h, X264_LOG_DEBUG, "completed: %d \n", completed );
                x264_log( h, X264_LOG_WARNING, "recovering by using intra mode\n");
                mb_analyse_intra( h, a, COST_MAX );
                h->mb.i_type = I_16x16;
                h->mb.i_intra16x16_pred_mode = a->i_predict16x16;
                mb_analyse_intra_chroma( h, a );
            }
        }
    }
#endif
}

#include "slicetype.c"