mc-c.c 149 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542
055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943
294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327732783279328032813282328332843285328632873288328932903291329232933294329532963297329832993300330133023303330433053306330733083309331033113312331333143315331633173318331933203321332233233324332533263327332833293330333133323333333433353336333733383339334033413342334333443345334633473348334933503351335233533354335533563357335833593360336133623363336433653366336733683369337033713372337333743375337633773378337933803381338233833384338533863387338833893390339133923393339433953396339733983399340034013402340334043405340634073408340934103411341234133414341534163417341834193420342134223423342434253426342734283429343034313432343334343435343634373438343934403441344234433444344534463447344834493450345134523453345434553456345734583459346034613462346334643465346634673468346934703471347234733474347534763477347834793480348134823483348434853486348734883489349034913492349334943495349634973498349935003501350235033504350535063507350835093510351135123513351435153516351735183519352035213522352335243525352635273528352935303531353235333534353535363537353835393540354135423543354435453546354735483549355035513552355335543555355635573558355935603561356235633564356535663567356835693570357135723573357435753576357735783579358035813582358335843585358635873588358935903591359235933594359535963597359835993600360136023603360436053606360736083609361036113612361336143615361636173618361936203621362236233624362536263627362836293630363136323633363436353636363736383639364036413642364336443645364636473648364936503651365236533654365536563657365836593660366136623663366436653666366736683669367036713672367336743675367636773678367936803681368236833684368536863687368836893690369136923693369436953696
  1. /*****************************************************************************
  2. * mc-c.c: msa motion compensation
  3. *****************************************************************************
  4. * Copyright (C) 2015-2018 x264 project
  5. *
  6. * Authors: Neha Rana <neha.rana@imgtec.com>
  7. *
  8. * This program is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU General Public License as published by
  10. * the Free Software Foundation; either version 2 of the License, or
  11. * (at your option) any later version.
  12. *
  13. * This program is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. * GNU General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU General Public License
  19. * along with this program; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  21. *
  22. * This program is also available under a commercial proprietary license.
  23. * For more information, contact us at licensing@x264.com.
  24. *****************************************************************************/
  25. #include "common/common.h"
  26. #include "macros.h"
  27. #include "mc.h"
  28. #if !HIGH_BIT_DEPTH
  29. static const uint8_t pu_luma_mask_arr[16 * 8] =
  30. {
  31. /* 8 width cases */
  32. 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
  33. 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
  34. 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  35. /* 4 width cases */
  36. 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
  37. 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
  38. 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
  39. 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
  40. 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
  41. };
  42. static const uint8_t pu_chroma_mask_arr[16 * 5] =
  43. {
  44. 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  45. 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
  46. 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  47. 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
  48. 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
  49. };
  50. static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride,
  51. uint8_t *p_dst, int32_t i_dst_stride,
  52. int32_t i_height )
  53. {
  54. uint32_t u_loop_cnt, u_h4w;
  55. v16u8 dst0;
  56. v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  57. v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
  58. v16i8 mask0, mask1, mask2;
  59. v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
  60. v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
  61. v16i8 minus5b = __msa_ldi_b( -5 );
  62. v16i8 plus20b = __msa_ldi_b( 20 );
  63. u_h4w = i_height % 4;
  64. LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );
  65. for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
  66. {
  67. LD_SB2( p_src, 8, src0, src1 );
  68. p_src += i_src_stride;
  69. LD_SB2( p_src, 8, src2, src3 );
  70. p_src += i_src_stride;
  71. XORI_B4_128_SB( src0, src1, src2, src3 );
  72. VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
  73. VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 );
  74. VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
  75. VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 );
  76. VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
  77. VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 );
  78. HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 );
  79. DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
  80. minus5b, res0, res1, res2, res3 );
  81. DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
  82. plus20b, res0, res1, res2, res3 );
  83. LD_SB2( p_src, 8, src4, src5 );
  84. p_src += i_src_stride;
  85. LD_SB2( p_src, 8, src6, src7 );
  86. p_src += i_src_stride;
  87. XORI_B4_128_SB( src4, src5, src6, src7 );
  88. VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 );
  89. VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 );
  90. VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 );
  91. VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 );
  92. VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 );
  93. VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 );
  94. HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 );
  95. DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
  96. minus5b, res4, res5, res6, res7 );
  97. DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
  98. plus20b, res4, res5, res6, res7 );
  99. SRARI_H4_SH( res0, res1, res2, res3, 5 );
  100. SRARI_H4_SH( res4, res5, res6, res7, 5 );
  101. SAT_SH4_SH( res0, res1, res2, res3, 7 );
  102. SAT_SH4_SH( res4, res5, res6, res7, 7 );
  103. PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
  104. vec0, vec1, vec2, vec3 );
  105. XORI_B4_128_SB( vec0, vec1, vec2, vec3 );
  106. ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride );
  107. p_dst += ( 4 * i_dst_stride );
  108. }
  109. for( u_loop_cnt = u_h4w; u_loop_cnt--; )
  110. {
  111. LD_SB2( p_src, 8, src0, src1 );
  112. p_src += i_src_stride;
  113. XORI_B2_128_SB( src0, src1 );
  114. VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
  115. VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
  116. VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
  117. res0 = __msa_hadd_s_h( vec0, vec0 );
  118. DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 );
  119. res1 = __msa_hadd_s_h( vec3, vec3 );
  120. DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 );
  121. SRARI_H2_SH( res0, res1, 5 );
  122. SAT_SH2_SH( res0, res1, 7 );
  123. dst0 = PCKEV_XORI128_UB( res0, res1 );
  124. ST_UB( dst0, p_dst );
  125. p_dst += i_dst_stride;
  126. }
  127. }
  128. static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride,
  129. uint8_t *p_dst, int32_t i_dst_stride,
  130. int32_t i_height )
  131. {
  132. uint32_t u_loop_cnt, u_h4w;
  133. const int16_t i_filt_const0 = 0xfb01;
  134. const int16_t i_filt_const1 = 0x1414;
  135. const int16_t i_filt_const2 = 0x1fb;
  136. v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  137. v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
  138. v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
  139. v16i8 src65_l, src87_l;
  140. v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
  141. v16u8 res0, res1, res2, res3;
  142. v16i8 filt0, filt1, filt2;
  143. u_h4w = i_height % 4;
  144. filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 );
  145. filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 );
  146. filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 );
  147. LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
  148. p_src += ( 5 * i_src_stride );
  149. XORI_B5_128_SB( src0, src1, src2, src3, src4 );
  150. ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
  151. src10_r, src21_r, src32_r, src43_r );
  152. ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
  153. src10_l, src21_l, src32_l, src43_l );
  154. for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
  155. {
  156. LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 );
  157. p_src += ( 4 * i_src_stride );
  158. XORI_B4_128_SB( src5, src6, src7, src8 );
  159. ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
  160. src54_r, src65_r, src76_r, src87_r );
  161. ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
  162. src54_l, src65_l, src76_l, src87_l );
  163. out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
  164. filt0, filt1, filt2 );
  165. out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r,
  166. filt0, filt1, filt2 );
  167. out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r,
  168. filt0, filt1, filt2 );
  169. out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r,
  170. filt0, filt1, filt2 );
  171. out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
  172. filt0, filt1, filt2 );
  173. out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l,
  174. filt0, filt1, filt2 );
  175. out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l,
  176. filt0, filt1, filt2 );
  177. out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l,
  178. filt0, filt1, filt2 );
  179. SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 );
  180. SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 );
  181. SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 );
  182. SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 );
  183. PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
  184. out3_r, res0, res1, res2, res3 );
  185. XORI_B4_128_UB( res0, res1, res2, res3 );
  186. ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride );
  187. p_dst += ( 4 * i_dst_stride );
  188. src10_r = src54_r;
  189. src32_r = src76_r;
  190. src21_r = src65_r;
  191. src43_r = src87_r;
  192. src10_l = src54_l;
  193. src32_l = src76_l;
  194. src21_l = src65_l;
  195. src43_l = src87_l;
  196. src4 = src8;
  197. }
  198. for( u_loop_cnt = u_h4w; u_loop_cnt--; )
  199. {
  200. src5 = LD_SB( p_src );
  201. p_src += ( i_src_stride );
  202. src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 );
  203. ILVRL_B2_SB( src5, src4, src54_r, src54_l );
  204. out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
  205. filt0, filt1, filt2 );
  206. out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
  207. filt0, filt1, filt2 );
  208. SRARI_H2_SH( out0_r, out0_l, 5 );
  209. SAT_SH2_SH( out0_r, out0_l, 7 );
  210. out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r );
  211. res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 );
  212. ST_UB( res0, p_dst );
  213. p_dst += i_dst_stride;
  214. src10_r = src21_r;
  215. src21_r = src32_r;
  216. src32_r = src43_r;
  217. src43_r = src54_r;
  218. src10_l = src21_l;
  219. src21_l = src32_l;
  220. src32_l = src43_l;
  221. src43_l = src54_l;
  222. src4 = src5;
  223. }
  224. }
  225. static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride,
  226. uint8_t *p_dst, int32_t i_dst_stride,
  227. int32_t i_height )
  228. {
  229. uint32_t u_loop_cnt, u_h4w;
  230. uint64_t u_out0;
  231. v16i8 tmp0;
  232. v16i8 src0, src1, src2, src3, src4;
  233. v16i8 mask0, mask1, mask2;
  234. v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
  235. v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
  236. v8i16 dst0, dst1, dst2, dst3;
  237. v16u8 out0, out1;
  238. u_h4w = i_height % 4;
  239. LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );
  240. LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
  241. XORI_B5_128_SB( src0, src1, src2, src3, src4 );
  242. p_src += ( 5 * i_src_stride );
  243. hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
  244. hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
  245. hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
  246. hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
  247. hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 );
  248. for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
  249. {
  250. LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
  251. XORI_B4_128_SB( src0, src1, src2, src3 );
  252. p_src += ( 4 * i_src_stride );
  253. hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
  254. hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
  255. hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
  256. hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
  257. dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2,
  258. hz_out3, hz_out4, hz_out5 );
  259. dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3,
  260. hz_out4, hz_out5, hz_out6 );
  261. dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4,
  262. hz_out5, hz_out6, hz_out7 );
  263. dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5,
  264. hz_out6, hz_out7, hz_out8 );
  265. out0 = PCKEV_XORI128_UB( dst0, dst1 );
  266. out1 = PCKEV_XORI128_UB( dst2, dst3 );
  267. ST8x4_UB( out0, out1, p_dst, i_dst_stride );
  268. p_dst += ( 4 * i_dst_stride );
  269. hz_out3 = hz_out7;
  270. hz_out1 = hz_out5;
  271. hz_out5 = hz_out4;
  272. hz_out4 = hz_out8;
  273. hz_out2 = hz_out6;
  274. hz_out0 = hz_out5;
  275. }
  276. for( u_loop_cnt = u_h4w; u_loop_cnt--; )
  277. {
  278. src0 = LD_SB( p_src );
  279. p_src += i_src_stride;
  280. src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 );
  281. hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
  282. dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1,
  283. hz_out2, hz_out3,
  284. hz_out4, hz_out5 );
  285. tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) );
  286. tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 );
  287. u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
  288. SD( u_out0, p_dst );
  289. p_dst += i_dst_stride;
  290. hz_out0 = hz_out1;
  291. hz_out1 = hz_out2;
  292. hz_out2 = hz_out3;
  293. hz_out3 = hz_out4;
  294. hz_out4 = hz_out5;
  295. }
  296. }
  297. static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride,
  298. uint8_t *p_dst, int32_t i_dst_stride,
  299. int32_t i_height )
  300. {
  301. uint32_t u_multiple8_cnt;
  302. for( u_multiple8_cnt = 2; u_multiple8_cnt--; )
  303. {
  304. avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride,
  305. i_height );
  306. p_src += 8;
  307. p_dst += 8;
  308. }
  309. }
  310. static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
  311. int32_t i_src_stride,
  312. uint8_t *p_dst_u,
  313. uint8_t *p_dst_v,
  314. int32_t i_dst_stride,
  315. uint32_t u_coef_hor0,
  316. uint32_t u_coef_hor1,
  317. uint32_t u_coef_ver0,
  318. uint32_t u_coef_ver1 )
  319. {
  320. uint16_t u_out0, u_out1, u_out2, u_out3;
  321. v16u8 src0, src1, src2, src3, src4;
  322. v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
  323. v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
  324. v16i8 mask;
  325. v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
  326. v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
  327. v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
  328. v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
  329. v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
  330. v8i16 res0, res1;
  331. mask = LD_SB( &pu_chroma_mask_arr[16] );
  332. LD_UB3( p_src, i_src_stride, src0, src1, src2 );
  333. VSHF_B2_UB( src0, src1, src1, src2,
  334. ( mask + 1 ), ( mask + 1 ), src3, src4 );
  335. VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
  336. DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
  337. coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
  338. res_hz3 );
  339. MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
  340. coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  341. res_vt3 );
  342. ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
  343. SRARI_H2_UH( res_vt0, res_vt2, 6 );
  344. SAT_UH2_UH( res_vt0, res_vt2, 7 );
  345. PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
  346. u_out0 = __msa_copy_u_h( res0, 0 );
  347. u_out1 = __msa_copy_u_h( res0, 2 );
  348. u_out2 = __msa_copy_u_h( res1, 0 );
  349. u_out3 = __msa_copy_u_h( res1, 2 );
  350. SH( u_out0, p_dst_u );
  351. p_dst_u += i_dst_stride;
  352. SH( u_out1, p_dst_u );
  353. SH( u_out2, p_dst_v );
  354. p_dst_v += i_dst_stride;
  355. SH( u_out3, p_dst_v );
  356. }
  357. static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
  358. int32_t i_src_stride,
  359. uint8_t *p_dst_u,
  360. uint8_t *p_dst_v,
  361. int32_t i_dst_stride,
  362. uint32_t u_coef_hor0,
  363. uint32_t u_coef_hor1,
  364. uint32_t u_coef_ver0,
  365. uint32_t u_coef_ver1 )
  366. {
  367. uint16_t u_out0, u_out1, u_out2, u_out3;
  368. v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  369. v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
  370. v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
  371. v16i8 mask;
  372. v8i16 res0, res1;
  373. v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
  374. v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
  375. v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
  376. v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
  377. v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
  378. mask = LD_SB( &pu_chroma_mask_arr[16] );
  379. LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
  380. VSHF_B2_UB( src0, src1, src1, src2,
  381. ( mask + 1 ), ( mask + 1 ), src5, src6 );
  382. VSHF_B2_UB( src2, src3, src3, src4,
  383. ( mask + 1 ), ( mask + 1 ), src7, src8 );
  384. VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
  385. VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
  386. DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
  387. coeff_hz_vec, coeff_hz_vec, res_hz0,
  388. res_hz1, res_hz2, res_hz3 );
  389. MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
  390. coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  391. res_vt3 );
  392. ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
  393. SRARI_H2_UH( res_vt0, res_vt1, 6 );
  394. SAT_UH2_UH( res_vt0, res_vt1, 7 );
  395. PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
  396. u_out0 = __msa_copy_u_h( res0, 0 );
  397. u_out1 = __msa_copy_u_h( res0, 2 );
  398. u_out2 = __msa_copy_u_h( res1, 0 );
  399. u_out3 = __msa_copy_u_h( res1, 2 );
  400. SH( u_out0, p_dst_u );
  401. p_dst_u += i_dst_stride;
  402. SH( u_out1, p_dst_u );
  403. p_dst_u += i_dst_stride;
  404. SH( u_out2, p_dst_u );
  405. p_dst_u += i_dst_stride;
  406. SH( u_out3, p_dst_u );
  407. DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
  408. coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
  409. res_hz3 );
  410. MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
  411. coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  412. res_vt3 );
  413. ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
  414. SRARI_H2_UH( res_vt0, res_vt1, 6 );
  415. SAT_UH2_UH( res_vt0, res_vt1, 7 );
  416. PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
  417. u_out0 = __msa_copy_u_h( res0, 0 );
  418. u_out1 = __msa_copy_u_h( res0, 2 );
  419. u_out2 = __msa_copy_u_h( res1, 0 );
  420. u_out3 = __msa_copy_u_h( res1, 2 );
  421. SH( u_out0, p_dst_v );
  422. p_dst_v += i_dst_stride;
  423. SH( u_out1, p_dst_v );
  424. p_dst_v += i_dst_stride;
  425. SH( u_out2, p_dst_v );
  426. p_dst_v += i_dst_stride;
  427. SH( u_out3, p_dst_v );
  428. }
  429. static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
  430. int32_t i_src_stride,
  431. uint8_t *p_dst_u,
  432. uint8_t *p_dst_v,
  433. int32_t i_dst_stride,
  434. uint32_t u_coef_hor0,
  435. uint32_t u_coef_hor1,
  436. uint32_t u_coef_ver0,
  437. uint32_t u_coef_ver1,
  438. int32_t i_height )
  439. {
  440. if( 2 == i_height )
  441. {
  442. avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
  443. p_dst_u, p_dst_v, i_dst_stride,
  444. u_coef_hor0, u_coef_hor1,
  445. u_coef_ver0, u_coef_ver1 );
  446. }
  447. else if( 4 == i_height )
  448. {
  449. avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
  450. p_dst_u, p_dst_v, i_dst_stride,
  451. u_coef_hor0, u_coef_hor1,
  452. u_coef_ver0, u_coef_ver1 );
  453. }
  454. }
  455. static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
  456. int32_t i_src_stride,
  457. uint8_t *p_dst_u,
  458. uint8_t *p_dst_v,
  459. int32_t i_dst_stride,
  460. uint32_t u_coef_hor0,
  461. uint32_t u_coef_hor1,
  462. uint32_t u_coef_ver0,
  463. uint32_t u_coef_ver1 )
  464. {
  465. uint32_t u_out0, u_out1, u_out2, u_out3;
  466. v16u8 src0, src1, src2, src3, src4;
  467. v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
  468. v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
  469. v16i8 mask;
  470. v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
  471. v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
  472. v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
  473. v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
  474. v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
  475. v4i32 res0, res1;
  476. mask = LD_SB( &pu_chroma_mask_arr[16] );
  477. LD_UB3( p_src, i_src_stride, src0, src1, src2 );
  478. VSHF_B2_UB( src0, src1, src1, src2,
  479. ( mask + 1 ), ( mask + 1 ), src3, src4 );
  480. VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
  481. DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
  482. coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
  483. res_hz3 );
  484. MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
  485. coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  486. res_vt3 );
  487. ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
  488. SRARI_H2_UH( res_vt0, res_vt2, 6 );
  489. SAT_UH2_UH( res_vt0, res_vt2, 7 );
  490. PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
  491. u_out0 = __msa_copy_u_w( res0, 0 );
  492. u_out1 = __msa_copy_u_w( res0, 1 );
  493. u_out2 = __msa_copy_u_w( res1, 0 );
  494. u_out3 = __msa_copy_u_w( res1, 1 );
  495. SW( u_out0, p_dst_u );
  496. p_dst_u += i_dst_stride;
  497. SW( u_out1, p_dst_u );
  498. SW( u_out2, p_dst_v );
  499. p_dst_v += i_dst_stride;
  500. SW( u_out3, p_dst_v );
  501. }
  502. static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
  503. int32_t i_src_stride,
  504. uint8_t *p_dst_u,
  505. uint8_t *p_dst_v,
  506. int32_t i_dst_stride,
  507. uint32_t u_coef_hor0,
  508. uint32_t u_coef_hor1,
  509. uint32_t u_coef_ver0,
  510. uint32_t u_coef_ver1,
  511. int32_t i_height )
  512. {
  513. uint32_t u_row;
  514. v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  515. v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
  516. v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
  517. v16i8 mask;
  518. v4i32 res0, res1;
  519. v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
  520. v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
  521. v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
  522. v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
  523. v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
  524. mask = LD_SB( &pu_chroma_mask_arr[16] );
  525. src0 = LD_UB( p_src );
  526. p_src += i_src_stride;
  527. for( u_row = ( i_height >> 2 ); u_row--; )
  528. {
  529. LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
  530. p_src += ( 4 * i_src_stride );
  531. VSHF_B2_UB( src0, src1, src1, src2,
  532. ( mask + 1 ), ( mask + 1 ), src5, src6 );
  533. VSHF_B2_UB( src2, src3, src3, src4,
  534. ( mask + 1 ), ( mask + 1 ), src7, src8 );
  535. VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
  536. VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
  537. DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
  538. coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
  539. res_hz3 );
  540. MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
  541. coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  542. res_vt3 );
  543. ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
  544. SRARI_H2_UH( res_vt0, res_vt1, 6 );
  545. SAT_UH2_UH( res_vt0, res_vt1, 7 );
  546. PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
  547. ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
  548. p_dst_u += ( 4 * i_dst_stride );
  549. DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
  550. coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
  551. res_hz3 );
  552. MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
  553. coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  554. res_vt3 );
  555. ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
  556. SRARI_H2_UH( res_vt0, res_vt1, 6 );
  557. SAT_UH2_UH( res_vt0, res_vt1, 7 );
  558. PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
  559. ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
  560. p_dst_v += ( 4 * i_dst_stride );
  561. src0 = src4;
  562. }
  563. }
  564. static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
  565. int32_t i_src_stride,
  566. uint8_t *p_dst_u,
  567. uint8_t *p_dst_v,
  568. int32_t i_dst_stride,
  569. uint32_t u_coef_hor0,
  570. uint32_t u_coef_hor1,
  571. uint32_t u_coef_ver0,
  572. uint32_t u_coef_ver1,
  573. int32_t i_height )
  574. {
  575. if( 2 == i_height )
  576. {
  577. avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
  578. p_dst_u, p_dst_v, i_dst_stride,
  579. u_coef_hor0, u_coef_hor1,
  580. u_coef_ver0, u_coef_ver1 );
  581. }
  582. else
  583. {
  584. avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
  585. p_dst_u, p_dst_v, i_dst_stride,
  586. u_coef_hor0, u_coef_hor1,
  587. u_coef_ver0, u_coef_ver1,
  588. i_height );
  589. }
  590. }
  591. static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
  592. int32_t i_src_stride,
  593. uint8_t *p_dst_u,
  594. uint8_t *p_dst_v,
  595. int32_t i_dst_stride,
  596. uint32_t u_coef_hor0,
  597. uint32_t u_coef_hor1,
  598. uint32_t u_coef_ver0,
  599. uint32_t u_coef_ver1,
  600. int32_t i_height )
  601. {
  602. uint32_t u_row;
  603. v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  604. v16u8 src10, src11, src12, src13, src14;
  605. v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
  606. v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
  607. v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
  608. v16i8 coeff_hz_vec0, coeff_hz_vec1;
  609. v16i8 tmp0, tmp1;
  610. v16u8 coeff_hz_vec;
  611. v8u16 coeff_vt_vec0, coeff_vt_vec1;
  612. coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
  613. coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
  614. coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
  615. coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
  616. coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
  617. LD_UB2( p_src, 16, src0, src13 );
  618. p_src += i_src_stride;
  619. VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
  620. DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );
  621. for( u_row = ( i_height >> 2 ); u_row--; )
  622. {
  623. LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
  624. LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
  625. p_src += ( 4 * i_src_stride );
  626. VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
  627. VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
  628. DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
  629. coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
  630. res_hz4 );
  631. MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
  632. coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  633. res_vt3 );
  634. res_vt0 += ( res_hz0 * coeff_vt_vec1 );
  635. res_vt1 += ( res_hz1 * coeff_vt_vec1 );
  636. res_vt2 += ( res_hz2 * coeff_vt_vec1 );
  637. res_vt3 += ( res_hz3 * coeff_vt_vec1 );
  638. SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
  639. SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
  640. PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
  641. ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
  642. p_dst_u += ( 4 * i_dst_stride );
  643. res_hz0 = res_hz4;
  644. VSHF_B2_UB( src1, src5, src2, src6,
  645. ( mask + 1 ), ( mask + 1 ), src5, src6 );
  646. VSHF_B2_UB( src3, src7, src4, src8,
  647. ( mask + 1 ), ( mask + 1 ), src7, src8 );
  648. DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
  649. coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
  650. res_hz4 );
  651. MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
  652. coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
  653. res_vt3 );
  654. res_vt0 += ( res_hz5 * coeff_vt_vec1 );
  655. res_vt1 += ( res_hz1 * coeff_vt_vec1 );
  656. res_vt2 += ( res_hz2 * coeff_vt_vec1 );
  657. res_vt3 += ( res_hz3 * coeff_vt_vec1 );
  658. SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
  659. SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
  660. PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
  661. ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
  662. p_dst_v += ( 4 * i_dst_stride );
  663. res_hz5 = res_hz4;
  664. }
  665. }
  666. static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
  667. uint8_t *p_dst, int32_t i_dst_stride,
  668. int32_t i_log2_denom, int32_t i_weight,
  669. int32_t i_offset_in )
  670. {
  671. uint32_t u_load0, u_load1, u_out0, u_out1;
  672. v16u8 zero = { 0 };
  673. v16u8 src0, src1;
  674. v4i32 dst0, dst1;
  675. v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
  676. v8i16 vec0, vec1;
  677. i_offset_in <<= ( i_log2_denom );
  678. if( i_log2_denom )
  679. {
  680. i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
  681. }
  682. wgt = ( v8u16 ) __msa_fill_h( i_weight );
  683. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  684. denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
  685. u_load0 = LW( p_src );
  686. p_src += i_src_stride;
  687. u_load1 = LW( p_src );
  688. src0 = ( v16u8 ) __msa_fill_w( u_load0 );
  689. src1 = ( v16u8 ) __msa_fill_w( u_load1 );
  690. ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
  691. MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
  692. ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
  693. MAXI_SH2_SH( vec0, vec1, 0 );
  694. tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
  695. tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );
  696. SAT_UH2_UH( tp0, tp1, 7 );
  697. PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );
  698. u_out0 = __msa_copy_u_w( dst0, 0 );
  699. u_out1 = __msa_copy_u_w( dst1, 0 );
  700. SW( u_out0, p_dst );
  701. p_dst += i_dst_stride;
  702. SW( u_out1, p_dst );
  703. }
  704. static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
  705. int32_t i_src_stride,
  706. uint8_t *p_dst,
  707. int32_t i_dst_stride,
  708. int32_t i_height,
  709. int32_t i_log2_denom,
  710. int32_t i_weight,
  711. int32_t i_offset_in )
  712. {
  713. uint8_t u_cnt;
  714. uint32_t u_load0, u_load1, u_load2, u_load3;
  715. v16u8 zero = { 0 };
  716. v16u8 src0, src1, src2, src3;
  717. v8u16 temp0, temp1, temp2, temp3;
  718. v8u16 wgt, denom, offset;
  719. i_offset_in <<= ( i_log2_denom );
  720. if( i_log2_denom )
  721. {
  722. i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
  723. }
  724. wgt = ( v8u16 ) __msa_fill_h( i_weight );
  725. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  726. denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
  727. for( u_cnt = i_height / 4; u_cnt--; )
  728. {
  729. LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
  730. p_src += 4 * i_src_stride;
  731. src0 = ( v16u8 ) __msa_fill_w( u_load0 );
  732. src1 = ( v16u8 ) __msa_fill_w( u_load1 );
  733. src2 = ( v16u8 ) __msa_fill_w( u_load2 );
  734. src3 = ( v16u8 ) __msa_fill_w( u_load3 );
  735. ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
  736. temp0, temp1, temp2, temp3 );
  737. MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
  738. temp0, temp1, temp2, temp3 );
  739. ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
  740. temp0, temp1, temp2, temp3 );
  741. MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
  742. SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
  743. SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
  744. PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
  745. p_dst += ( 4 * i_dst_stride );
  746. }
  747. }
  748. static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
  749. uint8_t *p_dst, int32_t i_dst_stride,
  750. int32_t i_height, int32_t i_log2_denom,
  751. int32_t i_weight, int32_t i_offset_in )
  752. {
  753. if( 2 == i_height )
  754. {
  755. avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
  756. i_log2_denom, i_weight, i_offset_in );
  757. }
  758. else
  759. {
  760. avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
  761. p_dst, i_dst_stride,
  762. i_height, i_log2_denom,
  763. i_weight, i_offset_in );
  764. }
  765. }
  766. static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
  767. uint8_t *p_dst, int32_t i_dst_stride,
  768. int32_t i_height, int32_t i_log2_denom,
  769. int32_t i_weight, int32_t i_offset_in )
  770. {
  771. uint8_t u_cnt;
  772. v16u8 zero = { 0 };
  773. v16u8 src0, src1, src2, src3;
  774. v8u16 temp0, temp1, temp2, temp3;
  775. v8u16 wgt, denom, offset;
  776. v16i8 out0, out1;
  777. i_offset_in <<= ( i_log2_denom );
  778. if( i_log2_denom )
  779. {
  780. i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
  781. }
  782. wgt = ( v8u16 ) __msa_fill_h( i_weight );
  783. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  784. denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
  785. for( u_cnt = i_height / 4; u_cnt--; )
  786. {
  787. LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
  788. p_src += 4 * i_src_stride;
  789. ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
  790. temp0, temp1, temp2, temp3 );
  791. MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
  792. temp0, temp1, temp2, temp3 );
  793. ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
  794. temp0, temp1, temp2, temp3 );
  795. MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
  796. SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
  797. SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
  798. PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
  799. ST8x4_UB( out0, out1, p_dst, i_dst_stride );
  800. p_dst += ( 4 * i_dst_stride );
  801. }
  802. }
  803. static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
  804. uint8_t *p_dst, int32_t i_dst_stride,
  805. int32_t i_height, int32_t i_log2_denom,
  806. int32_t i_weight, int32_t i_offset_in )
  807. {
  808. uint8_t u_cnt;
  809. v16i8 zero = { 0 };
  810. v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
  811. v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  812. v8u16 wgt, denom, offset;
  813. i_offset_in <<= ( i_log2_denom );
  814. if( i_log2_denom )
  815. {
  816. i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
  817. }
  818. wgt = ( v8u16 ) __msa_fill_h( i_weight );
  819. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  820. denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
  821. for( u_cnt = i_height / 4; u_cnt--; )
  822. {
  823. LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
  824. p_src += 4 * i_src_stride;
  825. ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
  826. temp0, temp2, temp4, temp6 );
  827. ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
  828. temp1, temp3, temp5, temp7 );
  829. MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
  830. temp0, temp1, temp2, temp3 );
  831. MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
  832. temp4, temp5, temp6, temp7 );
  833. ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
  834. temp0, temp1, temp2, temp3 );
  835. ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
  836. temp4, temp5, temp6, temp7 );
  837. MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
  838. MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
  839. SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
  840. SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
  841. SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
  842. SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
  843. PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
  844. dst0, dst1, dst2, dst3 );
  845. ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
  846. p_dst += 4 * i_dst_stride;
  847. }
  848. }
  849. static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
  850. int32_t i_src1_stride,
  851. uint8_t *p_src2_in,
  852. int32_t i_src2_stride,
  853. uint8_t *p_dst,
  854. int32_t i_dst_stride,
  855. int32_t i_log2_denom,
  856. int32_t i_src1_weight,
  857. int32_t i_src2_weight,
  858. int32_t i_offset_in )
  859. {
  860. uint32_t u_load0, u_load1, u_out0, u_out1;
  861. v8i16 src1_wgt, src2_wgt;
  862. v16u8 in0, in1, in2, in3;
  863. v8i16 temp0, temp1, temp2, temp3;
  864. v16i8 zero = { 0 };
  865. v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
  866. src1_wgt = __msa_fill_h( i_src1_weight );
  867. src2_wgt = __msa_fill_h( i_src2_weight );
  868. u_load0 = LW( p_src1_in );
  869. u_load1 = LW( p_src1_in + i_src1_stride );
  870. in0 = ( v16u8 ) __msa_fill_w( u_load0 );
  871. in1 = ( v16u8 ) __msa_fill_w( u_load1 );
  872. u_load0 = LW( p_src2_in );
  873. u_load1 = LW( p_src2_in + i_src2_stride );
  874. in2 = ( v16u8 ) __msa_fill_w( u_load0 );
  875. in3 = ( v16u8 ) __msa_fill_w( u_load1 );
  876. ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
  877. temp0, temp1, temp2, temp3 );
  878. temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
  879. temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
  880. SRAR_H2_SH( temp0, temp1, denom );
  881. CLIP_SH2_0_255( temp0, temp1 );
  882. PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
  883. u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
  884. u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
  885. SW( u_out0, p_dst );
  886. p_dst += i_dst_stride;
  887. SW( u_out1, p_dst );
  888. }
  889. static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
  890. int32_t i_src1_stride,
  891. uint8_t *p_src2_in,
  892. int32_t i_src2_stride,
  893. uint8_t *p_dst,
  894. int32_t i_dst_stride,
  895. int32_t i_height,
  896. int32_t i_log2_denom,
  897. int32_t i_src1_weight,
  898. int32_t i_src2_weight,
  899. int32_t i_offset_in )
  900. {
  901. uint8_t u_cnt;
  902. uint32_t u_load0, u_load1, u_load2, u_load3;
  903. v8i16 src1_wgt, src2_wgt;
  904. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  905. v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  906. v16i8 zero = { 0 };
  907. v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
  908. src1_wgt = __msa_fill_h( i_src1_weight );
  909. src2_wgt = __msa_fill_h( i_src2_weight );
  910. for( u_cnt = i_height / 4; u_cnt--; )
  911. {
  912. LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
  913. p_src1_in += ( 4 * i_src1_stride );
  914. src0 = ( v16u8 ) __msa_fill_w( u_load0 );
  915. src1 = ( v16u8 ) __msa_fill_w( u_load1 );
  916. src2 = ( v16u8 ) __msa_fill_w( u_load2 );
  917. src3 = ( v16u8 ) __msa_fill_w( u_load3 );
  918. LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
  919. p_src2_in += ( 4 * i_src2_stride );
  920. src4 = ( v16u8 ) __msa_fill_w( u_load0 );
  921. src5 = ( v16u8 ) __msa_fill_w( u_load1 );
  922. src6 = ( v16u8 ) __msa_fill_w( u_load2 );
  923. src7 = ( v16u8 ) __msa_fill_w( u_load3 );
  924. ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
  925. temp0, temp1, temp2, temp3 );
  926. ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
  927. temp4, temp5, temp6, temp7 );
  928. temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
  929. temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
  930. temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
  931. temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
  932. SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
  933. CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
  934. PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
  935. p_dst += ( 4 * i_dst_stride );
  936. }
  937. }
  938. static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
  939. int32_t i_src1_stride,
  940. uint8_t *p_src2_in,
  941. int32_t i_src2_stride,
  942. uint8_t *p_dst,
  943. int32_t i_dst_stride,
  944. int32_t i_height,
  945. int32_t i_log2_denom,
  946. int32_t i_src1_weight,
  947. int32_t i_src2_weight,
  948. int32_t i_offset_in )
  949. {
  950. if( 2 == i_height )
  951. {
  952. avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
  953. p_src2_in, i_src2_stride,
  954. p_dst, i_dst_stride,
  955. i_log2_denom, i_src1_weight,
  956. i_src2_weight, i_offset_in );
  957. }
  958. else
  959. {
  960. avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
  961. p_src2_in, i_src2_stride,
  962. p_dst, i_dst_stride,
  963. i_height, i_log2_denom,
  964. i_src1_weight, i_src2_weight,
  965. i_offset_in );
  966. }
  967. }
  968. static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
  969. int32_t i_src1_stride,
  970. uint8_t *p_src2_in,
  971. int32_t i_src2_stride,
  972. uint8_t *p_dst,
  973. int32_t i_dst_stride,
  974. int32_t i_height,
  975. int32_t i_log2_denom,
  976. int32_t i_src1_weight,
  977. int32_t i_src2_weight,
  978. int32_t i_offset_in )
  979. {
  980. uint8_t u_cnt;
  981. v8i16 src1_wgt, src2_wgt;
  982. v16u8 src0, src1, src2, src3;
  983. v16u8 dst0, dst1, dst2, dst3;
  984. v8i16 temp0, temp1, temp2, temp3;
  985. v8i16 res0, res1, res2, res3;
  986. v16i8 zero = { 0 };
  987. v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
  988. src1_wgt = __msa_fill_h( i_src1_weight );
  989. src2_wgt = __msa_fill_h( i_src2_weight );
  990. for( u_cnt = i_height / 4; u_cnt--; )
  991. {
  992. LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
  993. p_src1_in += ( 4 * i_src1_stride );
  994. LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
  995. p_src2_in += ( 4 * i_src2_stride );
  996. ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
  997. temp0, temp1, temp2, temp3 );
  998. ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
  999. res0, res1, res2, res3 );
  1000. res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
  1001. res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
  1002. res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
  1003. res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
  1004. SRAR_H4_SH( res0, res1, res2, res3, denom );
  1005. CLIP_SH4_0_255( res0, res1, res2, res3 );
  1006. PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
  1007. dst0, dst1, dst2, dst3 );
  1008. ST8x1_UB( dst0, p_dst );
  1009. p_dst += i_dst_stride;
  1010. ST8x1_UB( dst1, p_dst );
  1011. p_dst += i_dst_stride;
  1012. ST8x1_UB( dst2, p_dst );
  1013. p_dst += i_dst_stride;
  1014. ST8x1_UB( dst3, p_dst );
  1015. p_dst += i_dst_stride;
  1016. }
  1017. }
  1018. static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
  1019. int32_t i_src1_stride,
  1020. uint8_t *p_src2_in,
  1021. int32_t i_src2_stride,
  1022. uint8_t *p_dst,
  1023. int32_t i_dst_stride,
  1024. int32_t i_height,
  1025. int32_t i_log2_denom,
  1026. int32_t i_src1_weight,
  1027. int32_t i_src2_weight,
  1028. int32_t i_offset_in )
  1029. {
  1030. uint8_t u_cnt;
  1031. v8i16 src1_wgt, src2_wgt;
  1032. v16u8 src0, src1, src2, src3;
  1033. v16u8 dst0, dst1, dst2, dst3;
  1034. v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  1035. v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
  1036. v16i8 zero = { 0 };
  1037. v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
  1038. src1_wgt = __msa_fill_h( i_src1_weight );
  1039. src2_wgt = __msa_fill_h( i_src2_weight );
  1040. for( u_cnt = i_height / 4; u_cnt--; )
  1041. {
  1042. LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
  1043. p_src1_in += ( 4 * i_src1_stride );
  1044. LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
  1045. p_src2_in += ( 4 * i_src2_stride );
  1046. ILVRL_B2_SH( zero, src0, temp1, temp0 );
  1047. ILVRL_B2_SH( zero, src1, temp3, temp2 );
  1048. ILVRL_B2_SH( zero, src2, temp5, temp4 );
  1049. ILVRL_B2_SH( zero, src3, temp7, temp6 );
  1050. ILVRL_B2_SH( zero, dst0, res1, res0 );
  1051. ILVRL_B2_SH( zero, dst1, res3, res2 );
  1052. ILVRL_B2_SH( zero, dst2, res5, res4 );
  1053. ILVRL_B2_SH( zero, dst3, res7, res6 );
  1054. res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
  1055. res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
  1056. res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
  1057. res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
  1058. res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
  1059. res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
  1060. res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
  1061. res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
  1062. SRAR_H4_SH( res0, res1, res2, res3, denom );
  1063. SRAR_H4_SH( res4, res5, res6, res7, denom );
  1064. CLIP_SH4_0_255( res0, res1, res2, res3 );
  1065. CLIP_SH4_0_255( res4, res5, res6, res7 );
  1066. PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
  1067. dst0, dst1, dst2, dst3 );
  1068. ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
  1069. p_dst += 4 * i_dst_stride;
  1070. }
  1071. }
  1072. static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
  1073. int32_t i_src1_stride,
  1074. uint8_t *p_src2_in,
  1075. int32_t i_src2_stride,
  1076. uint8_t *p_dst, int32_t i_dst_stride,
  1077. int32_t i_log2_denom,
  1078. int32_t i_src1_weight,
  1079. int32_t i_src2_weight,
  1080. int32_t i_offset_in )
  1081. {
  1082. uint32_t u_load0, u_load1, u_out0, u_out1;
  1083. v16u8 src1_wgt, src2_wgt, wgt;
  1084. v16i8 in0, in1, in2, in3;
  1085. v8u16 temp0, temp1, denom, offset;
  1086. i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
  1087. src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
  1088. src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
  1089. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  1090. denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
  1091. wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
  1092. u_load0 = LW( p_src1_in );
  1093. u_load1 = LW( p_src1_in + i_src1_stride );
  1094. in0 = ( v16i8 ) __msa_fill_w( u_load0 );
  1095. in1 = ( v16i8 ) __msa_fill_w( u_load1 );
  1096. u_load0 = LW( p_src2_in );
  1097. u_load1 = LW( p_src2_in + i_src2_stride );
  1098. in2 = ( v16i8 ) __msa_fill_w( u_load0 );
  1099. in3 = ( v16i8 ) __msa_fill_w( u_load1 );
  1100. ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );
  1101. temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
  1102. temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );
  1103. temp0 >>= denom;
  1104. temp1 >>= denom;
  1105. MAXI_SH2_UH( temp0, temp1, 0 );
  1106. SAT_UH2_UH( temp0, temp1, 7 );
  1107. PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );
  1108. u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
  1109. u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
  1110. SW( u_out0, p_dst );
  1111. p_dst += i_dst_stride;
  1112. SW( u_out1, p_dst );
  1113. }
  1114. static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
  1115. int32_t i_src1_stride,
  1116. uint8_t *p_src2_in,
  1117. int32_t i_src2_stride,
  1118. uint8_t *p_dst,
  1119. int32_t i_dst_stride,
  1120. int32_t i_height,
  1121. int32_t i_log2_denom,
  1122. int32_t i_src1_weight,
  1123. int32_t i_src2_weight,
  1124. int32_t i_offset_in )
  1125. {
  1126. uint8_t u_cnt;
  1127. uint32_t u_load0, u_load1, u_load2, u_load3;
  1128. v16u8 src1_wgt, src2_wgt, wgt;
  1129. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1130. v16u8 temp0, temp1, temp2, temp3;
  1131. v8u16 res0, res1, res2, res3;
  1132. v8u16 denom, offset;
  1133. i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
  1134. src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
  1135. src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
  1136. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  1137. denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
  1138. wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
  1139. for( u_cnt = i_height / 4; u_cnt--; )
  1140. {
  1141. LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
  1142. p_src1_in += ( 4 * i_src1_stride );
  1143. src0 = ( v16u8 ) __msa_fill_w( u_load0 );
  1144. src1 = ( v16u8 ) __msa_fill_w( u_load1 );
  1145. src2 = ( v16u8 ) __msa_fill_w( u_load2 );
  1146. src3 = ( v16u8 ) __msa_fill_w( u_load3 );
  1147. LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
  1148. p_src2_in += ( 4 * i_src2_stride );
  1149. src4 = ( v16u8 ) __msa_fill_w( u_load0 );
  1150. src5 = ( v16u8 ) __msa_fill_w( u_load1 );
  1151. src6 = ( v16u8 ) __msa_fill_w( u_load2 );
  1152. src7 = ( v16u8 ) __msa_fill_w( u_load3 );
  1153. ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1154. temp0, temp1, temp2, temp3 );
  1155. DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
  1156. res0, res1, res2, res3 );
  1157. ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
  1158. res0, res1, res2, res3 );
  1159. SRA_4V( res0, res1, res2, res3, denom );
  1160. MAXI_SH4_UH( res0, res1, res2, res3, 0 );
  1161. SAT_UH4_UH( res0, res1, res2, res3, 7 );
  1162. PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
  1163. p_dst += ( 4 * i_dst_stride );
  1164. }
  1165. }
  1166. static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
  1167. int32_t i_src1_stride,
  1168. uint8_t *p_src2_in,
  1169. int32_t i_src2_stride,
  1170. uint8_t *p_dst,
  1171. int32_t i_dst_stride,
  1172. int32_t i_height,
  1173. int32_t i_log2_denom,
  1174. int32_t i_src1_weight,
  1175. int32_t i_src2_weight,
  1176. int32_t i_offset_in )
  1177. {
  1178. if( 2 == i_height )
  1179. {
  1180. avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
  1181. p_src2_in, i_src2_stride,
  1182. p_dst, i_dst_stride,
  1183. i_log2_denom, i_src1_weight,
  1184. i_src2_weight, i_offset_in );
  1185. }
  1186. else
  1187. {
  1188. avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
  1189. p_src2_in, i_src2_stride,
  1190. p_dst, i_dst_stride,
  1191. i_height, i_log2_denom,
  1192. i_src1_weight,
  1193. i_src2_weight, i_offset_in );
  1194. }
  1195. }
  1196. static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
  1197. int32_t i_src1_stride,
  1198. uint8_t *p_src2_in,
  1199. int32_t i_src2_stride,
  1200. uint8_t *p_dst,
  1201. int32_t i_dst_stride,
  1202. int32_t i_height,
  1203. int32_t i_log2_denom,
  1204. int32_t i_src1_weight,
  1205. int32_t i_src2_weight,
  1206. int32_t i_offset_in )
  1207. {
  1208. uint8_t u_cnt;
  1209. v16u8 src1_wgt, src2_wgt, wgt;
  1210. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1211. v16u8 temp0, temp1, temp2, temp3;
  1212. v8u16 res0, res1, res2, res3;
  1213. v8u16 denom, offset;
  1214. v16i8 out0, out1;
  1215. i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
  1216. src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
  1217. src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
  1218. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  1219. denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
  1220. wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
  1221. for( u_cnt = i_height / 4; u_cnt--; )
  1222. {
  1223. LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
  1224. p_src1_in += ( 4 * i_src1_stride );
  1225. LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
  1226. p_src2_in += ( 4 * i_src2_stride );
  1227. ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1228. temp0, temp1, temp2, temp3 );
  1229. DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
  1230. res0, res1, res2, res3 );
  1231. ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
  1232. res0, res1, res2, res3 );
  1233. SRA_4V( res0, res1, res2, res3, denom );
  1234. MAXI_SH4_UH( res0, res1, res2, res3, 0 );
  1235. SAT_UH4_UH( res0, res1, res2, res3, 7 );
  1236. PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 );
  1237. ST8x4_UB( out0, out1, p_dst, i_dst_stride );
  1238. p_dst += 4 * i_dst_stride;
  1239. }
  1240. }
  1241. static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
  1242. int32_t i_src1_stride,
  1243. uint8_t *p_src2_in,
  1244. int32_t i_src2_stride,
  1245. uint8_t *p_dst,
  1246. int32_t i_dst_stride,
  1247. int32_t i_height,
  1248. int32_t i_log2_denom,
  1249. int32_t i_src1_weight,
  1250. int32_t i_src2_weight,
  1251. int32_t i_offset_in )
  1252. {
  1253. uint8_t u_cnt;
  1254. v16u8 src1_wgt, src2_wgt, wgt;
  1255. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1256. v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  1257. v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
  1258. v8u16 denom, offset;
  1259. i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
  1260. src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
  1261. src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
  1262. offset = ( v8u16 ) __msa_fill_h( i_offset_in );
  1263. denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
  1264. wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
  1265. for( u_cnt = i_height / 4; u_cnt--; )
  1266. {
  1267. LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
  1268. p_src1_in += ( 4 * i_src1_stride );
  1269. LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
  1270. p_src2_in += ( 4 * i_src2_stride );
  1271. ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1272. temp0, temp2, temp4, temp6 );
  1273. ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1274. temp1, temp3, temp5, temp7 );
  1275. DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
  1276. res0, res1, res2, res3 );
  1277. ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
  1278. res0, res1, res2, res3 );
  1279. DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
  1280. res4, res5, res6, res7 );
  1281. ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
  1282. res4, res5, res6, res7 );
  1283. SRA_4V( res0, res1, res2, res3, denom );
  1284. SRA_4V( res4, res5, res6, res7, denom );
  1285. MAXI_SH4_UH( res0, res1, res2, res3, 0 );
  1286. MAXI_SH4_UH( res4, res5, res6, res7, 0 );
  1287. SAT_UH4_UH( res0, res1, res2, res3, 7 );
  1288. SAT_UH4_UH( res4, res5, res6, res7, 7 );
  1289. PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
  1290. temp0, temp1, temp2, temp3 );
  1291. ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
  1292. p_dst += 4 * i_dst_stride;
  1293. }
  1294. }
  1295. static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
  1296. uint8_t *p_dst, int32_t i_dst_stride,
  1297. int32_t i_height )
  1298. {
  1299. int32_t i_cnt;
  1300. uint32_t u_src0, u_src1;
  1301. for( i_cnt = ( i_height / 2 ); i_cnt--; )
  1302. {
  1303. u_src0 = LW( p_src );
  1304. p_src += i_src_stride;
  1305. u_src1 = LW( p_src );
  1306. p_src += i_src_stride;
  1307. SW( u_src0, p_dst );
  1308. p_dst += i_dst_stride;
  1309. SW( u_src1, p_dst );
  1310. p_dst += i_dst_stride;
  1311. }
  1312. }
  1313. static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
  1314. uint8_t *p_dst, int32_t i_dst_stride,
  1315. int32_t i_height )
  1316. {
  1317. int32_t i_cnt;
  1318. uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
  1319. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1320. if( 0 == i_height % 12 )
  1321. {
  1322. for( i_cnt = ( i_height / 12 ); i_cnt--; )
  1323. {
  1324. LD_UB8( p_src, i_src_stride,
  1325. src0, src1, src2, src3, src4, src5, src6, src7 );
  1326. p_src += ( 8 * i_src_stride );
  1327. u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
  1328. u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
  1329. u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
  1330. u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
  1331. u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
  1332. u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
  1333. u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
  1334. u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
  1335. SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
  1336. p_dst += ( 4 * i_dst_stride );
  1337. SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
  1338. p_dst += ( 4 * i_dst_stride );
  1339. LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
  1340. p_src += ( 4 * i_src_stride );
  1341. u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
  1342. u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
  1343. u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
  1344. u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
  1345. SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
  1346. p_dst += ( 4 * i_dst_stride );
  1347. }
  1348. }
  1349. else if( 0 == i_height % 8 )
  1350. {
  1351. for( i_cnt = i_height >> 3; i_cnt--; )
  1352. {
  1353. LD_UB8( p_src, i_src_stride,
  1354. src0, src1, src2, src3, src4, src5, src6, src7 );
  1355. p_src += ( 8 * i_src_stride );
  1356. u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
  1357. u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
  1358. u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
  1359. u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
  1360. u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
  1361. u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
  1362. u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
  1363. u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
  1364. SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
  1365. p_dst += ( 4 * i_dst_stride );
  1366. SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
  1367. p_dst += ( 4 * i_dst_stride );
  1368. }
  1369. }
  1370. else if( 0 == i_height % 4 )
  1371. {
  1372. for( i_cnt = ( i_height / 4 ); i_cnt--; )
  1373. {
  1374. LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
  1375. p_src += ( 4 * i_src_stride );
  1376. u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
  1377. u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
  1378. u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
  1379. u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
  1380. SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
  1381. p_dst += ( 4 * i_dst_stride );
  1382. }
  1383. }
  1384. else if( 0 == i_height % 2 )
  1385. {
  1386. for( i_cnt = ( i_height / 2 ); i_cnt--; )
  1387. {
  1388. LD_UB2( p_src, i_src_stride, src0, src1 );
  1389. p_src += ( 2 * i_src_stride );
  1390. u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
  1391. u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
  1392. SD( u_out0, p_dst );
  1393. p_dst += i_dst_stride;
  1394. SD( u_out1, p_dst );
  1395. p_dst += i_dst_stride;
  1396. }
  1397. }
  1398. }
  1399. static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
  1400. uint8_t *p_dst, int32_t i_dst_stride,
  1401. int32_t i_height, int32_t i_width )
  1402. {
  1403. int32_t i_cnt, i_loop_cnt;
  1404. uint8_t *p_src_tmp, *p_dst_tmp;
  1405. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1406. for( i_cnt = ( i_width >> 4 ); i_cnt--; )
  1407. {
  1408. p_src_tmp = p_src;
  1409. p_dst_tmp = p_dst;
  1410. for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; )
  1411. {
  1412. LD_UB8( p_src_tmp, i_src_stride,
  1413. src0, src1, src2, src3, src4, src5, src6, src7 );
  1414. p_src_tmp += ( 8 * i_src_stride );
  1415. ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
  1416. p_dst_tmp, i_dst_stride );
  1417. p_dst_tmp += ( 8 * i_dst_stride );
  1418. }
  1419. p_src += 16;
  1420. p_dst += 16;
  1421. }
  1422. }
  1423. static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
  1424. uint8_t *p_dst, int32_t i_dst_stride,
  1425. int32_t i_height )
  1426. {
  1427. int32_t i_cnt;
  1428. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1429. if( 0 == i_height % 12 )
  1430. {
  1431. for( i_cnt = ( i_height / 12 ); i_cnt--; )
  1432. {
  1433. LD_UB8( p_src, i_src_stride,
  1434. src0, src1, src2, src3, src4, src5, src6, src7 );
  1435. p_src += ( 8 * i_src_stride );
  1436. ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
  1437. p_dst, i_dst_stride );
  1438. p_dst += ( 8 * i_dst_stride );
  1439. LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
  1440. p_src += ( 4 * i_src_stride );
  1441. ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
  1442. p_dst += ( 4 * i_dst_stride );
  1443. }
  1444. }
  1445. else if( 0 == i_height % 8 )
  1446. {
  1447. copy_16multx8mult_msa( p_src, i_src_stride,
  1448. p_dst, i_dst_stride, i_height, 16 );
  1449. }
  1450. else if( 0 == i_height % 4 )
  1451. {
  1452. for( i_cnt = ( i_height >> 2 ); i_cnt--; )
  1453. {
  1454. LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
  1455. p_src += ( 4 * i_src_stride );
  1456. ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
  1457. p_dst += ( 4 * i_dst_stride );
  1458. }
  1459. }
  1460. }
  1461. static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
  1462. uint8_t *p_src2, int32_t i_src2_stride,
  1463. uint8_t *p_dst, int32_t i_dst_stride,
  1464. int32_t i_height )
  1465. {
  1466. int32_t i_cnt;
  1467. uint32_t u_out0, u_out1;
  1468. v16u8 src0, src1, src2, src3;
  1469. v16u8 dst0, dst1;
  1470. for( i_cnt = ( i_height / 2 ); i_cnt--; )
  1471. {
  1472. LD_UB2( p_src1, i_src1_stride, src0, src1 );
  1473. p_src1 += ( 2 * i_src1_stride );
  1474. LD_UB2( p_src2, i_src2_stride, src2, src3 );
  1475. p_src2 += ( 2 * i_src2_stride );
  1476. AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 );
  1477. u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 );
  1478. u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 );
  1479. SW( u_out0, p_dst );
  1480. p_dst += i_dst_stride;
  1481. SW( u_out1, p_dst );
  1482. p_dst += i_dst_stride;
  1483. }
  1484. }
  1485. static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
  1486. uint8_t *p_src2, int32_t i_src2_stride,
  1487. uint8_t *p_dst, int32_t i_dst_stride,
  1488. int32_t i_height )
  1489. {
  1490. int32_t i_cnt;
  1491. uint64_t u_out0, u_out1, u_out2, u_out3;
  1492. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1493. v16u8 dst0, dst1, dst2, dst3;
  1494. for( i_cnt = ( i_height / 4 ); i_cnt--; )
  1495. {
  1496. LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 );
  1497. p_src1 += ( 4 * i_src1_stride );
  1498. LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 );
  1499. p_src2 += ( 4 * i_src2_stride );
  1500. AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7,
  1501. dst0, dst1, dst2, dst3 );
  1502. u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 );
  1503. u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 );
  1504. u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 );
  1505. u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 );
  1506. SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
  1507. p_dst += ( 4 * i_dst_stride );
  1508. }
  1509. }
  1510. static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
  1511. uint8_t *p_src2, int32_t i_src2_stride,
  1512. uint8_t *p_dst, int32_t i_dst_stride,
  1513. int32_t i_height )
  1514. {
  1515. int32_t i_cnt;
  1516. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1517. v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  1518. for( i_cnt = ( i_height / 8 ); i_cnt--; )
  1519. {
  1520. LD_UB8( p_src1, i_src1_stride,
  1521. src0, src1, src2, src3, src4, src5, src6, src7 );
  1522. p_src1 += ( 8 * i_src1_stride );
  1523. LD_UB8( p_src2, i_src2_stride,
  1524. dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
  1525. p_src2 += ( 8 * i_src2_stride );
  1526. AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3,
  1527. dst0, dst1, dst2, dst3 );
  1528. AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7,
  1529. dst4, dst5, dst6, dst7 );
  1530. ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
  1531. p_dst, i_dst_stride );
  1532. p_dst += ( 8 * i_dst_stride );
  1533. }
  1534. }
  1535. static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride,
  1536. int32_t i_height )
  1537. {
  1538. int8_t i_cnt;
  1539. v16u8 zero = { 0 };
  1540. for( i_cnt = ( i_height / 2 ); i_cnt--; )
  1541. {
  1542. ST_UB( zero, p_src );
  1543. p_src += i_stride;
  1544. ST_UB( zero, p_src );
  1545. p_src += i_stride;
  1546. }
  1547. }
  1548. static void core_plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride,
  1549. uint8_t *p_src1, int32_t i_src1_stride,
  1550. uint8_t *p_dst, int32_t i_dst_stride,
  1551. int32_t i_width, int32_t i_height )
  1552. {
  1553. int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w;
  1554. v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  1555. v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3;
  1556. v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3;
  1557. i_w_mul8 = i_width - i_width % 8;
  1558. i_h4w = i_height - i_height % 4;
  1559. for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; )
  1560. {
  1561. for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
  1562. {
  1563. LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
  1564. LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
  1565. ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1566. vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
  1567. ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1568. vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 );
  1569. ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
  1570. p_dst, i_dst_stride );
  1571. ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3,
  1572. ( p_dst + 16 ), i_dst_stride );
  1573. p_src0 += 16;
  1574. p_src1 += 16;
  1575. p_dst += 32;
  1576. }
  1577. for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
  1578. {
  1579. LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
  1580. LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
  1581. ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
  1582. vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
  1583. ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
  1584. p_dst, i_dst_stride );
  1585. p_src0 += 8;
  1586. p_src1 += 8;
  1587. p_dst += 16;
  1588. }
  1589. for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
  1590. {
  1591. p_dst[0] = p_src0[0];
  1592. p_dst[1] = p_src1[0];
  1593. p_dst[i_dst_stride] = p_src0[i_src0_stride];
  1594. p_dst[i_dst_stride + 1] = p_src1[i_src1_stride];
  1595. p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride];
  1596. p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride];
  1597. p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride];
  1598. p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride];
  1599. p_src0 += 1;
  1600. p_src1 += 1;
  1601. p_dst += 2;
  1602. }
  1603. p_src0 += ( ( 4 * i_src0_stride ) - i_width );
  1604. p_src1 += ( ( 4 * i_src1_stride ) - i_width );
  1605. p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) );
  1606. }
  1607. for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
  1608. {
  1609. for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
  1610. {
  1611. src0 = LD_UB( p_src0 );
  1612. src4 = LD_UB( p_src1 );
  1613. ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 );
  1614. ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 );
  1615. p_src0 += 16;
  1616. p_src1 += 16;
  1617. p_dst += 32;
  1618. }
  1619. for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
  1620. {
  1621. src0 = LD_UB( p_src0 );
  1622. src4 = LD_UB( p_src1 );
  1623. vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4,
  1624. ( v16i8 ) src0 );
  1625. ST_UB( vec_ilv_r0, p_dst );
  1626. p_src0 += 8;
  1627. p_src1 += 8;
  1628. p_dst += 16;
  1629. }
  1630. for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
  1631. {
  1632. p_dst[0] = p_src0[0];
  1633. p_dst[1] = p_src1[0];
  1634. p_src0 += 1;
  1635. p_src1 += 1;
  1636. p_dst += 2;
  1637. }
  1638. p_src0 += ( i_src0_stride - i_width );
  1639. p_src1 += ( i_src1_stride - i_width );
  1640. p_dst += ( i_dst_stride - ( i_width * 2 ) );
  1641. }
  1642. }
  1643. static void core_plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride,
  1644. uint8_t *p_dst0, int32_t dst0_stride,
  1645. uint8_t *p_dst1, int32_t dst1_stride,
  1646. int32_t i_width, int32_t i_height )
  1647. {
  1648. int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w;
  1649. uint32_t u_res_w0, u_res_w1;
  1650. v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
  1651. v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3;
  1652. v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3;
  1653. uint8_t *p_dst;
  1654. i_w_mul8 = i_width - i_width % 8;
  1655. i_w_mul4 = i_width - i_width % 4;
  1656. i_h4w = i_height - i_height % 8;
  1657. for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; )
  1658. {
  1659. for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
  1660. {
  1661. LD_UB8( p_src, i_src_stride,
  1662. in0, in1, in2, in3, in4, in5, in6, in7 );
  1663. p_src += 16;
  1664. PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
  1665. vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
  1666. PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
  1667. vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
  1668. ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride );
  1669. p_dst = p_dst0 + 4 * dst0_stride;
  1670. ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride );
  1671. ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride );
  1672. p_dst = p_dst1 + 4 * dst1_stride;
  1673. ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride );
  1674. p_dst0 += 8;
  1675. p_dst1 += 8;
  1676. }
  1677. for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
  1678. {
  1679. LD_UB8( p_src, i_src_stride,
  1680. in0, in1, in2, in3, in4, in5, in6, in7 );
  1681. p_src += 8;
  1682. PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
  1683. vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
  1684. PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
  1685. vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
  1686. ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride );
  1687. p_dst = p_dst0 + 4 * dst0_stride;
  1688. ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride );
  1689. ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride );
  1690. p_dst = p_dst1 + 4 * dst1_stride;
  1691. ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride );
  1692. p_dst0 += 4;
  1693. p_dst1 += 4;
  1694. }
  1695. for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
  1696. {
  1697. p_dst0[0] = p_src[0];
  1698. p_dst1[0] = p_src[1];
  1699. p_dst0[dst0_stride] = p_src[i_src_stride];
  1700. p_dst1[dst1_stride] = p_src[i_src_stride + 1];
  1701. p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride];
  1702. p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1];
  1703. p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride];
  1704. p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1];
  1705. p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride];
  1706. p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1];
  1707. p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride];
  1708. p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1];
  1709. p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride];
  1710. p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1];
  1711. p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride];
  1712. p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1];
  1713. p_dst0 += 1;
  1714. p_dst1 += 1;
  1715. p_src += 2;
  1716. }
  1717. p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) );
  1718. p_dst0 += ( ( 8 * dst0_stride ) - i_width );
  1719. p_dst1 += ( ( 8 * dst1_stride ) - i_width );
  1720. }
  1721. for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
  1722. {
  1723. for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
  1724. {
  1725. in0 = LD_UB( p_src );
  1726. p_src += 16;
  1727. vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
  1728. ( v16i8 ) in0 );
  1729. vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
  1730. ( v16i8 ) in0 );
  1731. ST8x1_UB( vec_pckev0, p_dst0 );
  1732. ST8x1_UB( vec_pckod0, p_dst1 );
  1733. p_dst0 += 8;
  1734. p_dst1 += 8;
  1735. }
  1736. for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
  1737. {
  1738. in0 = LD_UB( p_src );
  1739. p_src += 8;
  1740. vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
  1741. ( v16i8 ) in0 );
  1742. vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
  1743. ( v16i8 ) in0 );
  1744. u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 );
  1745. SW( u_res_w0, p_dst0 );
  1746. u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 );
  1747. SW( u_res_w1, p_dst1 );
  1748. p_dst0 += 4;
  1749. p_dst1 += 4;
  1750. }
  1751. for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
  1752. {
  1753. p_dst0[0] = p_src[0];
  1754. p_dst1[0] = p_src[1];
  1755. p_dst0 += 1;
  1756. p_dst1 += 1;
  1757. p_src += 2;
  1758. }
  1759. p_src += ( ( i_src_stride ) - ( i_width << 1 ) );
  1760. p_dst0 += ( ( dst0_stride ) - i_width );
  1761. p_dst1 += ( ( dst1_stride ) - i_width );
  1762. }
  1763. }
  1764. static void core_plane_copy_deinterleave_rgb_msa( uint8_t *p_src,
  1765. int32_t i_src_stride,
  1766. uint8_t *p_dst0,
  1767. int32_t i_dst0_stride,
  1768. uint8_t *p_dst1,
  1769. int32_t i_dst1_stride,
  1770. uint8_t *p_dst2,
  1771. int32_t i_dst2_stride,
  1772. int32_t i_width,
  1773. int32_t i_height )
  1774. {
  1775. uint8_t *p_src_orig = p_src;
  1776. uint8_t *p_dst0_orig = p_dst0;
  1777. uint8_t *p_dst1_orig = p_dst1;
  1778. uint8_t *p_dst2_orig = p_dst2;
  1779. int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
  1780. v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
  1781. v16i8 temp0, temp1, temp2, temp3;
  1782. v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 };
  1783. v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
  1784. v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 };
  1785. i_w_mul8 = i_width - i_width % 8;
  1786. i_h_mul4 = i_height - i_height % 4;
  1787. for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
  1788. {
  1789. p_src = p_src_orig;
  1790. p_dst0 = p_dst0_orig;
  1791. p_dst1 = p_dst1_orig;
  1792. p_dst2 = p_dst2_orig;
  1793. for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
  1794. {
  1795. LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
  1796. LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
  1797. VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 );
  1798. VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 );
  1799. ST8x1_UB( temp0, p_dst0 );
  1800. ST8x1_UB( temp1, p_dst0 + i_dst0_stride );
  1801. ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride );
  1802. ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride );
  1803. VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 );
  1804. VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 );
  1805. ST8x1_UB( temp0, p_dst1 );
  1806. ST8x1_UB( temp1, p_dst1 + i_dst1_stride );
  1807. ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride );
  1808. ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride );
  1809. VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 );
  1810. VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 );
  1811. ST8x1_UB( temp0, p_dst2 );
  1812. ST8x1_UB( temp1, p_dst2 + i_dst2_stride );
  1813. ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride );
  1814. ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride );
  1815. p_src += 8 * 3;
  1816. p_dst0 += 8;
  1817. p_dst1 += 8;
  1818. p_dst2 += 8;
  1819. }
  1820. for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
  1821. {
  1822. p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width];
  1823. p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width];
  1824. p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width];
  1825. p_dst0_orig[i_loop_width + i_dst0_stride] =
  1826. p_src_orig[0 + i_src_stride + 3 * i_loop_width];
  1827. p_dst1_orig[i_loop_width + i_dst1_stride] =
  1828. p_src_orig[1 + i_src_stride + 3 * i_loop_width];
  1829. p_dst2_orig[i_loop_width + i_dst2_stride] =
  1830. p_src_orig[2 + i_src_stride + 3 * i_loop_width];
  1831. p_dst0_orig[i_loop_width + 2 * i_dst0_stride] =
  1832. p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width];
  1833. p_dst1_orig[i_loop_width + 2 * i_dst1_stride] =
  1834. p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width];
  1835. p_dst2_orig[i_loop_width + 2 * i_dst2_stride] =
  1836. p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width];
  1837. p_dst0_orig[i_loop_width + 3 * i_dst0_stride] =
  1838. p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width];
  1839. p_dst1_orig[i_loop_width + 3 * i_dst1_stride] =
  1840. p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width];
  1841. p_dst2_orig[i_loop_width + 3 * i_dst2_stride] =
  1842. p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width];
  1843. }
  1844. p_src_orig += ( 4 * i_src_stride );
  1845. p_dst0_orig += ( 4 * i_dst0_stride );
  1846. p_dst1_orig += ( 4 * i_dst1_stride );
  1847. p_dst2_orig += ( 4 * i_dst2_stride );
  1848. }
  1849. for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
  1850. {
  1851. p_src = p_src_orig;
  1852. p_dst0 = p_dst0_orig;
  1853. p_dst1 = p_dst1_orig;
  1854. p_dst2 = p_dst2_orig;
  1855. for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
  1856. {
  1857. in0 = LD_SB( p_src );
  1858. in4 = LD_SB( p_src + 16 );
  1859. temp0 = __msa_vshf_b( mask0, in4, in0 );
  1860. ST8x1_UB( temp0, p_dst0 );
  1861. temp0 = __msa_vshf_b( mask1, in4, in0 );
  1862. ST8x1_UB( temp0, p_dst1 );
  1863. temp0 = __msa_vshf_b( mask2, in4, in0 );
  1864. ST8x1_UB( temp0, p_dst2 );
  1865. p_src += 8 * 3;
  1866. p_dst0 += 8;
  1867. p_dst1 += 8;
  1868. p_dst2 += 8;
  1869. }
  1870. for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
  1871. {
  1872. p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width];
  1873. p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1];
  1874. p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2];
  1875. }
  1876. p_src_orig += ( i_src_stride );
  1877. p_dst0_orig += ( i_dst0_stride );
  1878. p_dst1_orig += ( i_dst1_stride );
  1879. p_dst2_orig += ( i_dst2_stride );
  1880. }
  1881. }
  1882. static void core_plane_copy_deinterleave_rgba_msa( uint8_t *p_src,
  1883. int32_t i_src_stride,
  1884. uint8_t *p_dst0,
  1885. int32_t i_dst0_stride,
  1886. uint8_t *p_dst1,
  1887. int32_t i_dst1_stride,
  1888. uint8_t *p_dst2,
  1889. int32_t i_dst2_stride,
  1890. int32_t i_width,
  1891. int32_t i_height )
  1892. {
  1893. uint8_t *p_src_orig = p_src;
  1894. uint8_t *p_dst0_orig = p_dst0;
  1895. uint8_t *p_dst1_orig = p_dst1;
  1896. uint8_t *p_dst2_orig = p_dst2;
  1897. int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
  1898. v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
  1899. v16i8 in8, in9, in10, in11, in12, in13, in14, in15;
  1900. v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  1901. v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
  1902. i_w_mul8 = i_width - i_width % 8;
  1903. i_h_mul4 = i_height - i_height % 4;
  1904. for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
  1905. {
  1906. p_src = p_src_orig;
  1907. p_dst0 = p_dst0_orig;
  1908. p_dst1 = p_dst1_orig;
  1909. p_dst2 = p_dst2_orig;
  1910. for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
  1911. {
  1912. LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
  1913. LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
  1914. LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 );
  1915. LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 );
  1916. PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
  1917. temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
  1918. temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
  1919. PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 );
  1920. temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
  1921. temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 );
  1922. PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 );
  1923. temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
  1924. temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 );
  1925. PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 );
  1926. temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
  1927. temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 );
  1928. PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
  1929. in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
  1930. PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 );
  1931. in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 );
  1932. PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 );
  1933. in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 );
  1934. PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 );
  1935. in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 );
  1936. ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride );
  1937. ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride );
  1938. ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride );
  1939. p_src += 16 * 4;
  1940. p_dst0 += 16;
  1941. p_dst1 += 16;
  1942. p_dst2 += 16;
  1943. }
  1944. for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
  1945. {
  1946. LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
  1947. LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 );
  1948. PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 );
  1949. temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
  1950. temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
  1951. PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 );
  1952. temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
  1953. temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
  1954. PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
  1955. in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
  1956. PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 );
  1957. in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 );
  1958. PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 );
  1959. in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 );
  1960. PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 );
  1961. in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 );
  1962. ST8x1_UB( in0, p_dst0 );
  1963. ST8x1_UB( in4, p_dst0 + i_dst0_stride );
  1964. ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride );
  1965. ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride );
  1966. ST8x1_UB( in1, p_dst2 );
  1967. ST8x1_UB( in5, p_dst2 + i_dst2_stride );
  1968. ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride );
  1969. ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride );
  1970. ST8x1_UB( in2, p_dst1 );
  1971. ST8x1_UB( in6, p_dst1 + i_dst1_stride );
  1972. ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride );
  1973. ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride );
  1974. p_src += 8 * 4;
  1975. p_dst0 += 8;
  1976. p_dst1 += 8;
  1977. p_dst2 += 8;
  1978. }
  1979. for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
  1980. {
  1981. p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
  1982. p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
  1983. p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
  1984. p_dst0_orig[i_dst0_stride + i_loop_width] =
  1985. p_src_orig[i_src_stride + 4 * i_loop_width];
  1986. p_dst1_orig[i_dst1_stride + i_loop_width] =
  1987. p_src_orig[i_src_stride + 4 * i_loop_width + 1];
  1988. p_dst2_orig[i_dst2_stride + i_loop_width] =
  1989. p_src_orig[i_src_stride + 4 * i_loop_width + 2];
  1990. p_dst0_orig[2 * i_dst0_stride + i_loop_width] =
  1991. p_src_orig[2 * i_src_stride + 4 * i_loop_width];
  1992. p_dst1_orig[2 * i_dst1_stride + i_loop_width] =
  1993. p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1];
  1994. p_dst2_orig[2 * i_dst2_stride + i_loop_width] =
  1995. p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2];
  1996. p_dst0_orig[3 * i_dst0_stride + i_loop_width] =
  1997. p_src_orig[3 * i_src_stride + 4 * i_loop_width];
  1998. p_dst1_orig[3 * i_dst1_stride + i_loop_width] =
  1999. p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1];
  2000. p_dst2_orig[3 * i_dst2_stride + i_loop_width] =
  2001. p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2];
  2002. }
  2003. p_src_orig += ( 4 * i_src_stride );
  2004. p_dst0_orig += ( 4 * i_dst0_stride );
  2005. p_dst1_orig += ( 4 * i_dst1_stride );
  2006. p_dst2_orig += ( 4 * i_dst2_stride );
  2007. }
  2008. for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
  2009. {
  2010. p_src = p_src_orig;
  2011. p_dst0 = p_dst0_orig;
  2012. p_dst1 = p_dst1_orig;
  2013. p_dst2 = p_dst2_orig;
  2014. for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
  2015. {
  2016. LD_SB4( p_src, 16, in0, in4, in8, in12 );
  2017. PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
  2018. temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
  2019. temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
  2020. PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
  2021. in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
  2022. ST_SB( in0, p_dst0 );
  2023. ST_SB( in0, p_dst0 );
  2024. ST_SB( in1, p_dst2 );
  2025. ST_SB( in1, p_dst2 );
  2026. ST_SB( in2, p_dst1 );
  2027. ST_SB( in2, p_dst1 );
  2028. p_src += 16 * 4;
  2029. p_dst0 += 16;
  2030. p_dst1 += 16;
  2031. p_dst2 += 16;
  2032. }
  2033. for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
  2034. {
  2035. in0 = LD_SB( p_src );
  2036. in4 = LD_SB( p_src + 16 );
  2037. temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 );
  2038. temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
  2039. PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
  2040. in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
  2041. ST8x1_UB( in0, p_dst0 );
  2042. ST8x1_UB( in1, p_dst2 );
  2043. ST8x1_UB( in2, p_dst1 );
  2044. p_src += 8 * 4;
  2045. p_dst0 += 8;
  2046. p_dst1 += 8;
  2047. p_dst2 += 8;
  2048. }
  2049. for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
  2050. {
  2051. p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
  2052. p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
  2053. p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
  2054. }
  2055. p_src_orig += ( i_src_stride );
  2056. p_dst0_orig += ( i_dst0_stride );
  2057. p_dst1_orig += ( i_dst1_stride );
  2058. p_dst2_orig += ( i_dst2_stride );
  2059. }
  2060. }
  2061. static void core_store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride,
  2062. uint8_t *p_src1, int32_t i_src1_stride,
  2063. uint8_t *p_dst, int32_t i_dst_stride,
  2064. int32_t i_height )
  2065. {
  2066. int32_t i_loop_height, i_h4w;
  2067. v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
  2068. v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3;
  2069. i_h4w = i_height % 4;
  2070. for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
  2071. {
  2072. LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 );
  2073. p_src0 += ( 4 * i_src0_stride );
  2074. LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 );
  2075. p_src1 += ( 4 * i_src1_stride );
  2076. ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3,
  2077. ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 );
  2078. ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3,
  2079. p_dst, i_dst_stride );
  2080. p_dst += ( 4 * i_dst_stride );
  2081. }
  2082. for( i_loop_height = i_h4w; i_loop_height--; )
  2083. {
  2084. in0 = LD_UB( p_src0 );
  2085. p_src0 += ( i_src0_stride );
  2086. in1 = LD_UB( p_src1 );
  2087. p_src1 += ( i_src1_stride );
  2088. ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 );
  2089. ST_UB( ilvr_vec0, p_dst );
  2090. p_dst += ( i_dst_stride );
  2091. }
  2092. }
  2093. static void core_frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride,
  2094. uint8_t *p_dst0, int32_t dst0_stride,
  2095. uint8_t *p_dst1, int32_t dst1_stride,
  2096. uint8_t *p_dst2, int32_t dst2_stride,
  2097. uint8_t *p_dst3, int32_t dst3_stride,
  2098. int32_t i_width, int32_t i_height )
  2099. {
  2100. int32_t i_loop_width, i_loop_height, i_w16_mul;
  2101. v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  2102. v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5;
  2103. v16u8 pckev_vec0, pckev_vec1, pckev_vec2;
  2104. v16u8 pckod_vec0, pckod_vec1, pckod_vec2;
  2105. v16u8 tmp0, tmp1, tmp2, tmp3;
  2106. v16u8 res0, res1;
  2107. i_w16_mul = i_width - i_width % 16;
  2108. for( i_loop_height = i_height; i_loop_height--; )
  2109. {
  2110. LD_UB3( p_src, i_src_stride, src0, src1, src2 );
  2111. p_src += 16;
  2112. for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ )
  2113. {
  2114. LD_UB3( p_src, i_src_stride, src3, src4, src5 );
  2115. p_src += 16;
  2116. LD_UB3( p_src, i_src_stride, src6, src7, src8 );
  2117. p_src += 16;
  2118. PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
  2119. PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
  2120. pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
  2121. ( v16i8 ) src2 );
  2122. pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
  2123. ( v16i8 ) src2 );
  2124. AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
  2125. pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
  2126. tmp0, tmp1, tmp2, tmp3 );
  2127. AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
  2128. ST_UB( res0, p_dst0 );
  2129. ST_UB( res1, p_dst2 );
  2130. SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
  2131. SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 );
  2132. SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 );
  2133. PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
  2134. pckev_vec0, pckev_vec1 )
  2135. pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
  2136. ( v16i8 ) sld1_vec2 );
  2137. AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
  2138. pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
  2139. tmp0, tmp1, tmp2, tmp3 );
  2140. AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
  2141. ST_UB( res0, p_dst1 );
  2142. ST_UB( res1, p_dst3 );
  2143. src0 = src6;
  2144. src1 = src7;
  2145. src2 = src8;
  2146. p_dst0 += 16;
  2147. p_dst1 += 16;
  2148. p_dst2 += 16;
  2149. p_dst3 += 16;
  2150. }
  2151. for( i_loop_width = i_w16_mul; i_loop_width < i_width;
  2152. i_loop_width += 8 )
  2153. {
  2154. LD_UB3( p_src, i_src_stride, src3, src4, src5 );
  2155. p_src += 16;
  2156. PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
  2157. PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
  2158. pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
  2159. ( v16i8 ) src2 );
  2160. pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
  2161. ( v16i8 ) src2 );
  2162. AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
  2163. pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
  2164. tmp0, tmp1, tmp2, tmp3 );
  2165. AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
  2166. ST8x1_UB( res0, p_dst0 );
  2167. ST8x1_UB( res1, p_dst2 );
  2168. SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
  2169. SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 );
  2170. SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 );
  2171. PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
  2172. pckev_vec0, pckev_vec1 )
  2173. pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
  2174. ( v16i8 ) sld1_vec2 );
  2175. AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
  2176. pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
  2177. tmp0, tmp1, tmp2, tmp3 );
  2178. AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
  2179. ST8x1_UB( res0, p_dst1 );
  2180. ST8x1_UB( res1, p_dst3 );
  2181. p_dst0 += 8;
  2182. p_dst1 += 8;
  2183. p_dst2 += 8;
  2184. p_dst3 += 8;
  2185. }
  2186. p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) );
  2187. p_dst0 += ( dst0_stride - i_width );
  2188. p_dst1 += ( dst1_stride - i_width );
  2189. p_dst2 += ( dst2_stride - i_width );
  2190. p_dst3 += ( dst3_stride - i_width );
  2191. }
  2192. }
  2193. static void mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
  2194. uint8_t *p_src, intptr_t i_src_stride,
  2195. int32_t i_height )
  2196. {
  2197. copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
  2198. }
  2199. static void mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
  2200. intptr_t i_src_stride, int32_t i_height )
  2201. {
  2202. copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
  2203. }
  2204. static void mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
  2205. intptr_t i_src_stride, int32_t i_height )
  2206. {
  2207. copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
  2208. }
  2209. static void pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2210. uint8_t *p_pix2, intptr_t pix2_stride,
  2211. uint8_t *p_pix3, intptr_t pix3_stride,
  2212. int32_t i_weight )
  2213. {
  2214. if( 32 == i_weight )
  2215. {
  2216. avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2217. p_pix1, pix1_stride, 16 );
  2218. }
  2219. else if( i_weight < 0 || i_weight > 63 )
  2220. {
  2221. avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
  2222. p_pix3, pix3_stride,
  2223. p_pix1, pix1_stride,
  2224. 16, 5, i_weight,
  2225. ( 64 - i_weight ), 0 );
  2226. }
  2227. else
  2228. {
  2229. avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
  2230. p_pix3, pix3_stride,
  2231. p_pix1, pix1_stride,
  2232. 16, 5, i_weight,
  2233. ( 64 - i_weight ), 0 );
  2234. }
  2235. }
  2236. static void pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2237. uint8_t *p_pix2, intptr_t pix2_stride,
  2238. uint8_t *p_pix3, intptr_t pix3_stride,
  2239. int32_t i_weight )
  2240. {
  2241. if( 32 == i_weight )
  2242. {
  2243. avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2244. p_pix1, pix1_stride, 8 );
  2245. }
  2246. else if( i_weight < 0 || i_weight > 63 )
  2247. {
  2248. avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
  2249. p_pix3, pix3_stride,
  2250. p_pix1, pix1_stride,
  2251. 8, 5, i_weight,
  2252. ( 64 - i_weight ), 0 );
  2253. }
  2254. else
  2255. {
  2256. avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
  2257. p_pix3, pix3_stride,
  2258. p_pix1, pix1_stride,
  2259. 8, 5, i_weight,
  2260. ( 64 - i_weight ), 0 );
  2261. }
  2262. }
  2263. static void pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2264. uint8_t *p_pix2, intptr_t pix2_stride,
  2265. uint8_t *p_pix3, intptr_t pix3_stride,
  2266. int32_t i_weight )
  2267. {
  2268. if( 32 == i_weight )
  2269. {
  2270. avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2271. p_pix1, pix1_stride, 16 );
  2272. }
  2273. else if( i_weight < 0 || i_weight > 63 )
  2274. {
  2275. avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
  2276. p_pix3, pix3_stride,
  2277. p_pix1, pix1_stride, 16, 5, i_weight,
  2278. ( 64 - i_weight ), 0 );
  2279. }
  2280. else
  2281. {
  2282. avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
  2283. p_pix3, pix3_stride,
  2284. p_pix1, pix1_stride, 16, 5, i_weight,
  2285. ( 64 - i_weight ), 0 );
  2286. }
  2287. }
  2288. static void pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2289. uint8_t *p_pix2, intptr_t pix2_stride,
  2290. uint8_t *p_pix3, intptr_t pix3_stride,
  2291. int32_t i_weight )
  2292. {
  2293. if( 32 == i_weight )
  2294. {
  2295. avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2296. p_pix1, pix1_stride, 8 );
  2297. }
  2298. else if( i_weight < 0 || i_weight > 63 )
  2299. {
  2300. avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
  2301. p_pix3, pix3_stride,
  2302. p_pix1, pix1_stride, 8, 5, i_weight,
  2303. ( 64 - i_weight ), 0 );
  2304. }
  2305. else
  2306. {
  2307. avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
  2308. p_pix3, pix3_stride,
  2309. p_pix1, pix1_stride, 8, 5, i_weight,
  2310. ( 64 - i_weight ), 0 );
  2311. }
  2312. }
  2313. static void pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2314. uint8_t *p_pix2, intptr_t pix2_stride,
  2315. uint8_t *p_pix3, intptr_t pix3_stride,
  2316. int32_t i_weight )
  2317. {
  2318. if( 32 == i_weight )
  2319. {
  2320. avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2321. p_pix1, pix1_stride, 4 );
  2322. }
  2323. else if( i_weight < 0 || i_weight > 63 )
  2324. {
  2325. avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
  2326. p_pix3, pix3_stride,
  2327. p_pix1, pix1_stride, 4, 5, i_weight,
  2328. ( 64 - i_weight ), 0 );
  2329. }
  2330. else
  2331. {
  2332. avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
  2333. p_pix3, pix3_stride,
  2334. p_pix1, pix1_stride, 4, 5, i_weight,
  2335. ( 64 - i_weight ), 0 );
  2336. }
  2337. }
  2338. static void pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2339. uint8_t *p_pix2, intptr_t pix2_stride,
  2340. uint8_t *p_pix3, intptr_t pix3_stride,
  2341. int32_t i_weight )
  2342. {
  2343. if( 32 == i_weight )
  2344. {
  2345. avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2346. p_pix1, pix1_stride, 16 );
  2347. }
  2348. else if( i_weight < 0 || i_weight > 63 )
  2349. {
  2350. avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
  2351. p_pix3, pix3_stride,
  2352. p_pix1, pix1_stride, 16, 5, i_weight,
  2353. ( 64 - i_weight ), 0 );
  2354. }
  2355. else
  2356. {
  2357. avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
  2358. p_pix3, pix3_stride,
  2359. p_pix1, pix1_stride, 16, 5, i_weight,
  2360. ( 64 - i_weight ), 0 );
  2361. }
  2362. }
  2363. static void pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2364. uint8_t *p_pix2, intptr_t pix2_stride,
  2365. uint8_t *p_pix3, intptr_t pix3_stride,
  2366. int32_t i_weight )
  2367. {
  2368. if( 32 == i_weight )
  2369. {
  2370. avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2371. p_pix1, pix1_stride, 8 );
  2372. }
  2373. else if( i_weight < 0 || i_weight > 63 )
  2374. {
  2375. avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
  2376. p_pix3, pix3_stride,
  2377. p_pix1, pix1_stride, 8, 5, i_weight,
  2378. ( 64 - i_weight ), 0 );
  2379. }
  2380. else
  2381. {
  2382. avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
  2383. p_pix3, pix3_stride,
  2384. p_pix1, pix1_stride, 8, 5, i_weight,
  2385. ( 64 - i_weight ), 0 );
  2386. }
  2387. }
  2388. static void pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
  2389. uint8_t *p_pix2, intptr_t pix2_stride,
  2390. uint8_t *p_pix3, intptr_t pix3_stride,
  2391. int32_t i_weight )
  2392. {
  2393. if( 32 == i_weight )
  2394. {
  2395. avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
  2396. p_pix1, pix1_stride, 4 );
  2397. }
  2398. else if( i_weight < 0 || i_weight > 63 )
  2399. {
  2400. avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
  2401. p_pix3, pix3_stride,
  2402. p_pix1, pix1_stride, 4, 5, i_weight,
  2403. ( 64 - i_weight ), 0 );
  2404. }
  2405. else
  2406. {
  2407. avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
  2408. p_pix3, pix3_stride,
  2409. p_pix1, pix1_stride, 4, 5, i_weight,
  2410. ( 64 - i_weight ), 0 );
  2411. }
  2412. }
static void pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                               uint8_t *p_pix2, intptr_t pix2_stride,
                               uint8_t *p_pix3, intptr_t pix3_stride,
                               int32_t i_weight )
{
    if( 32 == i_weight )
    {
        avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
                            p_pix1, pix1_stride, 2 );
    }
    else if( i_weight < 0 || i_weight > 63 )
    {
        avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
                                      p_pix3, pix3_stride,
                                      p_pix1, pix1_stride, 5, i_weight,
                                      ( 64 - i_weight ), 0 );
    }
    else
    {
        avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
                                   p_pix3, pix3_stride,
                                   p_pix1, pix1_stride, 5, i_weight,
                                   ( 64 - i_weight ), 0 );
    }
}
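/* Zero an aligned buffer: the MSA kernel clears it in 16-byte rows, and any
 * tail left over from a size that is not a multiple of 32 is finished with a
 * plain memset(). */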
static void memzero_aligned_msa( void *p_dst, size_t n )
{
    uint32_t u_tot32_mul_lines = n >> 5;
    uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 );
    memset_zero_16width_msa( p_dst, 16, ( n / 16 ) );
    if( u_remaining )
    {
        memset( p_dst + ( u_tot32_mul_lines << 5 ), 0, u_remaining );
    }
}
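/* Explicit weighted-prediction wrappers: apply the scale, offset and denom
 * from x264_weight_t to blocks 4, 8, 16 and 20 pixels wide (w20 = w16 + w4). */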
static void mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                              uint8_t *p_src, intptr_t i_src_stride,
                              const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;
    avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                i_height, i_log2_denom, i_weight, i_offset );
}
static void mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                              uint8_t *p_src, intptr_t i_src_stride,
                              const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;
    avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                i_height, i_log2_denom, i_weight, i_offset );
}
static void mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                               uint8_t *p_src, intptr_t i_src_stride,
                               const x264_weight_t *pWeight, int32_t i_height )
{
    int32_t i_log2_denom = pWeight->i_denom;
    int32_t i_offset = pWeight->i_offset;
    int32_t i_weight = pWeight->i_scale;
    avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
                                 i_height, i_log2_denom, i_weight, i_offset );
}
static void mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                               uint8_t *p_src, intptr_t i_src_stride,
                               const x264_weight_t *pWeight, int32_t i_height )
{
    mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
                       pWeight, i_height );
    mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
                      pWeight, i_height );
}
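/* Luma MC: select the half-pel plane(s) for the quarter-pel MV, average two
 * planes when the MV falls between them, then apply explicit weighting if
 * enabled; otherwise just copy. */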
static void mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                         uint8_t *p_src[4], intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height,
                         const x264_weight_t *pWeight )
{
    int32_t i_qpel_idx;
    int32_t i_offset;
    uint8_t *p_src1;
    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
             ( 3 == ( m_vy & 3 ) ) * i_src_stride;
    if( i_qpel_idx & 5 )
    {
        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
                          i_offset + ( 3 == ( m_vx & 3 ) );
        if( 16 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                 p_dst, i_dst_stride, i_height );
        }
        else if( 8 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                p_dst, i_dst_stride, i_height );
        }
        else if( 4 == i_width )
        {
            avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                p_dst, i_dst_stride, i_height );
        }
        if( pWeight->weightfn )
        {
            if( 16 == i_width )
            {
                mc_weight_w16_msa( p_dst, i_dst_stride,
                                   p_dst, i_dst_stride,
                                   pWeight, i_height );
            }
            else if( 8 == i_width )
            {
                mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                  pWeight, i_height );
            }
            else if( 4 == i_width )
            {
                mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
                                  pWeight, i_height );
            }
        }
    }
    else if( pWeight->weightfn )
    {
        if( 16 == i_width )
        {
            mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                               pWeight, i_height );
        }
        else if( 8 == i_width )
        {
            mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                              pWeight, i_height );
        }
        else if( 4 == i_width )
        {
            mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
                              pWeight, i_height );
        }
    }
    else
    {
        if( 16 == i_width )
        {
            copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                              i_height );
        }
        else if( 8 == i_width )
        {
            copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                             i_height );
        }
        else if( 4 == i_width )
        {
            copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
                             i_height );
        }
    }
}
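/* Chroma MC on the interleaved U/V plane: 1/8-pel bilinear interpolation
 * for block widths 2, 4 and 8. */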
static void mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                           intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height )
{
    int32_t i_d8x = m_vx & 0x07;
    int32_t i_d8y = m_vy & 0x07;
    int32_t i_coeff_horiz1 = ( 8 - i_d8x );
    int32_t i_coeff_vert1 = ( 8 - i_d8y );
    int32_t i_coeff_horiz0 = i_d8x;
    int32_t i_coeff_vert0 = i_d8y;
    p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;
    if( 2 == i_width )
    {
        avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
    else if( 4 == i_width )
    {
        avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
    else if( 8 == i_width )
    {
        avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
                                          p_dst_u, p_dst_v, i_dst_stride,
                                          i_coeff_horiz0, i_coeff_horiz1,
                                          i_coeff_vert0, i_coeff_vert1,
                                          i_height );
    }
}
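/* Generate the horizontal, vertical and centre half-pel planes, processing
 * the frame in 16-pixel-wide columns. */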
static void hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                             uint8_t *p_dstc, uint8_t *p_src,
                             intptr_t i_stride, int32_t i_width,
                             int32_t i_height, int16_t *p_buf )
{
    for( int32_t i = 0; i < ( i_width / 16 ); i++ )
    {
        avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
                             p_dst_v - 2, i_stride, i_height );
        avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
                              p_dstc, i_stride, i_height );
        avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height );
        p_src += 16;
        p_dst_v += 16;
        p_dsth += 16;
        p_dstc += 16;
    }
}
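/* Thin wrappers mapping x264's plane copy prototypes onto the MSA
 * interleave/deinterleave kernels. */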
static void plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, intptr_t i_src_stride0,
                                       uint8_t *p_src1, intptr_t i_src_stride1,
                                       int32_t i_width, int32_t i_height )
{
    core_plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1,
                                    p_dst, i_dst_stride, i_width, i_height );
}
static void plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                         uint8_t *p_dst1, intptr_t i_dst_stride1,
                                         uint8_t *p_src, intptr_t i_src_stride,
                                         int32_t i_width, int32_t i_height )
{
    core_plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0,
                                      p_dst1, i_dst_stride1, i_width, i_height );
}
static void plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                             intptr_t i_dst_stride0,
                                             uint8_t *p_dst1,
                                             intptr_t i_dst_stride1,
                                             uint8_t *p_dst2,
                                             intptr_t i_dst_stride2,
                                             uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_src_width,
                                             int32_t i_width,
                                             int32_t i_height )
{
    if( 3 == i_src_width )
    {
        core_plane_copy_deinterleave_rgb_msa( p_src, i_src_stride,
                                              p_dst0, i_dst_stride0,
                                              p_dst1, i_dst_stride1,
                                              p_dst2, i_dst_stride2,
                                              i_width, i_height );
    }
    else if( 4 == i_src_width )
    {
        core_plane_copy_deinterleave_rgba_msa( p_src, i_src_stride,
                                               p_dst0, i_dst_stride0,
                                               p_dst1, i_dst_stride1,
                                               p_dst2, i_dst_stride2,
                                               i_width, i_height );
    }
}
static void store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                         uint8_t *p_src0, uint8_t *p_src1,
                                         int32_t i_height )
{
    core_store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE,
                                      p_dst, i_dst_stride, i_height );
}
static void load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                               intptr_t i_src_stride,
                                               int32_t i_height )
{
    core_plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE,
                                      ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE,
                                      8, i_height );
}
static void load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                               intptr_t i_src_stride,
                                               int32_t i_height )
{
    core_plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE,
                                      ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE,
                                      8, i_height );
}
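/* Build the four half-resolution planes used by the lookahead from one
 * full-resolution plane. */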
static void frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                        uint8_t *p_dst1, uint8_t *p_dst2,
                                        uint8_t *p_dst3, intptr_t i_src_stride,
                                        intptr_t i_dst_stride, int32_t i_width,
                                        int32_t i_height )
{
    core_frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride,
                                     p_dst1, i_dst_stride, p_dst2, i_dst_stride,
                                     p_dst3, i_dst_stride, i_width, i_height );
}
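/* get_ref: like mc_luma, but when neither averaging nor weighting is needed
 * it returns a pointer into the reference plane instead of copying. Rows
 * beyond the last multiple of four (i_h4w) are handled one line at a time. */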
static uint8_t *get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                             uint8_t *p_src[4], intptr_t i_src_stride,
                             int32_t m_vx, int32_t m_vy,
                             int32_t i_width, int32_t i_height,
                             const x264_weight_t *pWeight )
{
    int32_t i_qpel_idx, i_cnt, i_h4w;
    int32_t i_offset;
    uint8_t *p_src1, *src1_org;
    i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
    i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
    p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
             ( 3 == ( m_vy & 3 ) ) * i_src_stride;
    i_h4w = i_height - i_height % 4;
    if( i_qpel_idx & 5 )
    {
        uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
                          i_offset + ( 3 == ( m_vx & 3 ) );
        if( 16 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride,
                                 p_src2, i_src_stride,
                                 p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;
                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
                                 p_dst, *p_dst_stride, i_h4w );
            avg_src_width4_msa( p_src1 + 16, i_src_stride,
                                p_src2 + 16, i_src_stride,
                                p_dst + 16, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16u8 src_vec1, src_vec2, src_vec3, src_vec4;
                v16u8 dst_vec0, dst_vec1;
                uint32_t temp0;
                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 );
                src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 );
                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 );
                ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 12 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            avg_src_width4_msa( p_src1 + 8, i_src_stride,
                                p_src2 + 8, i_src_stride,
                                p_dst + 8, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                uint64_t dst0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;
                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 );
                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 );
            }
        }
        else if( 8 == i_width )
        {
            avg_src_width8_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t dst0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;
                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
                SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            avg_src_width4_msa( p_src1, i_src_stride,
                                p_src2, i_src_stride,
                                p_dst, *p_dst_stride, i_h4w );
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16u8 src_vec1, src_vec2;
                v16u8 dst_vec0;
                src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
                src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
                dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
                temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        if( pWeight->weightfn )
        {
            int32_t i_log2_denom;
            int32_t i_offset_val;
            int32_t i_weight;
            i_log2_denom = pWeight->i_denom;
            i_offset_val = pWeight->i_offset;
            i_weight = pWeight->i_scale;
            if( 16 == i_width || 12 == i_width )
            {
                mc_weight_w16_msa( p_dst, *p_dst_stride,
                                   p_dst, *p_dst_stride,
                                   pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt, offset_val0;
                    v8i16 denom;
                    i_offset_val <<= ( i_log2_denom );
                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }
                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );
                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;
                    temp_vec0 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                  ( v8i16 ) offset_val0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                  ( v8i16 ) offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 20 == i_width )
            {
                mc_weight_w20_msa( p_dst, *p_dst_stride,
                                   p_dst, *p_dst_stride,
                                   pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    i_offset_val <<= ( i_log2_denom );
                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }
                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );
                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 );
                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;
                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;
                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
                }
            }
            else if( 8 == i_width )
            {
                mc_weight_w8_msa( p_dst, *p_dst_stride,
                                  p_dst, *p_dst_stride,
                                  pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint64_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    i_offset_val = i_offset_val << i_log2_denom;
                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }
                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );
                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;
                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                    SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 4 == i_width )
            {
                mc_weight_w4_msa( p_dst, *p_dst_stride,
                                  p_dst, *p_dst_stride,
                                  pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;
                    i_offset_val <<= ( i_log2_denom );
                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }
                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );
                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) );
                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;
                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
        }
        return p_dst;
    }
    else if( pWeight->weightfn )
    {
        int32_t i_offset_val, i_log2_denom, i_weight;
        i_log2_denom = pWeight->i_denom;
        i_offset_val = pWeight->i_offset;
        i_weight = pWeight->i_scale;
        i_h4w = i_height - i_height % 4;
        src1_org = p_src1;
        if( 16 == i_width || 12 == i_width )
        {
            mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                               pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                i_offset_val <<= ( i_log2_denom );
                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }
                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );
                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;
                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;
                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                               pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                i_offset_val <<= ( i_log2_denom );
                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }
                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );
                src_vec0 = LD_UB( p_src1 );
                temp0 = LW( p_src1 + 16 );
                p_src1 += i_src_stride;
                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;
                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );
                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;
                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 8 == i_width )
        {
            mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                              pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t u_temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                i_offset_val = i_offset_val << i_log2_denom;
                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }
                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );
                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;
                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                              pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;
            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t u_temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;
                i_offset_val <<= ( i_log2_denom );
                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }
                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );
                u_temp0 = LW( p_src1 );
                p_src1 += i_src_stride;
                src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;
                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        return p_dst;
    }
    else
    {
        *p_dst_stride = i_src_stride;
        return p_src1;
    }
}
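/* Weighting kernels for the block-width classes x264 uses
 * (4, 8, 16 and 20 pixels wide). */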
static weight_fn_t mc_weight_wtab_msa[6] =
{
    mc_weight_w4_msa,
    mc_weight_w4_msa,
    mc_weight_w8_msa,
    mc_weight_w16_msa,
    mc_weight_w16_msa,
    mc_weight_w20_msa,
};
#endif // !HIGH_BIT_DEPTH
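/* Hook the MSA implementations into the motion-compensation function table
 * (8-bit depth only). */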
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
    if( cpu & X264_CPU_MSA )
    {
        pf->mc_luma = mc_luma_msa;
        pf->mc_chroma = mc_chroma_msa;
        pf->get_ref = get_ref_msa;
        pf->avg[PIXEL_16x16] = pixel_avg_16x16_msa;
        pf->avg[PIXEL_16x8] = pixel_avg_16x8_msa;
        pf->avg[PIXEL_8x16] = pixel_avg_8x16_msa;
        pf->avg[PIXEL_8x8] = pixel_avg_8x8_msa;
        pf->avg[PIXEL_8x4] = pixel_avg_8x4_msa;
        pf->avg[PIXEL_4x16] = pixel_avg_4x16_msa;
        pf->avg[PIXEL_4x8] = pixel_avg_4x8_msa;
        pf->avg[PIXEL_4x4] = pixel_avg_4x4_msa;
        pf->avg[PIXEL_4x2] = pixel_avg_4x2_msa;
        pf->weight = mc_weight_wtab_msa;
        pf->offsetadd = mc_weight_wtab_msa;
        pf->offsetsub = mc_weight_wtab_msa;
        pf->copy_16x16_unaligned = mc_copy_w16_msa;
        pf->copy[PIXEL_16x16] = mc_copy_w16_msa;
        pf->copy[PIXEL_8x8] = mc_copy_w8_msa;
        pf->copy[PIXEL_4x4] = mc_copy_w4_msa;
        pf->store_interleave_chroma = store_interleave_chroma_msa;
        pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc_msa;
        pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec_msa;
        pf->plane_copy_interleave = plane_copy_interleave_msa;
        pf->plane_copy_deinterleave = plane_copy_deinterleave_msa;
        pf->plane_copy_deinterleave_rgb = plane_copy_deinterleave_rgb_msa;
        pf->hpel_filter = hpel_filter_msa;
        pf->memcpy_aligned = memcpy;
        pf->memzero_aligned = memzero_aligned_msa;
        pf->frame_init_lowres_core = frame_init_lowres_core_msa;
    }
#endif // !HIGH_BIT_DEPTH
}