/*****************************************************************************
 * deblock-c.c: msa deblocking
 *****************************************************************************
 * Copyright (C) 2015-2018 x264 project
 *
 * Authors: Neha Rana <neha.rana@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "deblock.h"

#if !HIGH_BIT_DEPTH
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \
                                  q3_or_p3_org_in, p1_or_q1_org_in, \
                                  p2_or_q2_org_in, q1_or_p1_org_in, \
                                  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \
{ \
    v8i16 threshold; \
    v8i16 const3 = __msa_ldi_h( 3 ); \
    \
    threshold = p0_or_q0_org_in + q3_or_p3_org_in; \
    threshold += p1_or_q1_org_in; \
    \
    p0_or_q0_out = threshold << 1; \
    p0_or_q0_out += p2_or_q2_org_in; \
    p0_or_q0_out += q1_or_p1_org_in; \
    p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \
    \
    p1_or_q1_out = p2_or_q2_org_in + threshold; \
    p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \
    \
    p2_or_q2_out = p2_or_q2_org_in * const3; \
    p2_or_q2_out += p3_or_q3_org_in; \
    p2_or_q2_out += p3_or_q3_org_in; \
    p2_or_q2_out += threshold; \
    p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \
}
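
/* Illustrative scalar form of the strong (bS == 4) filter that the macro
 * above vectorizes; the names p3..q1 and the *_out values are local to this
 * sketch and are not used elsewhere in the file:
 *
 *   p0_out = ( p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4 ) >> 3;
 *   p1_out = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
 *   p2_out = ( 2 * p3 + 3 * p2 + p1 + p0 + q0 + 4 ) >> 3;
 *
 * Calling the macro with the p/q arguments swapped produces q0/q1/q2. */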

/* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */
#define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \
                          p1_or_q1_org_in, p0_or_q0_out ) \
{ \
    p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \
    p0_or_q0_out += p1_or_q1_org_in; \
    p0_or_q0_out += p1_or_q1_org_in; \
    p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \
}

#define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \
                          p1_or_q1_org_in, p2_or_q2_org_in, \
                          negate_tc_in, tc_in, p1_or_q1_out ) \
{ \
    v8i16 clip3, temp; \
    \
    clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \
                                      ( v8u16 ) q0_or_p0_org_in ); \
    temp = p1_or_q1_org_in << 1; \
    clip3 -= temp; \
    clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \
    clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \
    p1_or_q1_out = p1_or_q1_org_in + clip3; \
}
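
/* Illustrative scalar form of the bS < 4 p1/q1 update done by
 * AVC_LPF_P1_OR_Q1 above (names are local to this sketch); the rounded and
 * truncated averages map to __msa_aver_u_h / __msa_ave_s_h, and clip3 here
 * stands in for the CLIP_SH clamp:
 *
 *   p1_out = p1 + clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - 2 * p1 ) >> 1,
 *                        -tc0, tc0 );
 */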

#define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \
                      p1_or_q1_org_in, q1_or_p1_org_in, \
                      negate_threshold_in, threshold_in, \
                      p0_or_q0_out, q0_or_p0_out ) \
{ \
    v8i16 q0_sub_p0, p1_sub_q1, delta; \
    \
    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
    q0_sub_p0 <<= 2; \
    p1_sub_q1 += 4; \
    delta = q0_sub_p0 + p1_sub_q1; \
    delta >>= 3; \
    \
    delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \
    \
    p0_or_q0_out = p0_or_q0_org_in + delta; \
    q0_or_p0_out = q0_or_p0_org_in - delta; \
    \
    CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \
}
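
/* Illustrative scalar form of the bS < 4 p0/q0 update done by AVC_LPF_P0Q0
 * above (names are local to this sketch; clip3 / clip_uint8 stand in for the
 * CLIP_SH / CLIP_SH2_0_255 clamps, and tc is the threshold_in /
 * negate_threshold_in pair):
 *
 *   delta  = clip3( ( ( ( q0 - p0 ) << 2 ) + ( p1 - q1 ) + 4 ) >> 3, -tc, tc );
 *   p0_out = clip_uint8( p0 + delta );
 *   q0_out = clip_uint8( q0 - delta );
 */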
  98. static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data,
  99. uint8_t u_alpha_in,
  100. uint8_t u_beta_in,
  101. uint32_t u_img_width )
  102. {
  103. v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
  104. v16u8 alpha, beta;
  105. v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
  106. v16u8 p2, p1, p0, q0, q1, q2;
  107. v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
  108. v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
  109. v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
  110. v8i16 p2_r = { 0 };
  111. v8i16 p1_r = { 0 };
  112. v8i16 p0_r = { 0 };
  113. v8i16 q0_r = { 0 };
  114. v8i16 q1_r = { 0 };
  115. v8i16 q2_r = { 0 };
  116. v8i16 p2_l = { 0 };
  117. v8i16 p1_l = { 0 };
  118. v8i16 p0_l = { 0 };
  119. v8i16 q0_l = { 0 };
  120. v8i16 q1_l = { 0 };
  121. v8i16 q2_l = { 0 };
  122. v16u8 tmp_flag;
  123. v16i8 zero = { 0 };
  124. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  125. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  126. LD_UB4( p_data - ( u_img_width << 1 ), u_img_width,
  127. p1_org, p0_org, q0_org, q1_org );
  128. {
  129. v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;
  130. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  131. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  132. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  133. is_less_than_alpha = ( p0_asub_q0 < alpha );
  134. is_less_than_beta = ( p1_asub_p0 < beta );
  135. is_less_than = is_less_than_beta & is_less_than_alpha;
  136. is_less_than_beta = ( q1_asub_q0 < beta );
  137. is_less_than = is_less_than_beta & is_less_than;
  138. }
  139. if( !__msa_test_bz_v( is_less_than ) )
  140. {
  141. q2_org = LD_UB( p_data + ( 2 * u_img_width ) );
  142. p3_org = LD_UB( p_data - ( u_img_width << 2 ) );
  143. p2_org = LD_UB( p_data - ( 3 * u_img_width ) );
  144. UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
  145. UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
  146. UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
  147. tmp_flag = alpha >> 2;
  148. tmp_flag = tmp_flag + 2;
  149. tmp_flag = ( p0_asub_q0 < tmp_flag );
  150. p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
  151. is_less_than_beta = ( p2_asub_p0 < beta );
  152. is_less_than_beta = is_less_than_beta & tmp_flag;
  153. negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
  154. is_less_than_beta = is_less_than_beta & is_less_than;
  155. negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
  156. {
  157. v8u16 is_less_than_beta_l, is_less_than_beta_r;
  158. q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
  159. is_less_than_beta_r =
  160. ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
  161. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
  162. {
  163. v8i16 p3_org_r;
  164. ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
  165. AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
  166. q0_org_r, p1_org_r,
  167. p2_r, q1_org_r, p0_r, p1_r, p2_r );
  168. }
  169. q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
  170. is_less_than_beta_l =
  171. ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
  172. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
  173. {
  174. v8i16 p3_org_l;
  175. ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
  176. AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
  177. q0_org_l, p1_org_l,
  178. p2_l, q1_org_l, p0_l, p1_l, p2_l );
  179. }
  180. }
  181. /* combine and store */
  182. if( !__msa_test_bz_v( is_less_than_beta ) )
  183. {
  184. PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
  185. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
  186. p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
  187. p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
  188. ST_UB( p1_org, p_data - ( 2 * u_img_width ) );
  189. ST_UB( p2_org, p_data - ( 3 * u_img_width ) );
  190. }
  191. {
  192. v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
  193. negate_is_less_than_beta_r =
  194. ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
  195. zero, 8 );
  196. if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
  197. {
  198. AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
  199. }
  200. negate_is_less_than_beta_l =
  201. ( v8u16 ) __msa_sldi_b( zero,
  202. ( v16i8 ) negate_is_less_than_beta, 8 );
  203. if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
  204. {
  205. AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
  206. }
  207. }
  208. if( !__msa_test_bz_v( negate_is_less_than_beta ) )
  209. {
  210. p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
  211. p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
  212. }
  213. ST_UB( p0_org, p_data - u_img_width );
  214. q3_org = LD_UB( p_data + ( 3 * u_img_width ) );
  215. q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
  216. is_less_than_beta = ( q2_asub_q0 < beta );
  217. is_less_than_beta = is_less_than_beta & tmp_flag;
  218. negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
  219. is_less_than_beta = is_less_than_beta & is_less_than;
  220. negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
  221. {
  222. v8u16 is_less_than_beta_l, is_less_than_beta_r;
  223. is_less_than_beta_r =
  224. ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
  225. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
  226. {
  227. v8i16 q3_org_r;
  228. ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
  229. AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
  230. p0_org_r, q1_org_r,
  231. q2_r, p1_org_r, q0_r, q1_r, q2_r );
  232. }
  233. is_less_than_beta_l =
  234. ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
  235. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
  236. {
  237. v8i16 q3_org_l;
  238. ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
  239. AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
  240. p0_org_l, q1_org_l,
  241. q2_l, p1_org_l, q0_l, q1_l, q2_l );
  242. }
  243. }
  244. if( !__msa_test_bz_v( is_less_than_beta ) )
  245. {
  246. PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
  247. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
  248. q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
  249. q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
  250. ST_UB( q1_org, p_data + u_img_width );
  251. ST_UB( q2_org, p_data + 2 * u_img_width );
  252. }
  253. {
  254. v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
  255. negate_is_less_than_beta_r =
  256. ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
  257. zero, 8 );
  258. if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
  259. {
  260. AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
  261. }
  262. negate_is_less_than_beta_l =
  263. ( v8u16 ) __msa_sldi_b( zero,
  264. ( v16i8 ) negate_is_less_than_beta, 8 );
  265. if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
  266. {
  267. AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
  268. }
  269. }
  270. if( !__msa_test_bz_v( negate_is_less_than_beta ) )
  271. {
  272. q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
  273. q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
  274. }
  275. ST_UB( q0_org, p_data );
  276. }
  277. }
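
/* The routine above is the intra (bS == 4) luma filter for a horizontal
 * edge.  In scalar terms (illustrative only), a column is filtered when
 *   abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta,
 * and the strong three-output p-side filter is applied when additionally
 *   abs( p0 - q0 ) < ( alpha >> 2 ) + 2 && abs( p2 - p0 ) < beta
 * (tmp_flag and is_less_than_beta above); otherwise only p0 is replaced with
 * ( 2 * p1 + p0 + q1 + 2 ) >> 2.  The q side is symmetric, and __msa_bmnz_v
 * merges filtered and original pixels per column under these masks. */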
  278. static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data,
  279. uint8_t u_alpha_in,
  280. uint8_t u_beta_in,
  281. uint32_t u_img_width )
  282. {
  283. uint8_t *p_src;
  284. v16u8 alpha, beta, p0_asub_q0;
  285. v16u8 is_less_than_alpha, is_less_than;
  286. v16u8 is_less_than_beta, negate_is_less_than_beta;
  287. v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
  288. v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
  289. v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
  290. v8i16 p2_r = { 0 };
  291. v8i16 p1_r = { 0 };
  292. v8i16 p0_r = { 0 };
  293. v8i16 q0_r = { 0 };
  294. v8i16 q1_r = { 0 };
  295. v8i16 q2_r = { 0 };
  296. v8i16 p2_l = { 0 };
  297. v8i16 p1_l = { 0 };
  298. v8i16 p0_l = { 0 };
  299. v8i16 q0_l = { 0 };
  300. v8i16 q1_l = { 0 };
  301. v8i16 q2_l = { 0 };
  302. v16i8 zero = { 0 };
  303. v16u8 tmp_flag;
  304. p_src = p_data - 4;
  305. {
  306. v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  307. v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  308. LD_UB8( p_src, u_img_width,
  309. row0, row1, row2, row3, row4, row5, row6, row7 );
  310. LD_UB8( p_src + ( 8 * u_img_width ), u_img_width,
  311. row8, row9, row10, row11, row12, row13, row14, row15 );
  312. TRANSPOSE16x8_UB_UB( row0, row1, row2, row3,
  313. row4, row5, row6, row7,
  314. row8, row9, row10, row11,
  315. row12, row13, row14, row15,
  316. p3_org, p2_org, p1_org, p0_org,
  317. q0_org, q1_org, q2_org, q3_org );
  318. }
  319. UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
  320. UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
  321. UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
  322. UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
  323. {
  324. v16u8 p1_asub_p0, q1_asub_q0;
  325. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  326. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  327. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  328. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  329. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  330. is_less_than_alpha = ( p0_asub_q0 < alpha );
  331. is_less_than_beta = ( p1_asub_p0 < beta );
  332. is_less_than = is_less_than_beta & is_less_than_alpha;
  333. is_less_than_beta = ( q1_asub_q0 < beta );
  334. is_less_than = is_less_than_beta & is_less_than;
  335. }
  336. if( !__msa_test_bz_v( is_less_than ) )
  337. {
  338. tmp_flag = alpha >> 2;
  339. tmp_flag = tmp_flag + 2;
  340. tmp_flag = ( p0_asub_q0 < tmp_flag );
  341. {
  342. v16u8 p2_asub_p0;
  343. p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
  344. is_less_than_beta = ( p2_asub_p0 < beta );
  345. }
  346. is_less_than_beta = tmp_flag & is_less_than_beta;
  347. negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
  348. is_less_than_beta = is_less_than_beta & is_less_than;
  349. negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
  350. {
  351. v16u8 is_less_than_beta_r;
  352. is_less_than_beta_r =
  353. ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
  354. if( !__msa_test_bz_v( is_less_than_beta_r ) )
  355. {
  356. v8i16 p3_org_r;
  357. ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
  358. AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
  359. q0_org_r, p1_org_r,
  360. p2_r, q1_org_r, p0_r, p1_r, p2_r );
  361. }
  362. }
  363. {
  364. v16u8 is_less_than_beta_l;
  365. is_less_than_beta_l =
  366. ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
  367. if( !__msa_test_bz_v( is_less_than_beta_l ) )
  368. {
  369. v8i16 p3_org_l;
  370. ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
  371. AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
  372. q0_org_l, p1_org_l,
  373. p2_l, q1_org_l, p0_l, p1_l, p2_l );
  374. }
  375. }
  376. if( !__msa_test_bz_v( is_less_than_beta ) )
  377. {
  378. v16u8 p0, p2, p1;
  379. PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
  380. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
  381. p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
  382. p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
  383. }
  384. {
  385. v16u8 negate_is_less_than_beta_r;
  386. negate_is_less_than_beta_r =
  387. ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
  388. zero, 8 );
  389. if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
  390. {
  391. AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
  392. }
  393. }
  394. {
  395. v16u8 negate_is_less_than_beta_l;
  396. negate_is_less_than_beta_l =
  397. ( v16u8 ) __msa_sldi_b( zero,
  398. ( v16i8 ) negate_is_less_than_beta, 8 );
  399. if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
  400. {
  401. AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
  402. }
  403. }
  404. if( !__msa_test_bz_v( negate_is_less_than_beta ) )
  405. {
  406. v16u8 p0;
  407. p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
  408. p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
  409. }
  410. {
  411. v16u8 q2_asub_q0;
  412. q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
  413. is_less_than_beta = ( q2_asub_q0 < beta );
  414. }
  415. is_less_than_beta = is_less_than_beta & tmp_flag;
  416. negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
  417. is_less_than_beta = is_less_than_beta & is_less_than;
  418. negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
  419. {
  420. v16u8 is_less_than_beta_r;
  421. is_less_than_beta_r =
  422. ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
  423. if( !__msa_test_bz_v( is_less_than_beta_r ) )
  424. {
  425. v8i16 q3_org_r;
  426. ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
  427. AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
  428. p0_org_r, q1_org_r,
  429. q2_r, p1_org_r, q0_r, q1_r, q2_r );
  430. }
  431. }
  432. {
  433. v16u8 is_less_than_beta_l;
  434. is_less_than_beta_l =
  435. ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
  436. if( !__msa_test_bz_v( is_less_than_beta_l ) )
  437. {
  438. v8i16 q3_org_l;
  439. ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
  440. AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
  441. p0_org_l, q1_org_l,
  442. q2_l, p1_org_l, q0_l, q1_l, q2_l );
  443. }
  444. }
  445. if( !__msa_test_bz_v( is_less_than_beta ) )
  446. {
  447. v16u8 q0, q1, q2;
  448. PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
  449. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
  450. q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
  451. q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
  452. }
  453. {
  454. v16u8 negate_is_less_than_beta_r;
  455. negate_is_less_than_beta_r =
  456. ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
  457. zero, 8 );
  458. if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
  459. {
  460. AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
  461. }
  462. }
  463. {
  464. v16u8 negate_is_less_than_beta_l;
  465. negate_is_less_than_beta_l =
  466. ( v16u8 ) __msa_sldi_b( zero,
  467. ( v16i8 ) negate_is_less_than_beta, 8 );
  468. if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
  469. {
  470. AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
  471. }
  472. }
  473. if( !__msa_test_bz_v( negate_is_less_than_beta ) )
  474. {
  475. v16u8 q0;
  476. q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
  477. q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
  478. }
  479. }
  480. {
  481. v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  482. ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 );
  483. ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 );
  484. ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );
  485. ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 );
  486. ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 );
  487. p_src = p_data - 3;
  488. ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width );
  489. ST2x4_UB( tmp2, 0, p_src + 4, u_img_width );
  490. p_src += 4 * u_img_width;
  491. ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width );
  492. ST2x4_UB( tmp2, 4, p_src + 4, u_img_width );
  493. p_src += 4 * u_img_width;
  494. ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width );
  495. ST2x4_UB( tmp5, 0, p_src + 4, u_img_width );
  496. p_src += 4 * u_img_width;
  497. ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width );
  498. ST2x4_UB( tmp5, 4, p_src + 4, u_img_width );
  499. }
  500. }
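
/* The vertical-edge variant above reuses the same filter maths: it loads 16
 * rows of 8 pixels straddling the edge, transposes them so p3..q3 become
 * whole vectors, filters, then interleaves back and stores the six modified
 * bytes p2..q2 of each row starting at p_data - 3 via ST4x4_UB / ST2x4_UB. */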
  501. static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma,
  502. uint8_t u_alpha_in,
  503. uint8_t u_beta_in,
  504. uint32_t u_img_width )
  505. {
  506. v16u8 alpha, beta, is_less_than;
  507. v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
  508. v8i16 p0_r = { 0 };
  509. v8i16 q0_r = { 0 };
  510. v8i16 p0_l = { 0 };
  511. v8i16 q0_l = { 0 };
  512. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  513. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  514. LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
  515. p1_org, p0_org, q0_org, q1_org );
  516. {
  517. v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
  518. v16u8 is_less_than_alpha, is_less_than_beta;
  519. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  520. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  521. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  522. is_less_than_alpha = ( p0_asub_q0 < alpha );
  523. is_less_than_beta = ( p1_asub_p0 < beta );
  524. is_less_than = is_less_than_beta & is_less_than_alpha;
  525. is_less_than_beta = ( q1_asub_q0 < beta );
  526. is_less_than = is_less_than_beta & is_less_than;
  527. }
  528. if( !__msa_test_bz_v( is_less_than ) )
  529. {
  530. v16i8 zero = { 0 };
  531. v16u8 is_less_than_r, is_less_than_l;
  532. is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
  533. zero, 8 );
  534. if( !__msa_test_bz_v( is_less_than_r ) )
  535. {
  536. v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
  537. ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
  538. zero, q1_org, p1_org_r, p0_org_r, q0_org_r,
  539. q1_org_r );
  540. AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
  541. AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
  542. }
  543. is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
  544. ( v16i8 ) is_less_than, 8 );
  545. if( !__msa_test_bz_v( is_less_than_l ) )
  546. {
  547. v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
  548. ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
  549. zero, q1_org, p1_org_l, p0_org_l, q0_org_l,
  550. q1_org_l );
  551. AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
  552. AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
  553. }
  554. PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
  555. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
  556. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
  557. ST_UB( p0_org, ( p_chroma - u_img_width ) );
  558. ST_UB( q0_org, p_chroma );
  559. }
  560. }
  561. static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma,
  562. uint8_t u_alpha_in,
  563. uint8_t u_beta_in,
  564. uint32_t u_img_width )
  565. {
  566. v16u8 is_less_than;
  567. v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
  568. v8i16 p0_r = { 0 };
  569. v8i16 q0_r = { 0 };
  570. v8i16 p0_l = { 0 };
  571. v8i16 q0_l = { 0 };
  572. v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
  573. v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
  574. v16i8 tmp0, tmp1, tmp2, tmp3;
  575. v4i32 vec0, vec1;
  576. v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  577. LD_UB8( ( p_chroma - 4 ), u_img_width,
  578. row0, row1, row2, row3, row4, row5, row6, row7 );
  579. TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
  580. p1_u_org, p1_v_org, p0_u_org, p0_v_org,
  581. q0_u_org, q0_v_org, q1_u_org, q1_v_org );
  582. ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
  583. q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );
  584. {
  585. v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
  586. v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta;
  587. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  588. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  589. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  590. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  591. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  592. is_less_than_alpha = ( p0_asub_q0 < alpha );
  593. is_less_than_beta = ( p1_asub_p0 < beta );
  594. is_less_than = is_less_than_beta & is_less_than_alpha;
  595. is_less_than_beta = ( q1_asub_q0 < beta );
  596. is_less_than = is_less_than_beta & is_less_than;
  597. }
  598. if( !__msa_test_bz_v( is_less_than ) )
  599. {
  600. v16u8 is_less_than_r, is_less_than_l;
  601. v16i8 zero = { 0 };
  602. is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
  603. zero, 8 );
  604. if( !__msa_test_bz_v( is_less_than_r ) )
  605. {
  606. v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
  607. ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
  608. zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r );
  609. AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
  610. AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
  611. }
  612. is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
  613. ( v16i8 ) is_less_than, 8 );
  614. if( !__msa_test_bz_v( is_less_than_l ) )
  615. {
  616. v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
  617. ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
  618. zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l );
  619. AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
  620. AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
  621. }
  622. PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
  623. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
  624. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
  625. SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
  626. ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
  627. ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
  628. ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );
  629. ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
  630. }
  631. }
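
/* The chroma routines in this file work on interleaved CbCr data: the 8x8
 * transpose above yields alternating U and V rows, and ILVR_D4_UB packs each
 * working vector with the U samples in its low 8 bytes and the V samples in
 * its high 8 bytes, so both planes are filtered in one pass with the same
 * alpha/beta masks.  The trailing interleave sequence restores the CbCr byte
 * order before ST4x8_UB. */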
  632. static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data,
  633. uint8_t u_bs0,
  634. uint8_t u_bs1,
  635. uint8_t u_bs2,
  636. uint8_t u_bs3,
  637. uint8_t u_tc0,
  638. uint8_t u_tc1,
  639. uint8_t u_tc2,
  640. uint8_t u_tc3,
  641. uint8_t u_alpha_in,
  642. uint8_t u_beta_in,
  643. uint32_t u_img_width )
  644. {
  645. uint8_t *p_src;
  646. v16u8 beta, tmp_vec, bs = { 0 };
  647. v16u8 tc = { 0 };
  648. v16u8 is_less_than, is_less_than_beta;
  649. v16u8 p1, p0, q0, q1;
  650. v8i16 p0_r, q0_r, p1_r = { 0 };
  651. v8i16 q1_r = { 0 };
  652. v8i16 p0_l, q0_l, p1_l = { 0 };
  653. v8i16 q1_l = { 0 };
  654. v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
  655. v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
  656. v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
  657. v8i16 tc_r, tc_l;
  658. v16i8 zero = { 0 };
  659. v16u8 is_bs_greater_than0;
  660. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
  661. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
  662. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
  663. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
  664. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
  665. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
  666. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
  667. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );
  668. if( !__msa_test_bz_v( bs ) )
  669. {
  670. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
  671. tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
  672. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
  673. tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
  674. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
  675. tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
  676. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
  677. tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );
  678. is_bs_greater_than0 = ( zero < bs );
  679. {
  680. v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  681. v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  682. p_src = p_data;
  683. p_src -= 4;
  684. LD_UB8( p_src, u_img_width,
  685. row0, row1, row2, row3, row4, row5, row6, row7 );
  686. p_src += ( 8 * u_img_width );
  687. LD_UB8( p_src, u_img_width,
  688. row8, row9, row10, row11, row12, row13, row14, row15 );
  689. TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
  690. row8, row9, row10, row11,
  691. row12, row13, row14, row15,
  692. p3_org, p2_org, p1_org, p0_org,
  693. q0_org, q1_org, q2_org, q3_org );
  694. }
  695. {
  696. v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
  697. v16u8 is_less_than_alpha;
  698. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  699. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  700. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  701. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  702. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  703. is_less_than_alpha = ( p0_asub_q0 < alpha );
  704. is_less_than_beta = ( p1_asub_p0 < beta );
  705. is_less_than = is_less_than_beta & is_less_than_alpha;
  706. is_less_than_beta = ( q1_asub_q0 < beta );
  707. is_less_than = is_less_than_beta & is_less_than;
  708. is_less_than = is_less_than & is_bs_greater_than0;
  709. }
  710. if( !__msa_test_bz_v( is_less_than ) )
  711. {
  712. v16i8 negate_tc, sign_negate_tc;
  713. v8i16 negate_tc_r, i16_negatetc_l;
  714. negate_tc = zero - ( v16i8 ) tc;
  715. sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
  716. ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
  717. i16_negatetc_l );
  718. UNPCK_UB_SH( tc, tc_r, tc_l );
  719. UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
  720. UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
  721. UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
  722. {
  723. v16u8 p2_asub_p0;
  724. v16u8 is_less_than_beta_r, is_less_than_beta_l;
  725. p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
  726. is_less_than_beta = ( p2_asub_p0 < beta );
  727. is_less_than_beta = is_less_than_beta & is_less_than;
  728. is_less_than_beta_r =
  729. ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
  730. zero, 8 );
  731. if( !__msa_test_bz_v( is_less_than_beta_r ) )
  732. {
  733. p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );
  734. AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
  735. negate_tc_r, tc_r, p1_r );
  736. }
  737. is_less_than_beta_l =
  738. ( v16u8 ) __msa_sldi_b( zero,
  739. ( v16i8 ) is_less_than_beta, 8 );
  740. if( !__msa_test_bz_v( is_less_than_beta_l ) )
  741. {
  742. p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );
  743. AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
  744. i16_negatetc_l, tc_l, p1_l );
  745. }
  746. }
  747. if( !__msa_test_bz_v( is_less_than_beta ) )
  748. {
  749. p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
  750. p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
  751. is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
  752. tc = tc + is_less_than_beta;
  753. }
  754. {
  755. v16u8 u8_q2asub_q0;
  756. v16u8 is_less_than_beta_l, is_less_than_beta_r;
  757. u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
  758. is_less_than_beta = ( u8_q2asub_q0 < beta );
  759. is_less_than_beta = is_less_than_beta & is_less_than;
  760. q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
  761. is_less_than_beta_r =
  762. ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
  763. zero, 8 );
  764. if( !__msa_test_bz_v( is_less_than_beta_r ) )
  765. {
  766. q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
  767. AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
  768. negate_tc_r, tc_r, q1_r );
  769. }
  770. q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
  771. is_less_than_beta_l =
  772. ( v16u8 ) __msa_sldi_b( zero,
  773. ( v16i8 ) is_less_than_beta, 8 );
  774. if( !__msa_test_bz_v( is_less_than_beta_l ) )
  775. {
  776. q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
  777. AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
  778. i16_negatetc_l, tc_l, q1_l );
  779. }
  780. }
  781. if( !__msa_test_bz_v( is_less_than_beta ) )
  782. {
  783. q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
  784. q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
  785. is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
  786. tc = tc + is_less_than_beta;
  787. }
  788. {
  789. v8i16 threshold_r, negate_thresh_r;
  790. v8i16 threshold_l, negate_thresh_l;
  791. v16i8 negate_thresh, sign_negate_thresh;
  792. negate_thresh = zero - ( v16i8 ) tc;
  793. sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );
  794. ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
  795. threshold_r, negate_thresh_r );
  796. AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
  797. negate_thresh_r, threshold_r, p0_r, q0_r );
  798. threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc );
  799. negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
  800. negate_thresh );
  801. AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
  802. negate_thresh_l, threshold_l, p0_l, q0_l );
  803. }
  804. PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
  805. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
  806. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
  807. }
  808. {
  809. v16i8 tp0, tp1, tp2, tp3;
  810. v8i16 tmp2, tmp5;
  811. v4i32 tmp3, tmp4, tmp6, tmp7;
  812. uint32_t u_out0, u_out2;
  813. uint16_t u_out1, u_out3;
  814. p_src = p_data - 3;
  815. ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 );
  816. ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 );
  817. ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );
  818. ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 );
  819. ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 );
  820. u_out0 = __msa_copy_u_w( tmp3, 0 );
  821. u_out1 = __msa_copy_u_h( tmp2, 0 );
  822. u_out2 = __msa_copy_u_w( tmp3, 1 );
  823. u_out3 = __msa_copy_u_h( tmp2, 1 );
  824. SW( u_out0, p_src );
  825. SH( u_out1, ( p_src + 4 ) );
  826. p_src += u_img_width;
  827. SW( u_out2, p_src );
  828. SH( u_out3, ( p_src + 4 ) );
  829. u_out0 = __msa_copy_u_w( tmp3, 2 );
  830. u_out1 = __msa_copy_u_h( tmp2, 2 );
  831. u_out2 = __msa_copy_u_w( tmp3, 3 );
  832. u_out3 = __msa_copy_u_h( tmp2, 3 );
  833. p_src += u_img_width;
  834. SW( u_out0, p_src );
  835. SH( u_out1, ( p_src + 4 ) );
  836. p_src += u_img_width;
  837. SW( u_out2, p_src );
  838. SH( u_out3, ( p_src + 4 ) );
  839. u_out0 = __msa_copy_u_w( tmp4, 0 );
  840. u_out1 = __msa_copy_u_h( tmp2, 4 );
  841. u_out2 = __msa_copy_u_w( tmp4, 1 );
  842. u_out3 = __msa_copy_u_h( tmp2, 5 );
  843. p_src += u_img_width;
  844. SW( u_out0, p_src );
  845. SH( u_out1, ( p_src + 4 ) );
  846. p_src += u_img_width;
  847. SW( u_out2, p_src );
  848. SH( u_out3, ( p_src + 4 ) );
  849. u_out0 = __msa_copy_u_w( tmp4, 2 );
  850. u_out1 = __msa_copy_u_h( tmp2, 6 );
  851. u_out2 = __msa_copy_u_w( tmp4, 3 );
  852. u_out3 = __msa_copy_u_h( tmp2, 7 );
  853. p_src += u_img_width;
  854. SW( u_out0, p_src );
  855. SH( u_out1, ( p_src + 4 ) );
  856. p_src += u_img_width;
  857. SW( u_out2, p_src );
  858. SH( u_out3, ( p_src + 4 ) );
  859. u_out0 = __msa_copy_u_w( tmp6, 0 );
  860. u_out1 = __msa_copy_u_h( tmp5, 0 );
  861. u_out2 = __msa_copy_u_w( tmp6, 1 );
  862. u_out3 = __msa_copy_u_h( tmp5, 1 );
  863. p_src += u_img_width;
  864. SW( u_out0, p_src );
  865. SH( u_out1, ( p_src + 4 ) );
  866. p_src += u_img_width;
  867. SW( u_out2, p_src );
  868. SH( u_out3, ( p_src + 4 ) );
  869. u_out0 = __msa_copy_u_w( tmp6, 2 );
  870. u_out1 = __msa_copy_u_h( tmp5, 2 );
  871. u_out2 = __msa_copy_u_w( tmp6, 3 );
  872. u_out3 = __msa_copy_u_h( tmp5, 3 );
  873. p_src += u_img_width;
  874. SW( u_out0, p_src );
  875. SH( u_out1, ( p_src + 4 ) );
  876. p_src += u_img_width;
  877. SW( u_out2, p_src );
  878. SH( u_out3, ( p_src + 4 ) );
  879. u_out0 = __msa_copy_u_w( tmp7, 0 );
  880. u_out1 = __msa_copy_u_h( tmp5, 4 );
  881. u_out2 = __msa_copy_u_w( tmp7, 1 );
  882. u_out3 = __msa_copy_u_h( tmp5, 5 );
  883. p_src += u_img_width;
  884. SW( u_out0, p_src );
  885. SH( u_out1, ( p_src + 4 ) );
  886. p_src += u_img_width;
  887. SW( u_out2, p_src );
  888. SH( u_out3, ( p_src + 4 ) );
  889. u_out0 = __msa_copy_u_w( tmp7, 2 );
  890. u_out1 = __msa_copy_u_h( tmp5, 6 );
  891. u_out2 = __msa_copy_u_w( tmp7, 3 );
  892. u_out3 = __msa_copy_u_h( tmp5, 7 );
  893. p_src += u_img_width;
  894. SW( u_out0, p_src );
  895. SH( u_out1, ( p_src + 4 ) );
  896. p_src += u_img_width;
  897. SW( u_out2, p_src );
  898. SH( u_out3, ( p_src + 4 ) );
  899. }
  900. }
  901. }
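
/* In the vertical inter-edge routine above, u_bs0..3 and u_tc0..3 are
 * broadcast into the four word lanes of bs/tc, so after the 16x8 transpose
 * each group of four rows is filtered with its own strength and threshold;
 * the SW/SH pairs at the end scatter the six modified bytes p2..q2 of every
 * row back to p_data - 3. */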
  902. static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data,
  903. uint8_t u_bs0,
  904. uint8_t u_bs1,
  905. uint8_t u_bs2,
  906. uint8_t u_bs3,
  907. uint8_t u_tc0,
  908. uint8_t u_tc1,
  909. uint8_t u_tc2,
  910. uint8_t u_tc3,
  911. uint8_t u_alpha_in,
  912. uint8_t u_beta_in,
  913. uint32_t u_image_width )
  914. {
  915. v16u8 p2_asub_p0, u8_q2asub_q0;
  916. v16u8 alpha, beta, is_less_than, is_less_than_beta;
  917. v16u8 p1, p0, q0, q1;
  918. v8i16 p1_r = { 0 };
  919. v8i16 p0_r, q0_r, q1_r = { 0 };
  920. v8i16 p1_l = { 0 };
  921. v8i16 p0_l, q0_l, q1_l = { 0 };
  922. v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
  923. v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
  924. v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
  925. v16i8 zero = { 0 };
  926. v16u8 tmp_vec;
  927. v16u8 bs = { 0 };
  928. v16i8 tc = { 0 };
  929. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
  930. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
  931. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
  932. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
  933. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
  934. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
  935. tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
  936. bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );
  937. if( !__msa_test_bz_v( bs ) )
  938. {
  939. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
  940. tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
  941. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
  942. tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
  943. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
  944. tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
  945. tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
  946. tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );
  947. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  948. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  949. LD_UB5( p_data - ( 3 * u_image_width ), u_image_width,
  950. p2_org, p1_org, p0_org, q0_org, q1_org );
  951. {
  952. v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
  953. v16u8 is_less_than_alpha, is_bs_greater_than0;
  954. is_bs_greater_than0 = ( ( v16u8 ) zero < bs );
  955. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  956. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  957. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  958. is_less_than_alpha = ( p0_asub_q0 < alpha );
  959. is_less_than_beta = ( p1_asub_p0 < beta );
  960. is_less_than = is_less_than_beta & is_less_than_alpha;
  961. is_less_than_beta = ( q1_asub_q0 < beta );
  962. is_less_than = is_less_than_beta & is_less_than;
  963. is_less_than = is_less_than & is_bs_greater_than0;
  964. }
  965. if( !__msa_test_bz_v( is_less_than ) )
  966. {
  967. v16i8 sign_negate_tc, negate_tc;
  968. v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
  969. q2_org = LD_UB( p_data + ( 2 * u_image_width ) );
  970. negate_tc = zero - tc;
  971. sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
  972. ILVRL_B2_SH( sign_negate_tc, negate_tc,
  973. negate_tc_r, i16_negatetc_l );
  974. UNPCK_UB_SH( tc, tc_r, tc_l );
  975. UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
  976. UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
  977. UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
  978. p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
  979. is_less_than_beta = ( p2_asub_p0 < beta );
  980. is_less_than_beta = is_less_than_beta & is_less_than;
  981. {
  982. v8u16 is_less_than_beta_r, is_less_than_beta_l;
  983. is_less_than_beta_r =
  984. ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
  985. zero, 8 );
  986. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
  987. {
  988. p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );
  989. AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
  990. negate_tc_r, tc_r, p1_r );
  991. }
  992. is_less_than_beta_l =
  993. ( v8u16 ) __msa_sldi_b( zero,
  994. ( v16i8 ) is_less_than_beta, 8 );
  995. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
  996. {
  997. p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );
  998. AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
  999. i16_negatetc_l, tc_l, p1_l );
  1000. }
  1001. }
  1002. if( !__msa_test_bz_v( is_less_than_beta ) )
  1003. {
  1004. p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
  1005. p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
  1006. ST_UB( p1_org, p_data - ( 2 * u_image_width ) );
  1007. is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
  1008. tc = tc + ( v16i8 ) is_less_than_beta;
  1009. }
  1010. u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
  1011. is_less_than_beta = ( u8_q2asub_q0 < beta );
  1012. is_less_than_beta = is_less_than_beta & is_less_than;
  1013. {
  1014. v8u16 is_less_than_beta_r, is_less_than_beta_l;
  1015. is_less_than_beta_r =
  1016. ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
  1017. zero, 8 );
  1018. q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
  1019. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
  1020. {
  1021. q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
  1022. AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
  1023. negate_tc_r, tc_r, q1_r );
  1024. }
  1025. is_less_than_beta_l =
  1026. ( v8u16 ) __msa_sldi_b( zero,
  1027. ( v16i8 ) is_less_than_beta, 8 );
  1028. q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
  1029. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
  1030. {
  1031. q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
  1032. AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
  1033. i16_negatetc_l, tc_l, q1_l );
  1034. }
  1035. }
  1036. if( !__msa_test_bz_v( is_less_than_beta ) )
  1037. {
  1038. q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
  1039. q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
  1040. ST_UB( q1_org, p_data + u_image_width );
  1041. is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
  1042. tc = tc + ( v16i8 ) is_less_than_beta;
  1043. }
  1044. {
  1045. v16i8 negate_thresh, sign_negate_thresh;
  1046. v8i16 threshold_r, threshold_l;
  1047. v8i16 negate_thresh_l, negate_thresh_r;
  1048. negate_thresh = zero - tc;
  1049. sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );
  1050. ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
  1051. threshold_r, negate_thresh_r );
  1052. AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
  1053. negate_thresh_r, threshold_r, p0_r, q0_r );
  1054. threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc );
  1055. negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
  1056. negate_thresh );
  1057. AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
  1058. negate_thresh_l, threshold_l, p0_l, q0_l );
  1059. }
  1060. PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
  1061. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
  1062. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
  1063. ST_UB( p0_org, ( p_data - u_image_width ) );
  1064. ST_UB( q0_org, p_data );
  1065. }
  1066. }
  1067. }
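
/* Scalar sketch of the bS < 4 luma flow implemented by the two inter-edge
 * routines above (illustrative; clip3 / clip_uint8 stand in for the CLIP_*
 * vector macros, and the *_out values are distinct from the unfiltered
 * inputs):
 *
 *   tc = tc0;
 *   if( abs( p2 - p0 ) < beta )
 *   {
 *       p1_out = p1 + clip3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - 2 * p1 ) >> 1,
 *                            -tc0, tc0 );
 *       tc++;
 *   }
 *   if( abs( q2 - q0 ) < beta )
 *   {
 *       q1_out = q1 + clip3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - 2 * q1 ) >> 1,
 *                            -tc0, tc0 );
 *       tc++;
 *   }
 *   delta  = clip3( ( ( ( q0 - p0 ) << 2 ) + ( p1 - q1 ) + 4 ) >> 3, -tc, tc );
 *   p0_out = clip_uint8( p0 + delta );
 *   q0_out = clip_uint8( q0 - delta );
 *
 * i.e. tc starts at tc0 and is incremented once for each side whose second
 * neighbour also passes the beta test, matching the
 * __msa_andi_b( is_less_than_beta, 1 ) additions above. */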
  1068. static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma,
  1069. uint8_t u_bs0,
  1070. uint8_t u_bs1,
  1071. uint8_t u_bs2,
  1072. uint8_t u_bs3,
  1073. uint8_t u_tc0,
  1074. uint8_t u_tc1,
  1075. uint8_t u_tc2,
  1076. uint8_t u_tc3,
  1077. uint8_t u_alpha_in,
  1078. uint8_t u_beta_in,
  1079. uint32_t u_img_width )
  1080. {
  1081. v16u8 alpha, beta;
  1082. v4i32 tmp_vec, bs = { 0 };
  1083. v4i32 tc = { 0 };
  1084. v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
  1085. v16u8 is_less_than;
  1086. v8i16 is_less_than_r, is_less_than_l;
  1087. v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
  1088. v16u8 p0, q0;
  1089. v8i16 p0_r = { 0 };
  1090. v8i16 q0_r = { 0 };
  1091. v8i16 p0_l = { 0 };
  1092. v8i16 q0_l = { 0 };
  1093. v16u8 p1_org, p0_org, q0_org, q1_org;
  1094. v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
  1095. v16i8 negate_tc, sign_negate_tc;
  1096. v8i16 negate_tc_r, i16_negatetc_l;
  1097. v8i16 tc_r, tc_l;
  1098. v16i8 zero = { 0 };
  1099. v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
  1100. tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 );
  1101. bs = __msa_insve_w( bs, 0, tmp_vec );
  1102. tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 );
  1103. bs = __msa_insve_w( bs, 1, tmp_vec );
  1104. tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 );
  1105. bs = __msa_insve_w( bs, 2, tmp_vec );
  1106. tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 );
  1107. bs = __msa_insve_w( bs, 3, tmp_vec );
  1108. if( !__msa_test_bz_v( ( v16u8 ) bs ) )
  1109. {
  1110. tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 );
  1111. tc = __msa_insve_w( tc, 0, tmp_vec );
  1112. tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 );
  1113. tc = __msa_insve_w( tc, 1, tmp_vec );
  1114. tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 );
  1115. tc = __msa_insve_w( tc, 2, tmp_vec );
  1116. tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 );
  1117. tc = __msa_insve_w( tc, 3, tmp_vec );
  1118. is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );
  1119. alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
  1120. beta = ( v16u8 ) __msa_fill_b( u_beta_in );
  1121. LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
  1122. p1_org, p0_org, q0_org, q1_org );
  1123. p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
  1124. p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
  1125. q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );
  1126. is_less_than_alpha = ( p0_asub_q0 < alpha );
  1127. is_less_than_beta = ( p1_asub_p0 < beta );
  1128. is_less_than = is_less_than_beta & is_less_than_alpha;
  1129. is_less_than_beta = ( q1_asub_q0 < beta );
  1130. is_less_than = is_less_than_beta & is_less_than;
  1131. is_less_than = is_less_than & is_bs_greater_than0;
  1132. if( !__msa_test_bz_v( is_less_than ) )
  1133. {
  1134. negate_tc = zero - ( v16i8 ) tc;
  1135. sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );
  1136. ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
  1137. i16_negatetc_l );
  1138. UNPCK_UB_SH( tc, tc_r, tc_l );
  1139. UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
  1140. UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
  1141. UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
  1142. UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );
  1143. is_less_than_r =
  1144. ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 );
  1145. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
  1146. {
  1147. AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
  1148. negate_tc_r, tc_r, p0_r, q0_r );
  1149. }
  1150. is_less_than_l =
  1151. ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 );
  1152. if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
  1153. {
  1154. AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
  1155. i16_negatetc_l, tc_l, p0_l, q0_l );
  1156. }
  1157. PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );
  1158. p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
  1159. q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
  1160. ST_UB( p0_org, p_chroma - u_img_width );
  1161. ST_UB( q0_org, p_chroma );
  1162. }
  1163. }
  1164. }
static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma,
                                                         uint8_t u_bs0,
                                                         uint8_t u_bs1,
                                                         uint8_t u_bs2,
                                                         uint8_t u_bs3,
                                                         uint8_t u_tc0,
                                                         uint8_t u_tc1,
                                                         uint8_t u_tc2,
                                                         uint8_t u_tc3,
                                                         uint8_t u_alpha_in,
                                                         uint8_t u_beta_in,
                                                         uint32_t u_img_width )
{
    v16u8 alpha, beta;
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than1;
    v8i16 is_less_than_r, is_less_than_l;
    v16u8 is_less_than_beta, is_less_than_alpha;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
    v16u8 is_bs_less_than4, is_bs_greater_than0;
    v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
    v16u8 const4;
    v16i8 zero = { 0 };
    v8i16 tmp_vec, bs = { 0 };
    v8i16 tc = { 0 };
    v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
    v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
    v16i8 tmp0, tmp1, tmp2, tmp3;
    v4i32 vec0, vec1;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16i8 negate_tc, sign_negate_tc;

    const4 = ( v16u8 ) __msa_ldi_b( 4 );

    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 );
    bs = __msa_insve_h( bs, 0, tmp_vec );
    bs = __msa_insve_h( bs, 4, tmp_vec );
    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 );
    bs = __msa_insve_h( bs, 1, tmp_vec );
    bs = __msa_insve_h( bs, 5, tmp_vec );
    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 );
    bs = __msa_insve_h( bs, 2, tmp_vec );
    bs = __msa_insve_h( bs, 6, tmp_vec );
    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 );
    bs = __msa_insve_h( bs, 3, tmp_vec );
    bs = __msa_insve_h( bs, 7, tmp_vec );

    if( !__msa_test_bz_v( ( v16u8 ) bs ) )
    {
        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 );
        tc = __msa_insve_h( tc, 0, tmp_vec );
        tc = __msa_insve_h( tc, 4, tmp_vec );
        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 );
        tc = __msa_insve_h( tc, 1, tmp_vec );
        tc = __msa_insve_h( tc, 5, tmp_vec );
        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 );
        tc = __msa_insve_h( tc, 2, tmp_vec );
        tc = __msa_insve_h( tc, 6, tmp_vec );
        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 );
        tc = __msa_insve_h( tc, 3, tmp_vec );
        tc = __msa_insve_h( tc, 7, tmp_vec );

        is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );

        LD_UB8( ( p_chroma - 4 ), u_img_width,
                row0, row1, row2, row3, row4, row5, row6, row7 );

        TRANSPOSE8x8_UB_UB( row0, row1, row2, row3,
                            row4, row5, row6, row7,
                            p1_u_org, p1_v_org, p0_u_org, p0_v_org,
                            q0_u_org, q0_v_org, q1_u_org, q1_v_org );
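
        /* Merge the Cb and Cr columns so each of p1/p0/q0/q1 carries both
           planes and the whole edge is filtered in one pass. */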
        ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
                    q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
        beta = ( v16u8 ) __msa_fill_b( u_beta_in );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_bs_greater_than0 & is_less_than;

        if( !__msa_test_bz_v( is_less_than ) )
        {
            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
            UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );

            is_bs_less_than4 = ( ( v16u8 ) bs < const4 );
            is_less_than1 = is_less_than & is_bs_less_than4;

            if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) )
            {
                negate_tc = zero - ( v16i8 ) tc;
                sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );

                ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
                             i16_negatetc_l );

                UNPCK_UB_SH( tc, tc_r, tc_l );

                is_less_than_r =
                    ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
                {
                    AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                                  negate_tc_r, tc_r, p0_r, q0_r );
                }

                is_less_than_l =
                    ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
                {
                    AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                                  i16_negatetc_l, tc_l, p0_l, q0_l );
                }

                PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

                p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 );
                q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 );
            }

            SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
            ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
            ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
            ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );

            ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
        }
    }
}
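
/* Boundary-strength computation for the no-B-frame case.  For each 4-sample
   edge segment, bS is 2 if either adjacent block has non-zero coefficients,
   1 if the list-0 references differ or a motion-vector component differs by
   >= 4 (x) or >= i_mvy_limit (y), and 0 otherwise.  pu_bs[1] (horizontal
   edges) is filled first, then pu_bs[0] (vertical edges); this mirrors the
   scalar fallback in x264_deblock_strength_msa() below. */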
static void avc_deblock_strength_msa( uint8_t *nnz,
                                      int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
                                      int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
                                      uint8_t pu_bs[2][8][4],
                                      int32_t i_mvy_limit )
{
    uint32_t u_tmp;
    v16u8 nnz0, nnz1, nnz2, nnz3, nnz4;
    v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 };
    v16i8 ref0, ref1, ref2, ref3, ref4;
    v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5;
    v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b;
    v8u16 four, mvy_limit_vec, sub0, sub1;

    nnz0 = LD_UB( nnz + 4 );
    nnz2 = LD_UB( nnz + 20 );
    nnz4 = LD_UB( nnz + 36 );
    ref0 = LD_SB( pi_ref[0] + 4 );
    ref2 = LD_SB( pi_ref[0] + 20 );
    ref4 = LD_SB( pi_ref[0] + 36 );
    mv0 = LD_SH( ( pi_mv[0] + 4 )[0] );
    mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
    mv2 = LD_SH( ( pi_mv[0] + 20 )[0] );
    mv3 = LD_SH( ( pi_mv[0] + 28 )[0] );
    mv4 = LD_SH( ( pi_mv[0] + 36 )[0] );

    mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
    four = ( v8u16 ) __msa_fill_h( 4 );
    mask = ( v16u8 ) __msa_ldi_b( 0 );
    one = ( v16u8 ) __msa_ldi_b( 1 );
    two = ( v16u8 ) __msa_ldi_b( 2 );

    mv5 = __msa_pckod_h( mv0, mv0 );
    mv6 = __msa_pckod_h( mv1, mv1 );
    mv_a = __msa_pckev_h( mv0, mv0 );
    mv_b = __msa_pckev_h( mv1, mv1 );
    nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 );
    ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 );
    nnz_mask = nnz0 | nnz1;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[1][0] );

    dst = ( v16u8 ) __msa_ldi_b( 0 );
    two = ( v16u8 ) __msa_ldi_b( 2 );
    mv5 = __msa_pckod_h( mv1, mv1 );
    mv6 = __msa_pckod_h( mv2, mv2 );
    mv_a = __msa_pckev_h( mv1, mv1 );
    mv_b = __msa_pckev_h( mv2, mv2 );
    nnz_mask = nnz2 | nnz1;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[1][1] );

    dst = ( v16u8 ) __msa_ldi_b( 0 );
    two = ( v16u8 ) __msa_ldi_b( 2 );
    mv5 = __msa_pckod_h( mv2, mv2 );
    mv6 = __msa_pckod_h( mv3, mv3 );
    mv_a = __msa_pckev_h( mv2, mv2 );
    mv_b = __msa_pckev_h( mv3, mv3 );
    nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 );
    ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 );
    nnz_mask = nnz3 | nnz2;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[1][2] );

    dst = ( v16u8 ) __msa_ldi_b( 0 );
    two = ( v16u8 ) __msa_ldi_b( 2 );
    mv5 = __msa_pckod_h( mv3, mv3 );
    mv6 = __msa_pckod_h( mv4, mv4 );
    mv_a = __msa_pckev_h( mv3, mv3 );
    mv_b = __msa_pckev_h( mv4, mv4 );
    nnz_mask = nnz4 | nnz3;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[1][3] );
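
    /* Second direction (pu_bs[0], vertical edges): reload nnz/ref/mv,
       transpose them into per-edge lanes and repeat the bS computation for
       the remaining four edges. */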
    nnz0 = LD_UB( nnz + 8 );
    nnz2 = LD_UB( nnz + 24 );
    ref0 = LD_SB( pi_ref[0] + 8 );
    ref2 = LD_SB( pi_ref[0] + 24 );
    mv0 = LD_SH( ( pi_mv[0] + 8 )[0] );
    mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
    mv2 = LD_SH( ( pi_mv[0] + 16 )[0] );
    mv3 = LD_SH( ( pi_mv[0] + 20 )[0] );
    mv4 = LD_SH( ( pi_mv[0] + 24 )[0] );
    mv7 = LD_SH( ( pi_mv[0] + 28 )[0] );
    mv8 = LD_SH( ( pi_mv[0] + 32 )[0] );
    mv9 = LD_SH( ( pi_mv[0] + 36 )[0] );

    nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 );
    nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 );
    ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 );
    ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 );
    nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
    nnz1 = ( v16u8 ) temp_vec4;
    nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 );
    nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 );
    nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 );

    ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 );
    ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 );
    ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 );
    ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 );
    ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
    ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 );
    ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 );
    ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 );

    TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 );
    TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 );

    mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
    four = ( v8u16 ) __msa_fill_h( 4 );
    mask = ( v16u8 ) __msa_ldi_b( 0 );
    one = ( v16u8 ) __msa_ldi_b( 1 );
    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );

    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
    mv_a = mv0;
    mv_b = mv1;
    nnz_mask = nnz0 | nnz1;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][0] );

    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );
    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
    mv_a = mv1;
    mv_b = mv2;
    nnz_mask = nnz1 | nnz2;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][1] );

    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );
    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
    mv_a = mv2;
    mv_b = mv3;
    nnz_mask = nnz2 | nnz3;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][2] );

    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );
    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 );
    mv_a = mv3;
    mv_b = mv4;
    nnz_mask = nnz3 | nnz4;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );
    ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
    ref_mask = ref_mask ^ 255;
    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );
    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][3] );
}
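
/* x264-facing deblock entry points: thin wrappers that map the x264 deblock
   prototypes onto the MSA kernels above. */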
void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                    int32_t i_alpha, int32_t i_beta )
{
    avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
                                            ( uint8_t ) i_beta, i_stride );
}

void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                    int32_t i_alpha, int32_t i_beta )
{
    avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
                                            ( uint8_t ) i_beta, i_stride );
}

void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                      int32_t i_alpha, int32_t i_beta )
{
    avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
                                                 ( uint8_t ) i_beta, i_stride );
}

void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                      int32_t i_alpha, int32_t i_beta )
{
    avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
                                                 ( uint8_t ) i_beta, i_stride );
}
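
/* For the inter filters, a negative tc0 marks an edge segment that must not
   be filtered, so its boundary strength is forced to 0 before the kernel is
   called. */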
void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride,
                              int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_loopfilter_luma_inter_edge_ver_msa( p_pix,
                                            u_bs0, u_bs1, u_bs2, u_bs3,
                                            p_tc0[0], p_tc0[1], p_tc0[2],
                                            p_tc0[3], i_alpha, i_beta,
                                            i_stride );
}

void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride,
                              int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_loopfilter_luma_inter_edge_hor_msa( p_pix,
                                            u_bs0, u_bs1, u_bs2, u_bs3,
                                            p_tc0[0], p_tc0[1], p_tc0[2],
                                            p_tc0[3], i_alpha, i_beta,
                                            i_stride );
}

void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
                                int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix,
                                                 u_bs0, u_bs1, u_bs2, u_bs3,
                                                 p_tc0[0], p_tc0[1], p_tc0[2],
                                                 p_tc0[3], i_alpha, i_beta,
                                                 i_stride );
}

void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
                                int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix,
                                                 u_bs0, u_bs1, u_bs2, u_bs3,
                                                 p_tc0[0], p_tc0[1], p_tc0[2],
                                                 p_tc0[3], i_alpha, i_beta,
                                                 i_stride );
}
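
/* The vector routine only covers the list-0-only case; with B-frames the
   list-1 references and motion vectors must also be checked, so fall back to
   the scalar computation. */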
void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE],
                                int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
                                uint8_t pu_bs[2][8][4], int32_t i_mvy_limit,
                                int32_t i_bframe )
{
    if( i_bframe )
    {
        for( int32_t i_dir = 0; i_dir < 2; i_dir++ )
        {
            int32_t s1 = i_dir ? 1 : 8;
            int32_t s2 = i_dir ? 8 : 1;

            for( int32_t i_edge = 0; i_edge < 4; i_edge++ )
            {
                for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4;
                     i++, loc += s1 )
                {
                    int32_t locn = loc - s2;

                    if( u_nnz[loc] || u_nnz[locn] )
                    {
                        pu_bs[i_dir][i_edge][i] = 2;
                    }
                    else if( pi_ref[0][loc] != pi_ref[0][locn] ||
                             abs( pi_mv[0][loc][0] -
                                  pi_mv[0][locn][0] ) >= 4 ||
                             abs( pi_mv[0][loc][1] -
                                  pi_mv[0][locn][1] ) >= i_mvy_limit ||
                             ( i_bframe &&
                               ( pi_ref[1][loc] != pi_ref[1][locn] ||
                                 abs( pi_mv[1][loc][0] -
                                      pi_mv[1][locn][0] ) >= 4 ||
                                 abs( pi_mv[1][loc][1] -
                                      pi_mv[1][locn][1] ) >= i_mvy_limit ) ) )
                    {
                        pu_bs[i_dir][i_edge][i] = 1;
                    }
                    else
                    {
                        pu_bs[i_dir][i_edge][i] = 0;
                    }
                }
            }
        }
    }
    else
    {
        avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit );
    }
}
#endif