- /*****************************************************************************
- * mc.S: aarch64 motion compensation
- *****************************************************************************
- * Copyright (C) 2009-2018 x264 project
- *
- * Authors: David Conrad <lessen42@gmail.com>
- * Janne Grunau <janne-x264@jannau.net>
- * Mans Rullgard <mans@mansr.com>
- * Stefan Groenroos <stefan.gronroos@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing@x264.com.
- *****************************************************************************/
- #include "asm.S"
- // note: prefetch stuff assumes 64-byte cacheline
- // void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
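- // Roughly (sketch of the pointer math below, not taken from the C source):
- //   pix += 64 + ((parity == 1) ? 0 : 8*stride); then the next eight rows of pix are prefetched.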
- function prefetch_ref_aarch64, export=1
- cmp w2, #1
- csel x2, xzr, x1, eq
- add x0, x0, #64
- add x0, x0, x2, lsl #3
- lsl x2, x1, #1
- add x3, x1, x1, lsl #1
- add x4, x0, x1, lsl #2
- prfm pldl1strm, [x0]
- prfm pldl1strm, [x0, x1]
- prfm pldl1strm, [x0, x2]
- prfm pldl1strm, [x0, x3]
- prfm pldl1strm, [x4]
- prfm pldl1strm, [x4, x1]
- prfm pldl1strm, [x4, x2]
- prfm pldl1strm, [x4, x3]
- ret
- endfunc
- // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
- // uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
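- // Roughly (sketch of the address math below): pix_y += 64 + 4*(mb_x & 3)*stride_y and four
- // luma rows are prefetched; pix_uv += 64 + 2*(mb_x & 3)*stride_uv and two chroma rows are
- // prefetched (four for the 4:2:2 variant).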
- .macro prefetch_fenc sub
- function prefetch_fenc_\sub\()_aarch64, export=1
- and w6, w5, #3
- and w7, w5, #3
- mul x6, x6, x1
- mul x7, x7, x3
- add x0, x0, #64
- add x2, x2, #64
- add x0, x0, x6, lsl #2
- add x6, x0, x1, lsl #1
- prfm pldl1strm, [x0]
- prfm pldl1strm, [x0, x1]
- prfm pldl1strm, [x6]
- prfm pldl1strm, [x6, x1]
- add x2, x2, x7, lsl #1
- prfm pldl1strm, [x2]
- prfm pldl1strm, [x2, x3]
- .ifc \sub, 422
- add x7, x2, x3, lsl #1
- prfm pldl1strm, [x7]
- prfm pldl1strm, [x7, x3]
- .endif
- ret
- endfunc
- .endm
- prefetch_fenc 420
- prefetch_fenc 422
- // void pixel_avg( uint8_t *dst, intptr_t dst_stride,
- // uint8_t *src1, intptr_t src1_stride,
- // uint8_t *src2, intptr_t src2_stride, int weight );
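- // Per-pixel math implemented below (C-style sketch for orientation, not the reference code):
- //   weight == 32: dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;                 // pixel_avg_w*
- //   otherwise:    dst[x] = clip_uint8( ( src1[x]*weight
- //                                      + src2[x]*(64 - weight) + 32 ) >> 6 );
- // The three AVG_WEIGHT variants below factor this sum so that both multipliers stay
- // non-negative for the unsigned umull/umlal/umlsl sequences.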
- .macro AVGH w h
- function pixel_avg_\w\()x\h\()_neon, export=1
- mov w10, #64
- cmp w6, #32
- mov w9, #\h
- b.eq pixel_avg_w\w\()_neon
- subs w7, w10, w6
- b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
- cmp w6, #0
- b.ge pixel_avg_weight_w\w\()_add_add_neon
- b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
- endfunc
- .endm
- AVGH 4, 2
- AVGH 4, 4
- AVGH 4, 8
- AVGH 4, 16
- AVGH 8, 4
- AVGH 8, 8
- AVGH 8, 16
- AVGH 16, 8
- AVGH 16, 16
- // 0 < weight < 64
- .macro load_weights_add_add
- mov w6, w6 // no-op: for 0 < weight < 64 both w6 and w7 are already non-negative
- .endm
- .macro weight_add_add dst, s1, s2, h=
- .ifc \h, 2
- umull2 \dst, \s1, v30.16b
- umlal2 \dst, \s2, v31.16b
- .else
- umull \dst, \s1, v30.8b
- umlal \dst, \s2, v31.8b
- .endif
- .endm
- // weight > 64
- .macro load_weights_add_sub
- neg w7, w7 // w7 = weight - 64, so both multipliers stay non-negative
- .endm
- .macro weight_add_sub dst, s1, s2, h=
- .ifc \h, 2
- umull2 \dst, \s1, v30.16b
- umlsl2 \dst, \s2, v31.16b
- .else
- umull \dst, \s1, v30.8b
- umlsl \dst, \s2, v31.8b
- .endif
- .endm
- // weight < 0
- .macro load_weights_sub_add
- neg w6, w6 // w6 = -weight, so both multipliers stay non-negative
- .endm
- .macro weight_sub_add dst, s1, s2, h=
- .ifc \h, 2
- umull2 \dst, \s2, v31.16b
- umlsl2 \dst, \s1, v30.16b
- .else
- umull \dst, \s2, v31.8b
- umlsl \dst, \s1, v30.8b
- .endif
- .endm
- .macro AVG_WEIGHT ext
- function pixel_avg_weight_w4_\ext\()_neon
- load_weights_\ext
- dup v30.8b, w6
- dup v31.8b, w7
- 1: // height loop
- subs w9, w9, #2
- ld1 {v0.s}[0], [x2], x3
- ld1 {v1.s}[0], [x4], x5
- weight_\ext v4.8h, v0.8b, v1.8b
- ld1 {v2.s}[0], [x2], x3
- ld1 {v3.s}[0], [x4], x5
- sqrshrun v0.8b, v4.8h, #6
- weight_\ext v5.8h, v2.8b, v3.8b
- st1 {v0.s}[0], [x0], x1
- sqrshrun v1.8b, v5.8h, #6
- st1 {v1.s}[0], [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg_weight_w8_\ext\()_neon
- load_weights_\ext
- dup v30.8b, w6
- dup v31.8b, w7
- 1: // height loop
- subs w9, w9, #4
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x4], x5
- weight_\ext v16.8h, v0.8b, v1.8b
- ld1 {v2.8b}, [x2], x3
- ld1 {v3.8b}, [x4], x5
- weight_\ext v17.8h, v2.8b, v3.8b
- ld1 {v4.8b}, [x2], x3
- ld1 {v5.8b}, [x4], x5
- weight_\ext v18.8h, v4.8b, v5.8b
- ld1 {v6.8b}, [x2], x3
- ld1 {v7.8b}, [x4], x5
- weight_\ext v19.8h, v6.8b, v7.8b
- sqrshrun v0.8b, v16.8h, #6
- sqrshrun v1.8b, v17.8h, #6
- sqrshrun v2.8b, v18.8h, #6
- sqrshrun v3.8b, v19.8h, #6
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
- st1 {v2.8b}, [x0], x1
- st1 {v3.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg_weight_w16_\ext\()_neon
- load_weights_\ext
- dup v30.16b, w6
- dup v31.16b, w7
- 1: // height loop
- subs w9, w9, #2
- ld1 {v0.16b}, [x2], x3
- ld1 {v1.16b}, [x4], x5
- weight_\ext v16.8h, v0.8b, v1.8b
- weight_\ext v17.8h, v0.16b, v1.16b, 2
- ld1 {v2.16b}, [x2], x3
- ld1 {v3.16b}, [x4], x5
- weight_\ext v18.8h, v2.8b, v3.8b
- weight_\ext v19.8h, v2.16b, v3.16b, 2
- sqrshrun v0.8b, v16.8h, #6
- sqrshrun v1.8b, v18.8h, #6
- sqrshrun2 v0.16b, v17.8h, #6
- sqrshrun2 v1.16b, v19.8h, #6
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- .endm
- AVG_WEIGHT add_add
- AVG_WEIGHT add_sub
- AVG_WEIGHT sub_add
- function pixel_avg_w4_neon
- 1: subs w9, w9, #2
- ld1 {v0.s}[0], [x2], x3
- ld1 {v2.s}[0], [x4], x5
- urhadd v0.8b, v0.8b, v2.8b
- ld1 {v1.s}[0], [x2], x3
- ld1 {v3.s}[0], [x4], x5
- urhadd v1.8b, v1.8b, v3.8b
- st1 {v0.s}[0], [x0], x1
- st1 {v1.s}[0], [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg_w8_neon
- 1: subs w9, w9, #4
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x4], x5
- ld1 {v2.8b}, [x2], x3
- urhadd v0.8b, v0.8b, v1.8b
- ld1 {v3.8b}, [x4], x5
- st1 {v0.8b}, [x0], x1
- ld1 {v4.8b}, [x2], x3
- urhadd v1.8b, v2.8b, v3.8b
- ld1 {v5.8b}, [x4], x5
- st1 {v1.8b}, [x0], x1
- ld1 {v6.8b}, [x2], x3
- ld1 {v7.8b}, [x4], x5
- urhadd v2.8b, v4.8b, v5.8b
- urhadd v3.8b, v6.8b, v7.8b
- st1 {v2.8b}, [x0], x1
- st1 {v3.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg_w16_neon
- 1: subs w9, w9, #4
- ld1 {v0.16b}, [x2], x3
- ld1 {v1.16b}, [x4], x5
- ld1 {v2.16b}, [x2], x3
- urhadd v0.16b, v0.16b, v1.16b
- ld1 {v3.16b}, [x4], x5
- st1 {v0.16b}, [x0], x1
- ld1 {v4.16b}, [x2], x3
- urhadd v1.16b, v2.16b, v3.16b
- ld1 {v5.16b}, [x4], x5
- st1 {v1.16b}, [x0], x1
- ld1 {v6.16b}, [x2], x3
- ld1 {v7.16b}, [x4], x5
- urhadd v2.16b, v4.16b, v5.16b
- urhadd v3.16b, v6.16b, v7.16b
- st1 {v2.16b}, [x0], x1
- st1 {v3.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg2_w4_neon, export=1
- 1:
- subs w5, w5, #2
- ld1 {v0.s}[0], [x2], x3
- ld1 {v2.s}[0], [x4], x3
- urhadd v0.8b, v0.8b, v2.8b
- ld1 {v1.s}[0], [x2], x3
- ld1 {v3.s}[0], [x4], x3
- urhadd v1.8b, v1.8b, v3.8b
- st1 {v0.s}[0], [x0], x1
- st1 {v1.s}[0], [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg2_w8_neon, export=1
- 1:
- subs w5, w5, #2
- ld1 {v0.8b}, [x2], x3
- ld1 {v2.8b}, [x4], x3
- urhadd v0.8b, v0.8b, v2.8b
- ld1 {v1.8b}, [x2], x3
- ld1 {v3.8b}, [x4], x3
- urhadd v1.8b, v1.8b, v3.8b
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg2_w16_neon, export=1
- 1:
- subs w5, w5, #2
- ld1 {v0.16b}, [x2], x3
- ld1 {v2.16b}, [x4], x3
- urhadd v0.16b, v0.16b, v2.16b
- ld1 {v1.16b}, [x2], x3
- ld1 {v3.16b}, [x4], x3
- urhadd v1.16b, v1.16b, v3.16b
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function pixel_avg2_w20_neon, export=1
- sub x1, x1, #16
- 1:
- subs w5, w5, #2
- ld1 {v0.16b,v1.16b}, [x2], x3
- ld1 {v2.16b,v3.16b}, [x4], x3
- urhadd v0.16b, v0.16b, v2.16b
- urhadd v1.8b, v1.8b, v3.8b
- ld1 {v4.16b,v5.16b}, [x2], x3
- ld1 {v6.16b,v7.16b}, [x4], x3
- urhadd v4.16b, v4.16b, v6.16b
- urhadd v5.8b, v5.8b, v7.8b
- st1 {v0.16b}, [x0], #16
- st1 {v1.s}[0], [x0], x1
- st1 {v4.16b}, [x0], #16
- st1 {v5.s}[0], [x0], x1
- b.gt 1b
- ret
- endfunc
- .macro weight_prologue type
- mov w9, w5 // height
- .ifc \type, full
- ldr w12, [x4, #32] // denom
- .endif
- ldp w4, w5, [x4, #32+4] // scale, offset
- dup v0.16b, w4
- dup v1.8h, w5
- .ifc \type, full
- neg w12, w12
- dup v2.8h, w12
- .endif
- .endm
- // void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
- // intptr_t dst_stride, const x264_weight_t *weight, int h )
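- // Weighting applied per pixel (hedged C-style sketch; denom/scale/offset are the
- // x264_weight_t fields loaded in weight_prologue):
- //   full:    dst[x] = clip_uint8( ((src[x]*scale + (1 << (denom-1))) >> denom) + offset );
- //   nodenom: dst[x] = clip_uint8( src[x]*scale + offset );                  // denom == 0
- //   offsetadd/offsetsub (further below): dst[x] = saturating src[x] +/- offset.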
- function mc_weight_w20_neon, export=1
- weight_prologue full
- sub x1, x1, #16
- 1:
- subs w9, w9, #2
- ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
- ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
- umull v22.8h, v16.8b, v0.8b
- umull v23.8h, v17.8b, v0.8b
- zip1 v18.2s, v18.2s, v21.2s
- umull v25.8h, v19.8b, v0.8b
- umull v26.8h, v20.8b, v0.8b
- umull v24.8h, v18.8b, v0.8b
- srshl v22.8h, v22.8h, v2.8h
- srshl v23.8h, v23.8h, v2.8h
- srshl v24.8h, v24.8h, v2.8h
- srshl v25.8h, v25.8h, v2.8h
- srshl v26.8h, v26.8h, v2.8h
- add v22.8h, v22.8h, v1.8h
- add v23.8h, v23.8h, v1.8h
- add v24.8h, v24.8h, v1.8h
- add v25.8h, v25.8h, v1.8h
- add v26.8h, v26.8h, v1.8h
- sqxtun v4.8b, v22.8h
- sqxtun2 v4.16b, v23.8h
- sqxtun v6.8b, v24.8h
- sqxtun v5.8b, v25.8h
- sqxtun2 v5.16b, v26.8h
- st1 {v4.16b}, [x0], #16
- st1 {v6.s}[0], [x0], x1
- st1 {v5.16b}, [x0], #16
- st1 {v6.s}[1], [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w16_neon, export=1
- weight_prologue full
- weight16_loop:
- 1:
- subs w9, w9, #2
- ld1 {v4.16b}, [x2], x3
- ld1 {v5.16b}, [x2], x3
- umull v22.8h, v4.8b, v0.8b
- umull2 v23.8h, v4.16b, v0.16b
- umull v24.8h, v5.8b, v0.8b
- umull2 v25.8h, v5.16b, v0.16b
- srshl v22.8h, v22.8h, v2.8h
- srshl v23.8h, v23.8h, v2.8h
- srshl v24.8h, v24.8h, v2.8h
- srshl v25.8h, v25.8h, v2.8h
- add v22.8h, v22.8h, v1.8h
- add v23.8h, v23.8h, v1.8h
- add v24.8h, v24.8h, v1.8h
- add v25.8h, v25.8h, v1.8h
- sqxtun v4.8b, v22.8h
- sqxtun2 v4.16b, v23.8h
- sqxtun v5.8b, v24.8h
- sqxtun2 v5.16b, v25.8h
- st1 {v4.16b}, [x0], x1
- st1 {v5.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w8_neon, export=1
- weight_prologue full
- 1:
- subs w9, w9, #2
- ld1 {v16.8b}, [x2], x3
- ld1 {v17.8b}, [x2], x3
- umull v4.8h, v16.8b, v0.8b
- umull v5.8h, v17.8b, v0.8b
- srshl v4.8h, v4.8h, v2.8h
- srshl v5.8h, v5.8h, v2.8h
- add v4.8h, v4.8h, v1.8h
- add v5.8h, v5.8h, v1.8h
- sqxtun v16.8b, v4.8h
- sqxtun v17.8b, v5.8h
- st1 {v16.8b}, [x0], x1
- st1 {v17.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w4_neon, export=1
- weight_prologue full
- 1:
- subs w9, w9, #2
- ld1 {v16.s}[0], [x2], x3
- ld1 {v16.s}[1], [x2], x3
- umull v4.8h, v16.8b, v0.8b
- srshl v4.8h, v4.8h, v2.8h
- add v4.8h, v4.8h, v1.8h
- sqxtun v16.8b, v4.8h
- st1 {v16.s}[0], [x0], x1
- st1 {v16.s}[1], [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w20_nodenom_neon, export=1
- weight_prologue nodenom
- sub x1, x1, #16
- 1:
- subs w9, w9, #2
- ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
- mov v27.16b, v1.16b
- mov v28.16b, v1.16b
- ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
- mov v31.16b, v1.16b
- mov v29.16b, v1.16b
- mov v30.16b, v1.16b
- zip1 v18.2s, v18.2s, v21.2s
- umlal v27.8h, v16.8b, v0.8b
- umlal v28.8h, v17.8b, v0.8b
- umlal v31.8h, v18.8b, v0.8b
- umlal v29.8h, v19.8b, v0.8b
- umlal v30.8h, v20.8b, v0.8b
- sqxtun v4.8b, v27.8h
- sqxtun2 v4.16b, v28.8h
- sqxtun v5.8b, v29.8h
- sqxtun2 v5.16b, v30.8h
- sqxtun v6.8b, v31.8h
- st1 {v4.16b}, [x0], #16
- st1 {v6.s}[0], [x0], x1
- st1 {v5.16b}, [x0], #16
- st1 {v6.s}[1], [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w16_nodenom_neon, export=1
- weight_prologue nodenom
- 1:
- subs w9, w9, #2
- ld1 {v6.16b}, [x2], x3
- mov v27.16b, v1.16b
- mov v28.16b, v1.16b
- ld1 {v7.16b}, [x2], x3
- mov v29.16b, v1.16b
- mov v30.16b, v1.16b
- umlal v27.8h, v6.8b, v0.8b
- umlal2 v28.8h, v6.16b, v0.16b
- umlal v29.8h, v7.8b, v0.8b
- umlal2 v30.8h, v7.16b, v0.16b
- sqxtun v4.8b, v27.8h
- sqxtun2 v4.16b, v28.8h
- sqxtun v5.8b, v29.8h
- sqxtun2 v5.16b, v30.8h
- st1 {v4.16b}, [x0], x1
- st1 {v5.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w8_nodenom_neon, export=1
- weight_prologue nodenom
- 1:
- subs w9, w9, #2
- ld1 {v16.8b}, [x2], x3
- mov v27.16b, v1.16b
- ld1 {v17.8b}, [x2], x3
- mov v29.16b, v1.16b
- umlal v27.8h, v16.8b, v0.8b
- umlal v29.8h, v17.8b, v0.8b
- sqxtun v4.8b, v27.8h
- sqxtun v5.8b, v29.8h
- st1 {v4.8b}, [x0], x1
- st1 {v5.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w4_nodenom_neon, export=1
- weight_prologue nodenom
- 1:
- subs w9, w9, #2
- ld1 {v16.s}[0], [x2], x3
- ld1 {v16.s}[1], [x2], x3
- mov v27.16b, v1.16b
- umlal v27.8h, v16.8b, v0.8b
- sqxtun v4.8b, v27.8h
- st1 {v4.s}[0], [x0], x1
- st1 {v4.s}[1], [x0], x1
- b.gt 1b
- ret
- endfunc
- .macro weight_simple_prologue
- ldr w6, [x4] // offset
- dup v1.16b, w6
- .endm
- .macro weight_simple name op
- function mc_weight_w20_\name\()_neon, export=1
- weight_simple_prologue
- 1:
- subs w5, w5, #2
- ldr s18, [x2, #16]
- ld1 {v16.16b}, [x2], x3
- ldr s19, [x2, #16]
- ld1 {v17.16b}, [x2], x3
- \op v18.8b, v18.8b, v1.8b
- \op v16.16b, v16.16b, v1.16b
- \op v19.8b, v19.8b, v1.8b
- \op v17.16b, v17.16b, v1.16b
- str s18, [x0, #16]
- st1 {v16.16b}, [x0], x1
- str s19, [x0, #16]
- st1 {v17.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w16_\name\()_neon, export=1
- weight_simple_prologue
- 1:
- subs w5, w5, #2
- ld1 {v16.16b}, [x2], x3
- ld1 {v17.16b}, [x2], x3
- \op v16.16b, v16.16b, v1.16b
- \op v17.16b, v17.16b, v1.16b
- st1 {v16.16b}, [x0], x1
- st1 {v17.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w8_\name\()_neon, export=1
- weight_simple_prologue
- 1:
- subs w5, w5, #2
- ld1 {v16.8b}, [x2], x3
- ld1 {v17.8b}, [x2], x3
- \op v16.8b, v16.8b, v1.8b
- \op v17.8b, v17.8b, v1.8b
- st1 {v16.8b}, [x0], x1
- st1 {v17.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_weight_w4_\name\()_neon, export=1
- weight_simple_prologue
- 1:
- subs w5, w5, #2
- ld1 {v16.s}[0], [x2], x3
- ld1 {v16.s}[1], [x2], x3
- \op v16.8b, v16.8b, v1.8b
- st1 {v16.s}[0], [x0], x1
- st1 {v16.s}[1], [x0], x1
- b.gt 1b
- ret
- endfunc
- .endm
- weight_simple offsetadd, uqadd
- weight_simple offsetsub, uqsub
- // void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
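- // Plain width x height block copy; the three variants below handle widths 4, 8 and 16.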
- function mc_copy_w4_neon, export=1
- 1:
- subs w4, w4, #4
- ld1 {v0.s}[0], [x2], x3
- ld1 {v1.s}[0], [x2], x3
- ld1 {v2.s}[0], [x2], x3
- ld1 {v3.s}[0], [x2], x3
- st1 {v0.s}[0], [x0], x1
- st1 {v1.s}[0], [x0], x1
- st1 {v2.s}[0], [x0], x1
- st1 {v3.s}[0], [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_copy_w8_neon, export=1
- 1: subs w4, w4, #4
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- ld1 {v2.8b}, [x2], x3
- ld1 {v3.8b}, [x2], x3
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
- st1 {v2.8b}, [x0], x1
- st1 {v3.8b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- function mc_copy_w16_neon, export=1
- 1: subs w4, w4, #4
- ld1 {v0.16b}, [x2], x3
- ld1 {v1.16b}, [x2], x3
- ld1 {v2.16b}, [x2], x3
- ld1 {v3.16b}, [x2], x3
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x0], x1
- st1 {v2.16b}, [x0], x1
- st1 {v3.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- // void mc_chroma( uint8_t *dst_u, uint8_t *dst_v,
- // intptr_t i_dst_stride,
- // uint8_t *src, intptr_t i_src_stride,
- // int dx, int dy, int i_width, int i_height );
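- // Bilinear chroma interpolation (hedged sketch). With cA..cD computed in CHROMA_MC_START
- // from the fractional mv (d8x = dx & 7, d8y = dy & 7):
- //   dst[x] = ( cA*s[x] + cB*s[x+1] + cC*s[x+stride] + cD*s[x+stride+1] + 32 ) >> 6;
- // applied to each of the two interleaved NV12 components (ld2 splits U and V), with the
- // results written separately to dst_u and dst_v.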
- function mc_chroma_neon, export=1
- ldr w15, [sp] // height
- sbfx x12, x6, #3, #29 // asr(3) and sign extend
- sbfx x11, x5, #3, #29 // asr(3) and sign extend
- cmp w7, #4
- mul x12, x12, x4
- add x3, x3, x11, lsl #1
- and w5, w5, #7
- and w6, w6, #7
- add x3, x3, x12
- //pld [x3]
- //pld [x3, x4]
- b.gt mc_chroma_w8_neon
- b.eq mc_chroma_w4_neon
- endfunc
- .macro CHROMA_MC_START r00, r01, r10, r11
- mul w12, w5, w6 // cD = d8x *d8y
- lsl w13, w5, #3
- add w9, w12, #64
- lsl w14, w6, #3
- tst w12, w12 // set flags for the b.eq below: cD == 0 means dx == 0 or dy == 0
- sub w9, w9, w13
- sub w10, w13, w12 // cB = d8x *(8-d8y);
- sub w11, w14, w12 // cC = (8-d8x)*d8y
- sub w9, w9, w14 // cA = (8-d8x)*(8-d8y);
- .endm
- .macro CHROMA_MC width, vsize
- function mc_chroma_w\width\()_neon
- // since the element size varies, there's a different index for the 2nd store
- .if \width == 4
- .set idx2, 1
- .else
- .set idx2, 2
- .endif
- CHROMA_MC_START
- b.eq 2f
- ld2 {v28.8b,v29.8b}, [x3], x4
- dup v0.8b, w9 // cA
- dup v1.8b, w10 // cB
- ext v6.8b, v28.8b, v6.8b, #1
- ext v7.8b, v29.8b, v7.8b, #1
- ld2 {v30.8b,v31.8b}, [x3], x4
- dup v2.8b, w11 // cC
- dup v3.8b, w12 // cD
- ext v22.8b, v30.8b, v22.8b, #1
- ext v23.8b, v31.8b, v23.8b, #1
- trn1 v0.2s, v0.2s, v1.2s
- trn1 v2.2s, v2.2s, v3.2s
- trn1 v4.2s, v28.2s, v6.2s
- trn1 v5.2s, v29.2s, v7.2s
- trn1 v20.2s, v30.2s, v22.2s
- trn1 v21.2s, v31.2s, v23.2s
- 1: // height loop, interpolate xy
- subs w15, w15, #2
- umull v16.8h, v4.8b, v0.8b
- umlal v16.8h, v20.8b, v2.8b
- umull v17.8h, v5.8b, v0.8b
- umlal v17.8h, v21.8b, v2.8b
- ld2 {v28.8b,v29.8b}, [x3], x4
- transpose v24.2d, v25.2d, v16.2d, v17.2d
- ext v6.8b, v28.8b, v6.8b, #1
- ext v7.8b, v29.8b, v7.8b, #1
- trn1 v4.2s, v28.2s, v6.2s
- trn1 v5.2s, v29.2s, v7.2s
- add v16.8h, v24.8h, v25.8h
- umull v18.8h, v20.8b, v0.8b
- umlal v18.8h, v4.8b, v2.8b
- umull v19.8h, v21.8b, v0.8b
- umlal v19.8h, v5.8b, v2.8b
- ld2 {v30.8b,v31.8b}, [x3], x4
- transpose v26.2d, v27.2d, v18.2d, v19.2d
- ext v22.8b, v30.8b, v22.8b, #1
- ext v23.8b, v31.8b, v23.8b, #1
- trn1 v20.2s, v30.2s, v22.2s
- trn1 v21.2s, v31.2s, v23.2s
- add v17.8h, v26.8h, v27.8h
- rshrn v16.8b, v16.8h, #6
- rshrn v17.8b, v17.8h, #6
- //pld [x3]
- //pld [x3, x4]
- st1 {v16.\vsize}[0], [x0], x2
- st1 {v16.\vsize}[idx2], [x1], x2
- st1 {v17.\vsize}[0], [x0], x2
- st1 {v17.\vsize}[idx2], [x1], x2
- b.gt 1b
- ret
- 2: // dx or dy are 0
- tst w11, w11
- add w10, w10, w11
- dup v0.8b, w9
- dup v1.8b, w10
- b.eq 4f
- ld1 {v4.8b}, [x3], x4
- ld1 {v6.8b}, [x3], x4
- 3: // vertical interpolation loop
- subs w15, w15, #2
- umull v16.8h, v4.8b, v0.8b
- ld1 {v4.8b}, [x3], x4
- umlal v16.8h, v6.8b, v1.8b
- umull v17.8h, v6.8b, v0.8b
- ld1 {v6.8b}, [x3], x4
- umlal v17.8h, v4.8b, v1.8b
- rshrn v20.8b, v16.8h, #6 // uvuvuvuv
- rshrn v21.8b, v17.8h, #6 // uvuvuvuv
- uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
- uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
- //pld [x3]
- //pld [x3, x4]
- st1 {v16.\vsize}[0], [x0], x2
- st1 {v16.\vsize}[idx2], [x0], x2
- st1 {v17.\vsize}[0], [x1], x2
- st1 {v17.\vsize}[idx2], [x1], x2
- b.gt 3b
- ret
- 4: // dy is 0
- ld1 {v4.8b,v5.8b}, [x3], x4
- ld1 {v6.8b,v7.8b}, [x3], x4
- ext v5.8b, v4.8b, v5.8b, #2
- ext v7.8b, v6.8b, v7.8b, #2
- 5: // horizontal interpolation loop
- subs w15, w15, #2
- umull v16.8h, v4.8b, v0.8b
- umlal v16.8h, v5.8b, v1.8b
- umull v17.8h, v6.8b, v0.8b
- umlal v17.8h, v7.8b, v1.8b
- ld1 {v4.8b,v5.8b}, [x3], x4
- ld1 {v6.8b,v7.8b}, [x3], x4
- rshrn v20.8b, v16.8h, #6
- rshrn v21.8b, v17.8h, #6
- ext v5.8b, v4.8b, v5.8b, #2
- ext v7.8b, v6.8b, v7.8b, #2
- uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
- uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
- //pld [x3]
- //pld [x3, x4]
- st1 {v16.\vsize}[0], [x0], x2
- st1 {v16.\vsize}[idx2], [x0], x2
- st1 {v17.\vsize}[0], [x1], x2
- st1 {v17.\vsize}[idx2], [x1], x2
- b.gt 5b
- ret
- endfunc
- .endm
- CHROMA_MC 2, h
- CHROMA_MC 4, s
- function mc_chroma_w8_neon
- CHROMA_MC_START
- b.eq 2f
- ld2 {v4.16b,v5.16b}, [x3], x4
- ld2 {v20.16b,v21.16b}, [x3], x4
- dup v0.8b, w9 // cA
- dup v1.8b, w10 // cB
- ext v6.16b, v4.16b, v4.16b, #1
- ext v7.16b, v5.16b, v5.16b, #1
- dup v2.8b, w11 // cC
- dup v3.8b, w12 // cD
- ext v22.16b, v20.16b, v20.16b, #1
- ext v23.16b, v21.16b, v21.16b, #1
- 1: // height loop, interpolate xy
- subs w15, w15, #2
- umull v16.8h, v4.8b, v0.8b
- umlal v16.8h, v6.8b, v1.8b
- umlal v16.8h, v20.8b, v2.8b
- umlal v16.8h, v22.8b, v3.8b
- umull v17.8h, v5.8b, v0.8b
- umlal v17.8h, v7.8b, v1.8b
- umlal v17.8h, v21.8b, v2.8b
- umlal v17.8h, v23.8b, v3.8b
- ld2 {v4.16b,v5.16b}, [x3], x4
- ext v6.16b, v4.16b, v4.16b, #1
- ext v7.16b, v5.16b, v5.16b, #1
- umull v18.8h, v20.8b, v0.8b
- umlal v18.8h, v22.8b, v1.8b
- umlal v18.8h, v4.8b, v2.8b
- umlal v18.8h, v6.8b, v3.8b
- umull v19.8h, v21.8b, v0.8b
- umlal v19.8h, v23.8b, v1.8b
- umlal v19.8h, v5.8b, v2.8b
- umlal v19.8h, v7.8b, v3.8b
- ld2 {v20.16b,v21.16b}, [x3], x4
- rshrn v16.8b, v16.8h, #6
- rshrn v17.8b, v17.8h, #6
- rshrn v18.8b, v18.8h, #6
- rshrn v19.8b, v19.8h, #6
- ext v22.16b, v20.16b, v20.16b, #1
- ext v23.16b, v21.16b, v21.16b, #1
- //pld [x3]
- //pld [x3, x4]
- st1 {v16.8b}, [x0], x2
- st1 {v17.8b}, [x1], x2
- st1 {v18.8b}, [x0], x2
- st1 {v19.8b}, [x1], x2
- b.gt 1b
- ret
- 2: // dx or dy are 0
- tst w11, w11
- add w10, w10, w11
- dup v0.8b, w9
- dup v1.8b, w10
- b.eq 4f
- ld2 {v4.8b,v5.8b}, [x3], x4
- ld2 {v6.8b,v7.8b}, [x3], x4
- 3: // vertical interpolation loop
- subs w15, w15, #2
- umull v16.8h, v4.8b, v0.8b //U
- umlal v16.8h, v6.8b, v1.8b
- umull v17.8h, v5.8b, v0.8b //V
- umlal v17.8h, v7.8b, v1.8b
- ld2 {v4.8b,v5.8b}, [x3], x4
- umull v18.8h, v6.8b, v0.8b
- umlal v18.8h, v4.8b, v1.8b
- umull v19.8h, v7.8b, v0.8b
- umlal v19.8h, v5.8b, v1.8b
- ld2 {v6.8b,v7.8b}, [x3], x4
- rshrn v16.8b, v16.8h, #6
- rshrn v17.8b, v17.8h, #6
- rshrn v18.8b, v18.8h, #6
- rshrn v19.8b, v19.8h, #6
- //pld [x3]
- //pld [x3, x4]
- st1 {v16.8b}, [x0], x2
- st1 {v17.8b}, [x1], x2
- st1 {v18.8b}, [x0], x2
- st1 {v19.8b}, [x1], x2
- b.gt 3b
- ret
- 4: // dy is 0
- ld2 {v4.16b,v5.16b}, [x3], x4
- ext v6.16b, v4.16b, v4.16b, #1
- ext v7.16b, v5.16b, v5.16b, #1
- ld2 {v20.16b,v21.16b}, [x3], x4
- ext v22.16b, v20.16b, v20.16b, #1
- ext v23.16b, v21.16b, v21.16b, #1
- 5: // horizontal interpolation loop
- subs w15, w15, #2
- umull v16.8h, v4.8b, v0.8b //U
- umlal v16.8h, v6.8b, v1.8b
- umull v17.8h, v5.8b, v0.8b //V
- umlal v17.8h, v7.8b, v1.8b
- ld2 {v4.16b,v5.16b}, [x3], x4
- umull v18.8h, v20.8b, v0.8b
- umlal v18.8h, v22.8b, v1.8b
- umull v19.8h, v21.8b, v0.8b
- umlal v19.8h, v23.8b, v1.8b
- ld2 {v20.16b,v21.16b}, [x3], x4
- rshrn v16.8b, v16.8h, #6
- rshrn v17.8b, v17.8h, #6
- rshrn v18.8b, v18.8h, #6
- rshrn v19.8b, v19.8h, #6
- ext v6.16b, v4.16b, v4.16b, #1
- ext v7.16b, v5.16b, v5.16b, #1
- ext v22.16b, v20.16b, v20.16b, #1
- ext v23.16b, v21.16b, v21.16b, #1
- //pld [x3]
- //pld [x3, x4]
- st1 {v16.8b}, [x0], x2
- st1 {v17.8b}, [x1], x2
- st1 {v18.8b}, [x0], x2
- st1 {v19.8b}, [x1], x2
- b.gt 5b
- ret
- endfunc
- // void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
- // intptr_t stride, int width, int height, int16_t *buf )
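- // One pass per output line produces all three half-pel planes with the 6-tap filter
- // (1,-5,20,20,-5,1). Hedged sketch:
- //   dstv[x] = clip_uint8( ( 6-tap over src[x-2*stride .. x+3*stride] + 16 ) >> 5 );
- //   dsth[x] = clip_uint8( ( 6-tap over src[x-2 .. x+3]               + 16 ) >> 5 );
- //   dstc[x] = clip_uint8( ( 6-tap applied horizontally to the 16-bit vertical sums ) >> 10, rounded );
- // The (a-b)/4 shift trick commented in the inner loop keeps that last sum within 16 bits.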
- function hpel_filter_neon, export=1
- ubfm x9, x3, #0, #3 // x9 = src & 15 (input misalignment)
- add w15, w5, w9
- sub x13, x3, x9 // align src
- sub x10, x0, x9
- sub x11, x1, x9
- sub x12, x2, x9
- movi v30.16b, #5
- movi v31.16b, #20
- 1: // line start
- mov x3, x13
- mov x2, x12
- mov x1, x11
- mov x0, x10
- add x7, x3, #16 // src pointer next 16b for horiz filter
- mov x5, x15 // restore width
- sub x3, x3, x4, lsl #1 // src - 2*stride
- ld1 {v28.16b}, [x7], #16 // src[16:31]
- add x9, x3, x5 // holds src - 2*stride + width
- ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15]
- ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15]
- ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15]
- ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15]
- ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15]
- ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15]
- ext v22.16b, v7.16b, v18.16b, #14
- uaddl v1.8h, v16.8b, v21.8b
- ext v26.16b, v18.16b, v28.16b, #3
- umlsl v1.8h, v17.8b, v30.8b
- ext v23.16b, v7.16b, v18.16b, #15
- umlal v1.8h, v18.8b, v31.8b
- ext v24.16b, v18.16b, v28.16b, #1
- umlal v1.8h, v19.8b, v31.8b
- ext v25.16b, v18.16b, v28.16b, #2
- umlsl v1.8h, v20.8b, v30.8b
- 2: // next 16 pixel of line
- subs x5, x5, #16
- sub x3, x9, x5 // src - 2*stride += 16
- uaddl v4.8h, v22.8b, v26.8b
- uaddl2 v5.8h, v22.16b, v26.16b
- sqrshrun v6.8b, v1.8h, #5
- umlsl v4.8h, v23.8b, v30.8b
- umlsl2 v5.8h, v23.16b, v30.16b
- umlal v4.8h, v18.8b, v31.8b
- umlal2 v5.8h, v18.16b, v31.16b
- umlal v4.8h, v24.8b, v31.8b
- umlal2 v5.8h, v24.16b, v31.16b
- umlsl v4.8h, v25.8b, v30.8b
- umlsl2 v5.8h, v25.16b, v30.16b
- uaddl2 v2.8h, v16.16b, v21.16b
- sqrshrun v4.8b, v4.8h, #5
- mov v7.16b, v18.16b
- sqrshrun2 v4.16b, v5.8h, #5
- umlsl2 v2.8h, v17.16b, v30.16b
- ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15]
- umlal2 v2.8h, v18.16b, v31.16b
- ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15]
- umlal2 v2.8h, v19.16b, v31.16b
- ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15]
- umlsl2 v2.8h, v20.16b, v30.16b
- ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15]
- st1 {v4.16b}, [x0], #16
- sqrshrun2 v6.16b, v2.8h, #5
- ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15]
- ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15]
- ext v22.16b, v0.16b, v1.16b, #12
- ext v26.16b, v1.16b, v2.16b, #6
- ext v23.16b, v0.16b, v1.16b, #14
- st1 {v6.16b}, [x1], #16
- uaddl v3.8h, v16.8b, v21.8b
- ext v25.16b, v1.16b, v2.16b, #4
- umlsl v3.8h, v17.8b, v30.8b
- ext v24.16b, v1.16b, v2.16b, #2
- umlal v3.8h, v18.8b, v31.8b
- add v4.8h, v22.8h, v26.8h
- umlal v3.8h, v19.8b, v31.8b
- add v5.8h, v23.8h, v25.8h
- umlsl v3.8h, v20.8b, v30.8b
- add v6.8h, v24.8h, v1.8h
- ext v22.16b, v1.16b, v2.16b, #12
- ext v26.16b, v2.16b, v3.16b, #6
- ext v23.16b, v1.16b, v2.16b, #14
- ext v25.16b, v2.16b, v3.16b, #4
- ext v24.16b, v2.16b, v3.16b, #2
- add v22.8h, v22.8h, v26.8h
- add v23.8h, v23.8h, v25.8h
- add v24.8h, v24.8h, v2.8h
- sub v4.8h, v4.8h, v5.8h // a-b
- sub v5.8h, v5.8h, v6.8h // b-c
- sub v22.8h, v22.8h, v23.8h // a-b
- sub v23.8h, v23.8h, v24.8h // b-c
- sshr v4.8h, v4.8h, #2 // (a-b)/4
- sshr v22.8h, v22.8h, #2 // (a-b)/4
- sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c
- sub v22.8h, v22.8h, v23.8h // (a-b)/4-b+c
- sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4
- sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4
- add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
- sqrshrun v4.8b, v4.8h, #6
- ld1 {v28.16b}, [x7], #16 // src[16:31]
- mov v0.16b, v2.16b
- ext v23.16b, v7.16b, v18.16b, #15
- sqrshrun2 v4.16b, v22.8h, #6
- mov v1.16b, v3.16b
- ext v22.16b, v7.16b, v18.16b, #14
- ext v24.16b, v18.16b, v28.16b, #1
- ext v25.16b, v18.16b, v28.16b, #2
- ext v26.16b, v18.16b, v28.16b, #3
- st1 {v4.16b}, [x2], #16
- b.gt 2b
- subs w6, w6, #1
- add x10, x10, x4
- add x11, x11, x4
- add x12, x12, x4
- add x13, x13, x4
- b.gt 1b
- ret
- endfunc
- // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
- // uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
- // intptr_t dst_stride, int width, int height )
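- // Half-pel 2x downsample; with avg2(a,b) = (a+b+1)>>1 and s0/s1/s2 three consecutive
- // source rows, each output pixel is (rough sketch matching the urhadd chains below):
- //   dst0[x] = avg2( avg2(s0[2x],   s1[2x]),   avg2(s0[2x+1], s1[2x+1]) );
- //   dsth[x] = avg2( avg2(s0[2x+1], s1[2x+1]), avg2(s0[2x+2], s1[2x+2]) );
- //   dstv[x] = avg2( avg2(s1[2x],   s2[2x]),   avg2(s1[2x+1], s2[2x+1]) );
- //   dstc[x] = avg2( avg2(s1[2x+1], s2[2x+1]), avg2(s1[2x+2], s2[2x+2]) );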
- function frame_init_lowres_core_neon, export=1
- ldr w8, [sp]
- sub x10, x6, w7, uxtw // dst_stride - width
- and x10, x10, #~15
- 1:
- mov w9, w7 // width
- mov x11, x0 // src0
- add x12, x0, x5 // src1 = src0 + src_stride
- add x13, x0, x5, lsl #1 // src2 = src1 + src_stride
- ld2 {v0.16b,v1.16b}, [x11], #32
- ld2 {v2.16b,v3.16b}, [x12], #32
- ld2 {v4.16b,v5.16b}, [x13], #32
- urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
- urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
- 2:
- subs w9, w9, #16
- urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
- urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
- ld2 {v0.16b,v1.16b}, [x11], #32
- ld2 {v2.16b,v3.16b}, [x12], #32
- ld2 {v4.16b,v5.16b}, [x13], #32
- urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
- urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
- ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2]
- ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2]
- urhadd v16.16b, v20.16b, v21.16b
- urhadd v18.16b, v22.16b, v23.16b
- urhadd v17.16b, v21.16b, v24.16b
- urhadd v19.16b, v23.16b, v25.16b
- st1 {v16.16b}, [x1], #16
- st1 {v18.16b}, [x3], #16
- st1 {v17.16b}, [x2], #16
- st1 {v19.16b}, [x4], #16
- b.le 3f
- subs w9, w9, #16
- urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
- urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
- ld2 {v0.16b,v1.16b}, [x11], #32
- ld2 {v2.16b,v3.16b}, [x12], #32
- ld2 {v4.16b,v5.16b}, [x13], #32
- urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
- urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
- ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2]
- ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2]
- urhadd v16.16b, v30.16b, v21.16b
- urhadd v18.16b, v31.16b, v23.16b
- urhadd v17.16b, v21.16b, v24.16b
- urhadd v19.16b, v23.16b, v25.16b
- st1 {v16.16b}, [x1], #16
- st1 {v18.16b}, [x3], #16
- st1 {v17.16b}, [x2], #16
- st1 {v19.16b}, [x4], #16
- b.gt 2b
- 3:
- subs w8, w8, #1
- add x0, x0, x5, lsl #1
- add x1, x1, x10
- add x2, x2, x10
- add x3, x3, x10
- add x4, x4, x10
- b.gt 1b
- ret
- endfunc
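- // load_deinterleave_chroma_{fenc,fdec}: split interleaved NV12 chroma rows (UVUV...) so
- // that each source row yields an 8-byte U row followed by an 8-byte V row, spaced
- // FENC_STRIDE/2 (resp. FDEC_STRIDE/2) bytes apart in dst.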
- function load_deinterleave_chroma_fenc_neon, export=1
- mov x4, #FENC_STRIDE/2
- b load_deinterleave_chroma
- endfunc
- function load_deinterleave_chroma_fdec_neon, export=1
- mov x4, #FDEC_STRIDE/2
- load_deinterleave_chroma:
- ld2 {v0.8b,v1.8b}, [x1], x2
- ld2 {v2.8b,v3.8b}, [x1], x2
- subs w3, w3, #2
- st1 {v0.8b}, [x0], x4
- st1 {v1.8b}, [x0], x4
- st1 {v2.8b}, [x0], x4
- st1 {v3.8b}, [x0], x4
- b.gt load_deinterleave_chroma
- ret
- endfunc
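- // plane_copy_core: the width is rounded up to a multiple of 16, so each row may be read
- // and written up to 15 bytes past the nominal width; roughly,
- //   w = (w + 15) & ~15; then every row is copied in 16/32-byte chunks.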
- function plane_copy_core_neon, export=1
- add w8, w4, #15 // the 32-bit write clears the upper 32 bits of the register
- and w4, w8, #~15
- // safe use of the full reg since negative width makes no sense
- sub x1, x1, x4
- sub x3, x3, x4
- 1:
- mov w8, w4
- 16:
- tst w8, #16
- b.eq 32f
- subs w8, w8, #16
- ldr q0, [x2], #16
- str q0, [x0], #16
- b.eq 0f
- 32:
- subs w8, w8, #32
- ldp q0, q1, [x2], #32
- stp q0, q1, [x0], #32
- b.gt 32b
- 0:
- subs w5, w5, #1
- add x2, x2, x3
- add x0, x0, x1
- b.gt 1b
- ret
- endfunc
- function plane_copy_swap_core_neon, export=1
- lsl w4, w4, #1
- sub x1, x1, x4
- sub x3, x3, x4
- 1:
- mov w8, w4
- tbz w4, #4, 32f
- subs w8, w8, #16
- ld1 {v0.16b}, [x2], #16
- rev16 v0.16b, v0.16b
- st1 {v0.16b}, [x0], #16
- b.eq 0f
- 32:
- subs w8, w8, #32
- ld1 {v0.16b,v1.16b}, [x2], #32
- rev16 v0.16b, v0.16b
- rev16 v1.16b, v1.16b
- st1 {v0.16b,v1.16b}, [x0], #32
- b.gt 32b
- 0:
- subs w5, w5, #1
- add x2, x2, x3
- add x0, x0, x1
- b.gt 1b
- ret
- endfunc
- function plane_copy_deinterleave_neon, export=1
- add w9, w6, #15
- and w9, w9, #0xfffffff0
- sub x1, x1, x9
- sub x3, x3, x9
- sub x5, x5, x9, lsl #1
- 1:
- ld2 {v0.16b,v1.16b}, [x4], #32
- subs w9, w9, #16
- st1 {v0.16b}, [x0], #16
- st1 {v1.16b}, [x2], #16
- b.gt 1b
- add x4, x4, x5
- subs w7, w7, #1
- add x0, x0, x1
- add x2, x2, x3
- mov w9, w6
- b.gt 1b
- ret
- endfunc
- .macro deinterleave_rgb
- subs x11, x11, #8
- st1 {v0.8b}, [x0], #8
- st1 {v1.8b}, [x2], #8
- st1 {v2.8b}, [x4], #8
- b.gt 1b
- subs w10, w10, #1
- add x0, x0, x1
- add x2, x2, x3
- add x4, x4, x5
- add x6, x6, x7
- mov x11, x9
- b.gt 1b
- .endm
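- // The function below selects ld3 (packed RGB, pw == 3) or ld4 (4 bytes per pixel) per
- // group of 8 pixels; per pixel the three destination planes (x0, x2, x4) receive
- // src[pw*x], src[pw*x + 1] and src[pw*x + 2] (rough sketch).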
- function plane_copy_deinterleave_rgb_neon, export=1
- #if SYS_MACOSX
- ldr w8, [sp]
- ldp w9, w10, [sp, #4]
- #else
- ldr x8, [sp]
- ldp x9, x10, [sp, #8]
- #endif
- cmp w8, #3
- uxtw x9, w9
- add x11, x9, #7
- and x11, x11, #~7
- sub x1, x1, x11
- sub x3, x3, x11
- sub x5, x5, x11
- b.ne 4f
- sub x7, x7, x11, lsl #1
- sub x7, x7, x11
- 1:
- ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24
- deinterleave_rgb
- ret
- 4:
- sub x7, x7, x11, lsl #2
- 1:
- ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
- deinterleave_rgb
- ret
- endfunc
- function plane_copy_interleave_core_neon, export=1
- add w9, w6, #15
- and w9, w9, #0xfffffff0
- sub x1, x1, x9, lsl #1
- sub x3, x3, x9
- sub x5, x5, x9
- 1:
- ld1 {v0.16b}, [x2], #16
- ld1 {v1.16b}, [x4], #16
- subs w9, w9, #16
- st2 {v0.16b,v1.16b}, [x0], #32
- b.gt 1b
- subs w7, w7, #1
- add x0, x0, x1
- add x2, x2, x3
- add x4, x4, x5
- mov w9, w6
- b.gt 1b
- ret
- endfunc
- function store_interleave_chroma_neon, export=1
- mov x5, #FDEC_STRIDE
- 1:
- ld1 {v0.8b}, [x2], x5
- ld1 {v1.8b}, [x3], x5
- ld1 {v2.8b}, [x2], x5
- ld1 {v3.8b}, [x3], x5
- subs w4, w4, #2
- zip1 v4.16b, v0.16b, v1.16b
- zip1 v5.16b, v2.16b, v3.16b
- st1 {v4.16b}, [x0], x1
- st1 {v5.16b}, [x0], x1
- b.gt 1b
- ret
- endfunc
- .macro integral4h p1, p2
- ext v1.8b, \p1\().8b, \p2\().8b, #1
- ext v2.8b, \p1\().8b, \p2\().8b, #2
- ext v3.8b, \p1\().8b, \p2\().8b, #3
- uaddl v0.8h, \p1\().8b, v1.8b
- uaddl v4.8h, v2.8b, v3.8b
- add v0.8h, v0.8h, v4.8h
- add v0.8h, v0.8h, v5.8h
- .endm
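- // integral_init4h, rough C-style sketch (the row at x3 is the previously completed one):
- //   sum4[x] = sum4[x - stride] + pix[x] + pix[x+1] + pix[x+2] + pix[x+3];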
- function integral_init4h_neon, export=1
- sub x3, x0, x2, lsl #1
- ld1 {v6.8b,v7.8b}, [x1], #16
- 1:
- subs x2, x2, #16
- ld1 {v5.8h}, [x3], #16
- integral4h v6, v7
- ld1 {v6.8b}, [x1], #8
- ld1 {v5.8h}, [x3], #16
- st1 {v0.8h}, [x0], #16
- integral4h v7, v6
- ld1 {v7.8b}, [x1], #8
- st1 {v0.8h}, [x0], #16
- b.gt 1b
- ret
- endfunc
- .macro integral8h p1, p2, s
- ext v1.8b, \p1\().8b, \p2\().8b, #1
- ext v2.8b, \p1\().8b, \p2\().8b, #2
- ext v3.8b, \p1\().8b, \p2\().8b, #3
- ext v4.8b, \p1\().8b, \p2\().8b, #4
- ext v5.8b, \p1\().8b, \p2\().8b, #5
- ext v6.8b, \p1\().8b, \p2\().8b, #6
- ext v7.8b, \p1\().8b, \p2\().8b, #7
- uaddl v0.8h, \p1\().8b, v1.8b
- uaddl v2.8h, v2.8b, v3.8b
- uaddl v4.8h, v4.8b, v5.8b
- uaddl v6.8h, v6.8b, v7.8b
- add v0.8h, v0.8h, v2.8h
- add v4.8h, v4.8h, v6.8h
- add v0.8h, v0.8h, v4.8h
- add v0.8h, v0.8h, \s\().8h
- .endm
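- // integral_init8h, rough C-style sketch:
- //   sum8[x] = sum8[x - stride] + pix[x] + pix[x+1] + ... + pix[x+7];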
- function integral_init8h_neon, export=1
- sub x3, x0, x2, lsl #1
- ld1 {v16.8b,v17.8b}, [x1], #16
- 1:
- subs x2, x2, #16
- ld1 {v18.8h}, [x3], #16
- integral8h v16, v17, v18
- ld1 {v16.8b}, [x1], #8
- ld1 {v18.8h}, [x3], #16
- st1 {v0.8h}, [x0], #16
- integral8h v17, v16, v18
- ld1 {v17.8b}, [x1], #8
- st1 {v0.8h}, [x0], #16
- b.gt 1b
- ret
- endfunc
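- // integral_init4v, rough C-style sketch (strides in 16-bit elements):
- //   sum4[x] = sum8[x + 4*stride] - sum8[x];
- //   sum8[x] = sum8[x + 8*stride] + sum8[x + 8*stride + 4] - sum8[x] - sum8[x + 4];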
- function integral_init4v_neon, export=1
- mov x3, x0
- add x4, x0, x2, lsl #3
- add x8, x0, x2, lsl #4
- sub x2, x2, #8
- ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48
- ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48
- 1:
- subs x2, x2, #16
- ld1 {v24.8h,v25.8h}, [x4], #32
- ext v0.16b, v20.16b, v21.16b, #8
- ext v1.16b, v21.16b, v22.16b, #8
- ext v2.16b, v16.16b, v17.16b, #8
- ext v3.16b, v17.16b, v18.16b, #8
- sub v24.8h, v24.8h, v20.8h
- sub v25.8h, v25.8h, v21.8h
- add v0.8h, v0.8h, v20.8h
- add v1.8h, v1.8h, v21.8h
- add v2.8h, v2.8h, v16.8h
- add v3.8h, v3.8h, v17.8h
- st1 {v24.8h}, [x1], #16
- st1 {v25.8h}, [x1], #16
- mov v20.16b, v22.16b
- mov v16.16b, v18.16b
- sub v0.8h, v2.8h, v0.8h
- sub v1.8h, v3.8h, v1.8h
- ld1 {v21.8h,v22.8h}, [x3], #32
- ld1 {v17.8h,v18.8h}, [x8], #32
- st1 {v0.8h}, [x0], #16
- st1 {v1.8h}, [x0], #16
- b.gt 1b
- 2:
- ret
- endfunc
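- // integral_init8v, rough C-style sketch: sum8[x] = sum8[x + 8*stride] - sum8[x];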
- function integral_init8v_neon, export=1
- add x2, x0, x1, lsl #4
- sub x1, x1, #8
- ands x3, x1, #16 - 1
- b.eq 1f
- subs x1, x1, #8
- ld1 {v0.8h}, [x0]
- ld1 {v2.8h}, [x2], #16
- sub v4.8h, v2.8h, v0.8h
- st1 {v4.8h}, [x0], #16
- b.le 2f
- 1:
- subs x1, x1, #16
- ld1 {v0.8h,v1.8h}, [x0]
- ld1 {v2.8h,v3.8h}, [x2], #32
- sub v4.8h, v2.8h, v0.8h
- sub v5.8h, v3.8h, v1.8h
- st1 {v4.8h}, [x0], #16
- st1 {v5.8h}, [x0], #16
- b.gt 1b
- 2:
- ret
- endfunc
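- // mbtree_propagate_cost: per lane (hedged sketch of the arithmetic below):
- //   inter  = min( intra, inter & 0x3fff );                // bic clears the two list bits
- //   amount = propagate_in + intra*inv_qscale*fps_factor;
- //   dst    = saturate_int16( round( amount * (intra - inter) / intra ) );
- // frecpe plus one frecps (Newton-Raphson) step approximates the 1/intra division.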
- function mbtree_propagate_cost_neon, export=1
- ld1r {v5.4s}, [x5]
- 8:
- subs w6, w6, #8
- ld1 {v1.8h}, [x1], #16
- ld1 {v2.8h}, [x2], #16
- ld1 {v3.8h}, [x3], #16
- ld1 {v4.8h}, [x4], #16
- bic v3.8h, #0xc0, lsl #8
- umin v3.8h, v2.8h, v3.8h
- umull v20.4s, v2.4h, v4.4h // propagate_intra
- umull2 v21.4s, v2.8h, v4.8h // propagate_intra
- usubl v22.4s, v2.4h, v3.4h // propagate_num
- usubl2 v23.4s, v2.8h, v3.8h // propagate_num
- uxtl v26.4s, v2.4h // propagate_denom
- uxtl2 v27.4s, v2.8h // propagate_denom
- uxtl v24.4s, v1.4h
- uxtl2 v25.4s, v1.8h
- ucvtf v20.4s, v20.4s
- ucvtf v21.4s, v21.4s
- ucvtf v26.4s, v26.4s
- ucvtf v27.4s, v27.4s
- ucvtf v22.4s, v22.4s
- ucvtf v23.4s, v23.4s
- frecpe v28.4s, v26.4s
- frecpe v29.4s, v27.4s
- ucvtf v24.4s, v24.4s
- ucvtf v25.4s, v25.4s
- frecps v30.4s, v28.4s, v26.4s
- frecps v31.4s, v29.4s, v27.4s
- fmla v24.4s, v20.4s, v5.4s // propagate_amount
- fmla v25.4s, v21.4s, v5.4s // propagate_amount
- fmul v28.4s, v28.4s, v30.4s
- fmul v29.4s, v29.4s, v31.4s
- fmul v16.4s, v24.4s, v22.4s
- fmul v17.4s, v25.4s, v23.4s
- fmul v18.4s, v16.4s, v28.4s
- fmul v19.4s, v17.4s, v29.4s
- fcvtns v20.4s, v18.4s
- fcvtns v21.4s, v19.4s
- sqxtn v0.4h, v20.4s
- sqxtn2 v0.8h, v21.4s
- st1 {v0.8h}, [x0], #16
- b.gt 8b
- ret
- endfunc
- const pw_0to15, align=5
- .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
- endconst
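- // mbtree_propagate_list_internal (hedged outline): when both lists are used (the two
- // lowres_cost flag bits are set) propagate_amount is rescaled by bipred_weight/64; the
- // target mb coordinates (mv >> 5 plus the current mb position) are stored, and the amount
- // is split over the four neighbouring mbs with the bilinear idx0..idx3 weights commented
- // below, each term rounded as (amount*weight + 512) >> 10.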
- function mbtree_propagate_list_internal_neon, export=1
- movrel x11, pw_0to15
- dup v31.8h, w4 // bipred_weight
- movi v30.8h, #0xc0, lsl #8
- ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
- movi v28.4s, #4
- movi v27.8h, #31
- movi v26.8h, #32
- dup v24.8h, w5 // mb_y
- zip1 v29.8h, v29.8h, v24.8h
- 8:
- subs w6, w6, #8
- ld1 {v1.8h}, [x1], #16 // propagate_amount
- ld1 {v2.8h}, [x2], #16 // lowres_cost
- and v2.16b, v2.16b, v30.16b
- cmeq v25.8h, v2.8h, v30.8h
- umull v16.4s, v1.4h, v31.4h
- umull2 v17.4s, v1.8h, v31.8h
- rshrn v16.4h, v16.4s, #6
- rshrn2 v16.8h, v17.4s, #6
- bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
- // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
- ld1 {v4.8h,v5.8h}, [x0], #32
- sshr v6.8h, v4.8h, #5
- sshr v7.8h, v5.8h, #5
- add v6.8h, v6.8h, v29.8h
- add v29.8h, v29.8h, v28.8h
- add v7.8h, v7.8h, v29.8h
- add v29.8h, v29.8h, v28.8h
- st1 {v6.8h,v7.8h}, [x3], #32
- and v4.16b, v4.16b, v27.16b
- and v5.16b, v5.16b, v27.16b
- uzp1 v6.8h, v4.8h, v5.8h // x & 31
- uzp2 v7.8h, v4.8h, v5.8h // y & 31
- sub v4.8h, v26.8h, v6.8h // 32 - (x & 31)
- sub v5.8h, v26.8h, v7.8h // 32 - (y & 31)
- mul v19.8h, v6.8h, v7.8h // idx3weight = y*x;
- mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x);
- mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x;
- mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ;
- umull v6.4s, v19.4h, v25.4h
- umull2 v7.4s, v19.8h, v25.8h
- umull v4.4s, v18.4h, v25.4h
- umull2 v5.4s, v18.8h, v25.8h
- umull v2.4s, v17.4h, v25.4h
- umull2 v3.4s, v17.8h, v25.8h
- umull v0.4s, v16.4h, v25.4h
- umull2 v1.4s, v16.8h, v25.8h
- rshrn v19.4h, v6.4s, #10
- rshrn2 v19.8h, v7.4s, #10
- rshrn v18.4h, v4.4s, #10
- rshrn2 v18.8h, v5.4s, #10
- rshrn v17.4h, v2.4s, #10
- rshrn2 v17.8h, v3.4s, #10
- rshrn v16.4h, v0.4s, #10
- rshrn2 v16.8h, v1.4s, #10
- zip1 v0.8h, v16.8h, v17.8h
- zip2 v1.8h, v16.8h, v17.8h
- zip1 v2.8h, v18.8h, v19.8h
- zip2 v3.8h, v18.8h, v19.8h
- st1 {v0.8h,v1.8h}, [x3], #32
- st1 {v2.8h,v3.8h}, [x3], #32
- b.ge 8b
- ret
- endfunc
- function memcpy_aligned_neon, export=1
- tst x2, #16
- b.eq 32f
- sub x2, x2, #16
- ldr q0, [x1], #16
- str q0, [x0], #16
- 32:
- tst x2, #32
- b.eq 640f
- sub x2, x2, #32
- ldp q0, q1, [x1], #32
- stp q0, q1, [x0], #32
- 640:
- cbz x2, 1f
- 64:
- subs x2, x2, #64
- ldp q0, q1, [x1, #32]
- ldp q2, q3, [x1], #64
- stp q0, q1, [x0, #32]
- stp q2, q3, [x0], #64
- b.gt 64b
- 1:
- ret
- endfunc
- function memzero_aligned_neon, export=1
- movi v0.16b, #0
- movi v1.16b, #0
- 1:
- subs x1, x1, #128
- stp q0, q1, [x0, #96]
- stp q0, q1, [x0, #64]
- stp q0, q1, [x0, #32]
- stp q0, q1, [x0], 128
- b.gt 1b
- ret
- endfunc
- // void mbtree_fix8_pack( int16_t *dst, float *src, int count )
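- // Roughly: dst[i] = bswap16( (int16_t)(src[i] * 256.0f) ), i.e. big-endian 8.8 fixed
- // point via fcvtzs #8 + rev16 (sketch; the vector path additionally saturates via sqxtn).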
- function mbtree_fix8_pack_neon, export=1
- subs w3, w2, #8
- b.lt 2f
- 1:
- subs w3, w3, #8
- ld1 {v0.4s,v1.4s}, [x1], #32
- fcvtzs v0.4s, v0.4s, #8
- fcvtzs v1.4s, v1.4s, #8
- sqxtn v2.4h, v0.4s
- sqxtn2 v2.8h, v1.4s
- rev16 v3.16b, v2.16b
- st1 {v3.8h}, [x0], #16
- b.ge 1b
- 2:
- adds w3, w3, #8
- b.eq 4f
- 3:
- subs w3, w3, #1
- ldr s0, [x1], #4
- fcvtzs w4, s0, #8
- rev16 w5, w4
- strh w5, [x0], #2
- b.gt 3b
- 4:
- ret
- endfunc
- // void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
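- // Roughly the inverse: dst[i] = (int16_t)bswap16( src[i] ) / 256.0f (rev16 + sxtl + scvtf #8).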
- function mbtree_fix8_unpack_neon, export=1
- subs w3, w2, #8
- b.lt 2f
- 1:
- subs w3, w3, #8
- ld1 {v0.8h}, [x1], #16
- rev16 v1.16b, v0.16b
- sxtl v2.4s, v1.4h
- sxtl2 v3.4s, v1.8h
- scvtf v4.4s, v2.4s, #8
- scvtf v5.4s, v3.4s, #8
- st1 {v4.4s,v5.4s}, [x0], #32
- b.ge 1b
- 2:
- adds w3, w3, #8
- b.eq 4f
- 3:
- subs w3, w3, #1
- ldrh w4, [x1], #2
- rev16 w5, w4
- sxth w6, w5
- scvtf s0, w6, #8
- str s0, [x0], #4
- b.gt 3b
- 4:
- ret
- endfunc