12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893 |
- ;*****************************************************************************
- ;* trellis-64.asm: x86_64 trellis quantization
- ;*****************************************************************************
- ;* Copyright (C) 2012-2018 x264 project
- ;*
- ;* Authors: Loren Merritt <lorenm@u.washington.edu>
- ;*
- ;* This program is free software; you can redistribute it and/or modify
- ;* it under the terms of the GNU General Public License as published by
- ;* the Free Software Foundation; either version 2 of the License, or
- ;* (at your option) any later version.
- ;*
- ;* This program is distributed in the hope that it will be useful,
- ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
- ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ;* GNU General Public License for more details.
- ;*
- ;* You should have received a copy of the GNU General Public License
- ;* along with this program; if not, write to the Free Software
- ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- ;*
- ;* This program is also available under a commercial proprietary license.
- ;* For more information, contact us at licensing@x264.com.
- ;*****************************************************************************
- ; This is a pretty straight-forward translation of the C code, except:
- ; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
- ; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
- ; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those
- ; nodes are invalid).
- ; * Interprocedural register allocation. Eliminates argument-passing overhead
- ; to trellis_coef* subroutines. Also reduces codesize.
- ; Optimizations that I tried, and rejected because they were not faster:
- ; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
- ; Costs too much icache compared to the negligible speedup.
- ; * There are only 21 possible sets of live node_ctxs; we could keep track of
- ; exactly which set we're in and feed that (along with abs_level) into a jump
- ; table instead of the switch to select a trellis_coef subroutine. This would
- ; eliminate all branches about which node_ctxs are live, but costs either a
- ; bunch of icache or a bunch of call/ret, and the jump table itself is
- ; unpredictable.
- ; * Separate versions of trellis_coef* depending on whether we're doing the 1st
- ; or the 2nd of the two abs_level candidates. This would eliminate some
- ; branches about if(score is better).
- ; * Special case more values of coef. I had a coef2 at some intermediate point
- ; in the optimization process, but it didn't end up worthwhile in conjunction
- ; with all the other optimizations.
- ; * Unroll or simd writeback. I don't know why this didn't help.
- %include "x86inc.asm"
- %include "x86util.asm"
- SECTION_RODATA
- pd_m16: times 4 dd -16 ; four dwords of -16; ANDed in the DC-rounding path as the ~15 mask
- sq_1: dq 1, 0 ; 1 in the low qword only: subtracting it from a duplicated q yields the (q-1, q) candidate pair
- pq_128: times 2 dq 128 ; rounding term added before the >>8 of the unquantized level
- pq_ffffffff: times 2 dq 0xffffffff ; keeps the low 32 bits of each qword (used by SQUARE)
- cextern pd_8
- cextern pd_0123
- cextern pd_4567
- cextern_common cabac_entropy
- cextern_common cabac_transition
- cextern cabac_size_unary
- cextern cabac_transition_unary
- cextern_common dct4_weight_tab
- cextern_common dct8_weight_tab
- cextern_common dct4_weight2_tab
- cextern_common dct8_weight2_tab
- cextern_common last_coeff_flag_offset_8x8
- cextern_common significant_coeff_flag_offset_8x8
- cextern_common coeff_flag_offset_chroma_422_dc
- SECTION .text
- %define TRELLIS_SCORE_BIAS 1<<60 ; initial score stored into node 0 before the coefficient loop
- %define SIZEOF_NODE 16 ; bytes reserved per node; the stack frame holds two banks (cur/prev) of 8 nodes
- %define CABAC_SIZE_BITS 8 ; CABAC_SIZE_BITS - LAMBDA_BITS is the right shift applied to bits*lambda2 products
- %define LAMBDA_BITS 4
- %macro SQUARE 2 ; dst, tmp (register numbers): square the low dword of each qword of dst using pmuludq
- ; could use pmuldq here, to eliminate the abs. but that would involve
- ; templating a sse4 version of all of trellis, for negligible speedup.
- %if cpuflag(ssse3)
- pabsd m%1, m%1 ; pmuludq multiplies unsigned dwords, so take the absolute value first
- pmuludq m%1, m%1
- %elif HIGH_BIT_DEPTH
- ABSD m%2, m%1 ; presumably abs of dst written into tmp (x86util helper), after which SWAP renames tmp to dst
- SWAP %1, %2
- pmuludq m%1, m%1
- %else
- pmuludq m%1, m%1 ; no abs on this path: only the low 32 bits of each 64-bit product are kept below
- pand m%1, [pq_ffffffff]
- %endif
- %endmacro
- %macro LOAD_DUP 2 ; dst, src: load from src and replicate the loaded value into both qwords of dst
- %if cpuflag(ssse3)
- movddup %1, %2 ; reads 8 bytes from src and duplicates them
- %else
- movd %1, %2 ; reads 4 bytes from src (upper bits zeroed), then the low qword is duplicated
- punpcklqdq %1, %1
- %endif
- %endmacro
- ;-----------------------------------------------------------------------------
- ; int trellis_cabac_4x4_psy(
- ; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
- ; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
- ; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
- ; uint64_t level_state0, uint16_t level_state1,
- ; int b_ac, dctcoef *fenc_dct, int psy_trellis )
- ;-----------------------------------------------------------------------------
- %macro TRELLIS 4 ; function name, num_coefs, dc flag, psy flag
- %define num_coefs %2
- %define dc %3
- %define psy %4
- cglobal %1, 4,15,9
- %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
- %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15) ; 96 bytes of scalars/ssd/cost_siglast, then level_tree, then two banks of 8 nodes, plus alignment slack
- SUB rsp, pad
- DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
- %if WIN64
- %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
- %else
- %define level_statem rsp+stack_offset+32
- %endif
- %define b_acm r11m ; 4x4 only
- %define b_interlacedm r11m ; 8x8 only
- %define i_coefsm1 r11m ; dc only
- %define fenc_dctm r12m
- %define psy_trellism r13m
- %if num_coefs == 64
- shl dword b_interlacedm, 6 ; scale the flag in place by 64 so it can be added to i as a table offset
- %define dct_weight1_tab dct8_weight_tab
- %define dct_weight2_tab dct8_weight2_tab
- %else
- %define dct_weight1_tab dct4_weight_tab
- %define dct_weight2_tab dct4_weight2_tab
- %endif
- %define stack rsp
- %define last_nnzm [stack+0]
- %define zigzagm [stack+8]
- mov last_nnzm, iid
- mov zigzagm, zigzagq
- %if WIN64 == 0
- %define orig_coefsm [stack+16]
- %define quant_coefsm [stack+24]
- mov orig_coefsm, orig_coefsq
- mov quant_coefsm, quant_coefsq
- %endif
- %define unquant_mfm [stack+32]
- %define levelgt1_ctxm [stack+40]
- %define ssd stack+48
- %define cost_siglast stack+80
- %define level_tree stack+96
- ; trellis_node_t is laid out differently than C.
- ; struct-of-arrays rather than array-of-structs, for simd.
- %define nodes_curq r7
- %define nodes_prevq r8
- %define node_score(x) x*8
- %define node_level_idx(x) 64+x*4
- %define node_cabac_state(x) 96+x*4
- lea nodes_curq, [level_tree + level_tree_size]
- lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
- mov r6, TRELLIS_SCORE_BIAS
- mov [nodes_curq + node_score(0)], r6 ; node 0 starts at the bias score with level_idx 0
- mov dword [nodes_curq + node_level_idx(0)], 0
- movd mm0, [level_statem + 0]
- punpcklbw mm0, [level_statem + 4]
- punpcklwd mm0, [level_statem + 8]
- %define level_state_packed mm0 ; version for copying into node.cabac_state
- pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
- movq [nodes_curq + node_score(1)], m7
- mova [nodes_curq + node_score(2)], m7
- %define levels_usedq r4
- %define levels_usedd r4d
- mov dword [level_tree], 0 ; entry 0 is zeroed; new level_tree entries are allocated from index 1
- mov levels_usedd, 1
- %define abs_levelq r9
- %define abs_leveld r9d
- %define abs_coefq r14
- %define zigzagiq r5
- %define zigzagid r5d
- %if num_coefs == 8
- mov dword levelgt1_ctxm, 8
- %else
- mov dword levelgt1_ctxm, 9
- %endif
- %if psy
- LOAD_DUP m6, psy_trellism
- %define psy_trellis m6
- %elif dc
- LOAD_DUP m6, [unquant_mfq]
- paddd m6, m6
- %define unquant_mf m6
- %endif
- %ifdef PIC
- %if dc == 0
- mov unquant_mfm, unquant_mfq
- %endif
- ; Keep a single offset register to PICify all global constants.
- ; They're all relative to "beginning of this asm file's .text section",
- ; even tables that aren't in this file.
- ; (Any address in .text would work, this one was just convenient.)
- lea r0, [$$]
- %define GLOBAL +r0-$$
- %else
- %define GLOBAL
- %endif
- TRELLIS_LOOP 0 ; node_ctx 0..3
- TRELLIS_LOOP 1 ; node_ctx 1..7
- .writeback:
- ; int level = bnode->level_idx;
- ; for( int i = b_ac; i <= last_nnz; i++ )
- ; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
- ; level = level_tree[level].next;
- mov iid, last_nnzm
- add zigzagq, iiq
- neg iiq
- %if num_coefs == 16 && dc == 0
- mov r2d, b_acm
- add iiq, r2
- %endif
- %define dctq r10 ; r10 was loaded from dctm by trellis_bnode just before its ret
- mov r0d, [nodes_curq + node_level_idx(0) + rax*4] ; rax = best node index returned by trellis_bnode
- .writeback_loop:
- movzx r2, byte [zigzagq + iiq]
- %if cpuflag(ssse3)
- movd m0, [level_tree + r0*4]
- movzx r0, word [level_tree + r0*4] ; low 16 bits of an entry = index of the next entry
- psrld m0, 16 ; high 16 bits of an entry = abs_level
- movd m1, [dctq + r2*SIZEOF_DCTCOEF]
- %if HIGH_BIT_DEPTH
- psignd m0, m1
- movd [dctq + r2*SIZEOF_DCTCOEF], m0
- %else
- psignw m0, m1
- movd r4d, m0
- mov [dctq + r2*SIZEOF_DCTCOEF], r4w
- %endif
- %else
- mov r5d, [level_tree + r0*4]
- %if HIGH_BIT_DEPTH
- mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
- %else
- movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
- %endif
- movzx r0d, r5w
- sar r4d, 31 ; r4d = 0 or -1 from the sign of the existing coefficient
- shr r5d, 16
- xor r5d, r4d ; xor then sub with the sign mask negates abs_level when the mask is -1
- sub r5d, r4d
- %if HIGH_BIT_DEPTH
- mov [dctq + r2*SIZEOF_DCTCOEF], r5d
- %else
- mov [dctq + r2*SIZEOF_DCTCOEF], r5w
- %endif
- %endif
- inc iiq
- jle .writeback_loop ; ii runs from a negative offset up to and including 0
- mov eax, 1
- .return: ; eax = 1 after writeback, or 0 when reached from the jz that follows trellis_bnode
- ADD rsp, pad
- RET
- %if num_coefs == 16 && dc == 0
- .return_zero: ; taken when trellis_bnode returns node index 0: zero the coefficients at r10 and return 0
- pxor m0, m0
- mova [r10+ 0], m0
- mova [r10+16], m0
- %if HIGH_BIT_DEPTH
- mova [r10+32], m0
- mova [r10+48], m0
- %endif
- jmp .return
- %endif
- %endmacro ; TRELLIS
- %macro TRELLIS_LOOP 1 ; ctx_hi: 0 = loop handling node_ctx 0..3, 1 = loop handling node_ctx 1..7
- .i_loop%1: ; one iteration per coefficient, with ii counting down
- ; if( !quant_coefs[i] )
- mov r6, quant_coefsm
- %if HIGH_BIT_DEPTH
- mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
- %else
- movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
- %endif
- ; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
- ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
- mov r10, cabac_state_sigm ; r10 = cabac_state_sig base, replaced below by the state byte at sigindex
- %if num_coefs == 64
- mov r6d, b_interlacedm
- %ifdef PIC
- add r6d, iid
- movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
- %else
- movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
- %endif
- movzx r10, byte [r10 + r6]
- %elif num_coefs == 8
- movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL] ; r13 is reused below as the lastindex for this block type
- movzx r10, byte [r10 + r13]
- %else
- movzx r10, byte [r10 + iiq]
- %endif
- test abs_leveld, abs_leveld
- jnz %%.nonzero_quant_coef
- %if %1 == 0
- ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
- ; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
- ; nodes_cur[0].score -= cost_sig0;
- movzx r10, word [cabac_entropy + r10*2 GLOBAL]
- imul r10, lambda2q
- shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
- sub [nodes_curq + node_score(0)], r10
- %endif
- ZERO_LEVEL_IDX %1, cur
- jmp .i_continue%1
- %%.nonzero_quant_coef:
- ; int sign_coef = orig_coefs[zigzag[i]];
- ; int abs_coef = abs( sign_coef );
- ; int q = abs( quant_coefs[i] );
- movzx zigzagid, byte [zigzagq+iiq]
- movd m0, abs_leveld ; still the signed quantized coefficient at this point
- mov r6, orig_coefsm
- %if HIGH_BIT_DEPTH
- LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
- %else
- LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2] ; load 2 bytes early so the 16-bit coef sits in the high half of the dword
- psrad m1, 16 ; sign_coef
- %endif
- punpcklqdq m0, m0 ; quant_coef
- %if cpuflag(ssse3)
- pabsd m0, m0
- pabsd m2, m1 ; abs_coef
- %else
- pxor m8, m8
- pcmpgtd m8, m1 ; sign_mask
- pxor m0, m8
- pxor m2, m1, m8
- psubd m0, m8
- psubd m2, m8
- %endif
- psubd m0, [sq_1] ; abs_level: low lane = q-1, high lane = q
- movd abs_leveld, m0 ; scalar copy of the smaller candidate
- xchg nodes_curq, nodes_prevq ; swap node banks: the previous iteration's results become nodes_prev
- ; if( i < num_coefs-1 )
- ; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
- ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i
- ; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
- ; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
- ; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
- ; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
- %if %1 == 0
- %if dc && num_coefs != 8
- cmp iid, i_coefsm1
- %else
- cmp iid, num_coefs-1
- %endif
- je %%.zero_siglast
- %endif
- movzx r11, word [cabac_entropy + r10*2 GLOBAL]
- xor r10, 1 ; flip the low bit of the state to look up the cost of the other decision
- movzx r12, word [cabac_entropy + r10*2 GLOBAL] ; r12 = cost_sig1
- mov [cost_siglast+0], r11d
- mov r10, cabac_state_lastm
- %if num_coefs == 64
- movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
- movzx r10, byte [r10 + r6]
- %elif num_coefs == 8
- movzx r10, byte [r10 + r13]
- %else
- movzx r10, byte [r10 + iiq]
- %endif
- movzx r11, word [cabac_entropy + r10*2 GLOBAL]
- add r11, r12
- mov [cost_siglast+4], r11d
- %if %1 == 0
- xor r10, 1
- movzx r10, word [cabac_entropy + r10*2 GLOBAL]
- add r10, r12
- mov [cost_siglast+8], r10d
- %endif
- %%.skip_siglast:
- ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
- ; int d = abs_coef - unquant_abs_level;
- ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
- %if dc
- pmuludq m0, unquant_mf
- %else
- %ifdef PIC
- mov r10, unquant_mfm
- LOAD_DUP m3, [r10 + zigzagiq*4]
- %else
- LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
- %endif
- pmuludq m0, m3
- %endif
- paddd m0, [pq_128]
- psrld m0, 8 ; unquant_abs_level
- %if psy || dc == 0
- mova m4, m0 ; keep unquant_abs_level for the psy and DC-rounding paths below
- %endif
- psubd m0, m2
- SQUARE 0, 3
- %if dc
- psllq m0, 8
- %else
- LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
- pmuludq m0, m5
- %endif
- %if psy
- test iid, iid
- jz %%.dc_rounding
- ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
- ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
- ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
- ; ssd1[k] -= psy_weight * psy_value;
- mov r6, fenc_dctm
- %if HIGH_BIT_DEPTH
- LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
- %else
- LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
- psrad m3, 16 ; orig_coef
- %endif
- %if cpuflag(ssse3)
- psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
- %else
- PSIGN d, m4, m8
- %endif
- psubd m3, m1 ; predicted_coef
- paddd m4, m3
- %if cpuflag(ssse3)
- pabsd m4, m4
- %else
- ABSD m3, m4
- SWAP 4, 3
- %endif
- LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
- pmuludq m1, psy_trellis
- pmuludq m4, m1
- psubq m0, m4
- %if %1
- %%.dc_rounding:
- %endif
- %endif
- %if %1 == 0
- mova [ssd], m0
- %endif
- %if dc == 0 && %1 == 0
- test iid, iid
- jnz %%.skip_dc_rounding
- %%.dc_rounding:
- ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
- ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
- ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
- psrad m1, 31 ; sign_coef>>31
- paddd m4, [pd_8]
- paddd m4, m1
- pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
- psubd m4, m2 ; d
- SQUARE 4, 3
- pmuludq m4, m5
- mova [ssd], m4
- %%.skip_dc_rounding:
- %endif
- mova [ssd+16], m0
- %assign stack_offset_bak stack_offset ; snapshot consumed by clocal when the helper routines are defined
- cmp abs_leveld, 1 ; flags feed the jl here and the je further down; the moves in between leave flags untouched
- jl %%.switch_coef0
- %if %1 == 0
- mov r10, [ssd] ; trellis_coef* args
- %endif
- movq r12, m0
- ; for( int j = 0; j < 8; j++ )
- ; nodes_cur[j].score = TRELLIS_SCORE_MAX;
- %if cpuflag(ssse3)
- mova [nodes_curq + node_score(0)], m7
- mova [nodes_curq + node_score(2)], m7
- %else ; avoid store-forwarding stalls on k8/k10
- %if %1 == 0
- movq [nodes_curq + node_score(0)], m7
- %endif
- movq [nodes_curq + node_score(1)], m7
- movq [nodes_curq + node_score(2)], m7
- movq [nodes_curq + node_score(3)], m7
- %endif
- mova [nodes_curq + node_score(4)], m7
- mova [nodes_curq + node_score(6)], m7
- je %%.switch_coef1 ; still the result of the cmp against 1 above
- %%.switch_coefn:
- call trellis_coefn.entry%1
- call trellis_coefn.entry%1b
- jmp .i_continue1 ; always continues in the ctx_hi loop, whichever loop this expansion belongs to
- %%.switch_coef1:
- call trellis_coef1.entry%1
- call trellis_coefn.entry%1b
- jmp .i_continue1
- %%.switch_coef0:
- call trellis_coef0_%1
- call trellis_coef1.entry%1b ; relies on r11 left by trellis_coef0
- .i_continue%1:
- dec iid
- %if num_coefs == 16 && dc == 0
- cmp iid, b_acm
- %endif
- jge .i_loop%1
- call trellis_bnode_%1 ; returns the best node index in eax, with ZF set when that index is 0
- %if %1 == 0
- %if num_coefs == 16 && dc == 0
- jz .return_zero
- %else
- jz .return
- %endif
- jmp .writeback
- %%.zero_siglast: ; out-of-line path for the last coefficient position: no sig/last cost applies
- xor r6d, r6d
- mov [cost_siglast+0], r6 ; 64-bit store clears cost_siglast[0] and cost_siglast[1] together
- mov [cost_siglast+8], r6d
- jmp %%.skip_siglast
- %endif
- %endmacro ; TRELLIS_LOOP
- ; just a synonym for %if: the macro name is formed as IF followed by a 0/1 flag, and the rest of the line is the payload
- %macro IF0 1+ ; flag is 0: the payload is dropped
- %endmacro
- %macro IF1 1+ ; flag is 1: the payload is emitted unchanged
- %1
- %endmacro
- %macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev: second argument names the node bank (cur or prev) that supplies the old level_idx values
- ; for( int j = 0; j < 8; j++ )
- ; nodes_cur[j].level_idx = levels_used;
- ; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
- ; levels_used++;
- add levels_usedd, 3
- and levels_usedd, ~3 ; allow aligned stores
- movd m0, levels_usedd
- pshufd m0, m0, 0 ; broadcast levels_used to all four dwords
- IF%1 mova m1, m0
- paddd m0, [pd_0123] ; presumably adds 0,1,2,3 per lane (external constant) to form four consecutive indices
- IF%1 paddd m1, [pd_4567]
- mova m2, [nodes_%2q + node_level_idx(0)] ; old level_idx of nodes 0..3
- IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
- mova [nodes_curq + node_level_idx(0)], m0 ; nodes now point at the freshly allocated entries
- IF%1 mova [nodes_curq + node_level_idx(4)], m1
- mova [level_tree + (levels_usedq+0)*4], m2 ; the new entries are written with the old indices
- IF%1 mova [level_tree + (levels_usedq+4)*4], m3
- add levels_usedd, (1+%1)*4 ; 4 entries consumed, or 8 when ctx_hi is set
- %endmacro
- INIT_XMM sse2 ; first set of the six entry points, expanded with the sse2 cpuflag
- TRELLIS trellis_cabac_4x4, 16, 0, 0
- TRELLIS trellis_cabac_8x8, 64, 0, 0
- TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
- TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
- TRELLIS trellis_cabac_dc, 16, 1, 0
- TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
- INIT_XMM ssse3 ; same six entry points again, expanded with the ssse3 cpuflag
- TRELLIS trellis_cabac_4x4, 16, 0, 0
- TRELLIS trellis_cabac_8x8, 64, 0, 0
- TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
- TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
- TRELLIS trellis_cabac_dc, 16, 1, 0
- TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
- %define stack rsp+gprsize ; the helpers below are entered by call, so rsp sits one return address lower than in the TRELLIS body
- %define scoreq r14 ; candidate node score accumulator used by the helper macros
- %define bitsq r13 ; bit-cost accumulator used by the helper macros
- %define bitsd r13d
- INIT_XMM ; no cpu name given: the helper routines below are defined only once and are called from every TRELLIS expansion
- %macro clocal 1 ; name: open a 16-byte aligned helper routine with both a mangled global label and a plain label
- ALIGN 16
- global mangle(private_prefix %+ _%1)
- mangle(private_prefix %+ _%1):
- %1:
- %assign stack_offset stack_offset_bak+gprsize ; TRELLIS_LOOP's snapshot plus the return address pushed by call
- %endmacro
- %macro TRELLIS_BNODE 1 ; ctx_hi: emits trellis_bnode_0 (scans nodes 0..3) or trellis_bnode_1 (scans nodes 1..7)
- clocal trellis_bnode_%1
- ; int j = ctx_hi?1:0;
- ; trellis_node_t *bnode = &nodes_cur[j];
- ; while( ++j < (ctx_hi?8:4) )
- ; if( nodes_cur[j].score < bnode->score )
- ; bnode = &nodes_cur[j];
- %assign j %1
- mov rax, [nodes_curq + node_score(j)]
- lea rax, [rax*8 + j] ; pack score*8 + index so a single compare selects the node and carries its index
- %rep 3+3*%1
- %assign j j+1
- mov r11, [nodes_curq + node_score(j)]
- lea r11, [r11*8 + j]
- cmp rax, r11
- cmova rax, r11 ; unsigned compare: keep the smaller packed value
- %endrep
- mov r10, dctm ; dct pointer for the caller's writeback / return_zero code
- and eax, 7 ; eax = winning node index; the resulting ZF is tested by the caller after ret
- ret
- %endmacro ; TRELLIS_BNODE
- TRELLIS_BNODE 0
- TRELLIS_BNODE 1
- %macro TRELLIS_COEF0 1 ; ctx_hi: emits trellis_coef0_0 or trellis_coef0_1, the handler for a candidate abs_level of 0
- clocal trellis_coef0_%1
- ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
- mov r11d, [cost_siglast+0]
- imul r11, lambda2q
- shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
- add r11, [ssd+16] ; r11 stays live after ret: trellis_coef1.entry0b / entry1b subtract it from their ssd inputs
- %if %1 == 0
- ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
- mov scoreq, [nodes_prevq + node_score(0)]
- add scoreq, [ssd]
- sub scoreq, r11
- mov [nodes_curq + node_score(0)], scoreq
- %endif
- ; memcpy of the remaining node scores and cabac states from nodes_prev to nodes_cur
- mov scoreq, [nodes_prevq + node_score(1)]
- mov [nodes_curq + node_score(1)], scoreq
- mova m1, [nodes_prevq + node_score(2)]
- mova [nodes_curq + node_score(2)], m1
- %if %1
- mova m1, [nodes_prevq + node_score(4)]
- mova [nodes_curq + node_score(4)], m1
- mova m1, [nodes_prevq + node_score(6)]
- mova [nodes_curq + node_score(6)], m1
- %endif
- mov r6d, [nodes_prevq + node_cabac_state(3)]
- mov [nodes_curq + node_cabac_state(3)], r6d
- %if %1
- mova m1, [nodes_prevq + node_cabac_state(4)]
- mova [nodes_curq + node_cabac_state(4)], m1
- %endif
- ZERO_LEVEL_IDX %1, prev
- ret
- %endmacro ; TRELLIS_COEF0
- TRELLIS_COEF0 0
- TRELLIS_COEF0 1
- %macro START_COEF 1 ; gt1: 1 when coding abs_level > 1. Uses j, nextj_if_invalid and coeff_abs_level1_offs assigned by COEF1/COEFN
- ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
- mov scoreq, [nodes_prevq + node_score(j)]
- %if j > 0
- test scoreq, scoreq
- js .ctx %+ nextj_if_invalid ; a score with the sign bit set marks an unusable node
- %endif
- ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
- %if j >= 3
- movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
- movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL] ; updated state, stored later by END_COEF if this candidate wins
- %else
- movzx r6d, byte [level_statem + coeff_abs_level1_offs]
- %endif
- %if %1
- xor r6d, 1
- %endif
- movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
- ; n.score += ssd;
- ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
- %if j == 0
- add scoreq, r10 ; r10 carries the ssd used for node 0
- add bitsd, [cost_siglast+8]
- %else
- add scoreq, r12 ; r12 carries the ssd used for every other node
- add bitsd, [cost_siglast+4]
- %endif
- %endmacro ; START_COEF
- %macro END_COEF 1 ; gt1: finish the candidate begun by START_COEF and commit it to nodes_cur[node_ctx] when it scores lower
- ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
- imul bitsq, lambda2q
- shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
- add scoreq, bitsq
- ; if( n.score < nodes_cur[node_ctx].score )
- ; SET_LEVEL( n, abs_level );
- ; nodes_cur[node_ctx] = n;
- cmp scoreq, [nodes_curq + node_score(node_ctx)]
- jae .ctx %+ nextj_if_valid ; unsigned compare: not better than the stored score, move on
- mov [nodes_curq + node_score(node_ctx)], scoreq
- %if j == 2 || (j <= 3 && node_ctx == 4)
- ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
- movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
- %elif j >= 3
- ; if we have updated before, then copy cabac_state from the parent node
- mov r6d, [nodes_prevq + node_cabac_state(j)]
- mov [nodes_curq + node_cabac_state(node_ctx)], r6d
- %endif
- %if j >= 3 ; skip the transition if we're not going to reuse the context
- mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
- %endif
- %if %1 && node_ctx == 7
- mov r6d, levelgt1_ctxm
- mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b ; r10 = unary transition result computed in COEFN
- %endif
- mov r6d, [nodes_prevq + node_level_idx(j)]
- %if %1
- mov r11d, abs_leveld
- shl r11d, 16 ; new level_tree entry = abs_level in the high 16 bits, parent index in the low 16 bits
- or r6d, r11d
- %else
- or r6d, 1<<16 ; same entry layout with abs_level fixed at 1
- %endif
- mov [level_tree + levels_usedq*4], r6d
- mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
- inc levels_usedd
- %endmacro ; END_COEF
- %macro COEF1 2 ; j, nextj_if_invalid: emit the abs_level == 1 update that starts from nodes_prev[j]
- %assign j %1
- %assign nextj_if_valid %1+1
- %assign nextj_if_invalid %2
- %if j < 4
- %assign coeff_abs_level1_offs j+1
- %else
- %assign coeff_abs_level1_offs 0
- %endif
- %if j < 3
- %assign node_ctx j+1
- %else
- %assign node_ctx j
- %endif
- .ctx %+ j: ; jump target used by the preceding step's skip branches
- START_COEF 0
- add bitsd, 1 << CABAC_SIZE_BITS
- END_COEF 0
- %endmacro ; COEF1
- %macro COEFN 2 ; j, next j: emit the abs_level > 1 update that starts from nodes_prev[j]
- %assign j %1
- %assign nextj_if_valid %2
- %assign nextj_if_invalid %2
- %if j < 4
- %assign coeff_abs_level1_offs j+1
- %assign coeff_abs_levelgt1_offs 5
- %else
- %assign coeff_abs_level1_offs 0
- %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
- %endif
- %if j < 4
- %assign node_ctx 4
- %elif j < 7
- %assign node_ctx j+1
- %else
- %assign node_ctx 7
- %endif
- .ctx %+ j:
- START_COEF 1
- ; if( abs_level >= 15 )
- ; bits += bs_size_ue_big(...)
- add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
- ; n.cabac_state[levelgt1_ctx]
- %if j == 7 ; && compiling support for 4:2:2
- mov r6d, levelgt1_ctxm
- %define coeff_abs_levelgt1_offs r6
- %endif
- %if j == 7
- movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
- %else
- movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
- %endif
- ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
- add r10d, r1d ; r1d was set up by COEFN_PREFIX as the row offset (level scaled by 128)
- movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
- add bitsd, r6d
- %if node_ctx == 7
- movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL] ; next state, stored by END_COEF if this candidate wins
- %endif
- END_COEF 1
- %endmacro ; COEFN
- clocal trellis_coef1
- .entry0b: ; ctx_lo, larger of the two abs_level candidates
- mov r10, [ssd+8]
- sub r10, r11 ; r11 was left by trellis_coef0 (the caller pairs coef0 with this entry)
- mov r12, [ssd+24]
- sub r12, r11
- .entry0: ; ctx_lo, smaller of the two abs_level candidates
- COEF1 0, 4 ; second argument 4: an invalid source node jumps straight to the return at .ctx4
- COEF1 1, 4
- COEF1 2, 4
- COEF1 3, 4
- .ctx4:
- rep ret
- .entry1b: ; ctx_hi, larger of the two abs_level candidates
- mov r12, [ssd+24]
- sub r12, r11
- .entry1: ; ctx_hi, smaller of the two abs_level candidates
- trellis_coef1_hi: ; new non-local label, so the .ctx labels below get a local scope separate from the ctx_lo ones above
- COEF1 1, 2 ; here an invalid source node only skips to the next j
- COEF1 2, 3
- COEF1 3, 4
- COEF1 4, 5
- COEF1 5, 6
- COEF1 6, 7
- COEF1 7, 8
- .ctx8:
- rep ret
- %macro COEFN_PREFIX 1 ; ctx_hi: set r1d (unary-table row offset) and r5d (escape suffix cost) before the COEFN chain
- ; int prefix = X264_MIN( abs_level - 1, 14 );
- mov r1d, abs_leveld
- cmp abs_leveld, 15
- jge .level_suffix%1 ; large levels take the out-of-line COEFN_SUFFIX path, which jumps back below
- xor r5d, r5d ; no suffix cost for abs_level below 15
- .skip_level_suffix%1:
- shl r1d, 7 ; scale by 128; COEFN compensates with the -128 in its table index
- %endmacro
- %macro COEFN_SUFFIX 1 ; ctx_hi: out-of-line continuation of COEFN_PREFIX for abs_level >= 15
- .level_suffix%1:
- ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
- lea r5d, [abs_levelq-14]
- bsr r5d, r5d
- shl r5d, CABAC_SIZE_BITS+1 ; r5d = (2*bsr(abs_level-14) + 1) << CABAC_SIZE_BITS once the add below is applied
- add r5d, 1<<CABAC_SIZE_BITS
- ; int prefix = X264_MIN( abs_level - 1, 14 );
- mov r1d, 15 ; clamp so that the shared shl/-128 indexing selects row 14
- jmp .skip_level_suffix%1
- %endmacro
- clocal trellis_coefn
- .entry0b: ; ctx_lo, larger candidate: reload the second ssd lanes and bump abs_level
- mov r10, [ssd+8]
- mov r12, [ssd+24]
- inc abs_leveld
- .entry0: ; ctx_lo, smaller candidate
- ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
- ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
- ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
- ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
- ; The C version has to be fully separate since C doesn't support multiple
- ; entrypoints. But return-on-first-failure isn't very important here (as
- ; opposed to coef1), so I might as well reduce codesize.
- COEFN_PREFIX 0
- COEFN 0, 1
- COEFN 1, 2
- COEFN 2, 3
- COEFN 3, 8
- .ctx8:
- mov zigzagq, zigzagm ; unspill since r1 was clobbered
- ret
- .entry1b: ; ctx_hi, larger candidate
- mov r12, [ssd+24]
- inc abs_leveld
- .entry1: ; ctx_hi, smaller candidate
- COEFN_PREFIX 1
- COEFN 4, 5
- COEFN 5, 6
- COEFN 6, 7
- COEFN 7, 1
- jmp .ctx1 ; ctx_hi order is j = 4..7, then j = 1..3 through the shared ctx_lo code, returning at .ctx8
- COEFN_SUFFIX 0 ; placed after the jmp, so reachable only through the jge in COEFN_PREFIX
- COEFN_SUFFIX 1
|