/*****************************************************************************
 * quant.c: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2018 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"

#if !HIGH_BIT_DEPTH

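/* Each coefficient is quantized as sign(c) * (((|c| + bias) * mf) >> 16).
 * Ignoring the saturating adds/packs, the macro below is in effect this
 * scalar sketch:
 *
 *     int c = abs( dct[i] );
 *     c = (uint32_t)(c + bias[i]) * mf[i] >> 16;
 *     dct[i] = dct[i] < 0 ? -c : c;
 *
 * vec_mule/vec_mulo return the even- and odd-indexed 32-bit products, so
 * after the shift the results are packed back to 16 bits and re-interleaved
 * into element order (vec_packs, then xxpermdi + vec_mergeh). The sign is
 * restored by two's-complement negation under the vec_cmplt mask:
 * x ^ 0xffff == ~x, and adding (mask & 1) supplies the +1 only in the
 * negative lanes. */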
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U( idx0, idx1 )                                \
{                                                               \
    temp1v = vec_ld((idx0), dct);                               \
    temp2v = vec_ld((idx1), dct);                               \
    mfvA = vec_ld((idx0), mf);                                  \
    mfvB = vec_ld((idx1), mf);                                  \
    biasvA = vec_ld((idx0), bias);                              \
    biasvB = vec_ld((idx1), bias);                              \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    mskB = vec_cmplt(temp2v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_abs( temp1v );                      \
    coefvB = (vec_u16_t)vec_abs( temp2v );                      \
    coefvA = vec_adds(coefvA, biasvA);                          \
    coefvB = vec_adds(coefvB, biasvB);                          \
    multEvenvA = vec_mule(coefvA, mfvA);                        \
    multOddvA = vec_mulo(coefvA, mfvA);                         \
    multEvenvB = vec_mule(coefvB, mfvB);                        \
    multOddvB = vec_mulo(coefvB, mfvB);                         \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    temp1v = (vec_s16_t) vec_packs( multEvenvA, multOddvA );    \
    tmpv = xxpermdi( temp1v, temp1v, 2 );                       \
    temp1v = vec_mergeh( temp1v, tmpv );                        \
    temp2v = (vec_s16_t) vec_packs( multEvenvB, multOddvB );    \
    tmpv = xxpermdi( temp2v, temp2v, 2 );                       \
    temp2v = vec_mergeh( temp2v, tmpv );                        \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));              \
    vec_st(temp1v, (idx0), dct);                                \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));              \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                    \
    vec_st(temp2v, (idx1), dct);                                \
}

int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;
    vec_s16_t temp1v, temp2v, tmpv;

    i_qbitsv = vec_splats( (uint32_t)16 );

    QUANT_16_U( 0, 16 );

    return vec_any_ne(nz, zero_s16v);
}

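/* The DC variants apply the same formula with a single mf/bias pair
 * splatted across the vector, since every coefficient of a DC block shares
 * one scale. The repack here uses vec_mergeh/vec_mergel on the even/odd
 * products rather than the xxpermdi shuffle above; both sequences put the
 * narrowed 16-bit results back in element order. */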
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 )                             \
{                                                               \
    temp1v = vec_ld((idx0), dct);                               \
    temp2v = vec_ld((idx1), dct);                               \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    mskB = vec_cmplt(temp2v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \
    coefvA = vec_add(coefvA, biasv);                            \
    coefvB = vec_add(coefvB, biasv);                            \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvB = vec_mule(coefvB, mfv);                         \
    multOddvB = vec_mulo(coefvB, mfv);                          \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_add(temp1v, vec_and(mskA, one));               \
    vec_st(temp1v, (idx0), dct);                                \
    temp2v = vec_add(temp2v, vec_and(mskB, one));               \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                    \
    vec_st(temp2v, (idx1), dct);                                \
}

int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    mfv = vec_splats( (uint16_t)mf );
    i_qbitsv = vec_splats( (uint32_t)16 );
    biasv = vec_splats( (uint16_t)bias );

    QUANT_16_U_DC( 0, 16 );

    return vec_any_ne(nz, zero_s16v);
}

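/* A 2x2 DC block occupies only the first four int16_t of the 8-element
 * vector, so the quantized result is blended over the original contents
 * with vec_sel and the nonzero test is masked to those four lanes; the
 * upper half of the 16-byte store is written back unchanged. */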
// DC quant of a whole 2x2 block
#define QUANT_4_U_DC( idx0 )                                    \
{                                                               \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);  \
    temp1v = vec_ld((idx0), dct);                               \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvA = vec_add(coefvA, biasv);                            \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                             \
    temp2v = vec_add(temp2v, vec_and(mskA, one));               \
    temp1v = vec_sel(temp1v, temp2v, sel);                      \
    nz = vec_or(nz, temp1v);                                    \
    vec_st(temp1v, (idx0), dct);                                \
}

int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;
    vec_s16_t temp1v, temp2v;
    vec_u16_t mfv;
    vec_u16_t biasv;
    static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);

    mfv = vec_splats( (uint16_t)mf );
    i_qbitsv = vec_splats( (uint32_t)16 );
    biasv = vec_splats( (uint16_t)bias );

    QUANT_4_U_DC(0);

    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}

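/* An 8x8 block is 128 bytes (eight vectors); it is quantized as four
 * unrolled passes of the two-vector 4x4 macro. */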
int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;
    vec_s16_t temp1v, temp2v, tmpv;

    i_qbitsv = vec_splats( (uint32_t)16 );

    for( int i = 0; i < 4; i++ )
        QUANT_16_U( i*2*16, i*2*16+16 );

    return vec_any_ne(nz, zero_s16v);
}

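/* Dequantization scales each coefficient by the 32-bit table entry
 * dequant_mf[i_mf][i] and then applies the qp-dependent shift:
 *
 *     i_qbits >= 0:  dct[i] = (dct[i] * mf) << i_qbits
 *     i_qbits <  0:  dct[i] = (dct[i] * mf + f) >> (-i_qbits),
 *                    with f = 1 << (-i_qbits - 1) for rounding
 *
 * In the shift-left case the table entries are small enough to pack down
 * to 16 bits (vec_packs), so a single 16x16->32 multiply per lane suffices
 * before the results are narrowed, re-interleaved and shifted. */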
#define DEQUANT_SHL()                                           \
{                                                               \
    dctv = vec_ld(8*y, dct);                                    \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                      \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                   \
    mfv  = vec_packs(mf1v, mf2v);                               \
                                                                \
    multEvenvA = vec_mule(dctv, mfv);                           \
    multOddvA = vec_mulo(dctv, mfv);                            \
    dctv = (vec_s16_t) vec_packs( multEvenvA, multOddvA );      \
    tmpv = xxpermdi( dctv, dctv, 2 );                           \
    dctv = vec_mergeh( dctv, tmpv );                            \
    dctv = vec_sl(dctv, i_qbitsv);                              \
    vec_st(dctv, 8*y, dct);                                     \
}

#ifdef WORDS_BIGENDIAN
#define VEC_MULE vec_mule
#define VEC_MULO vec_mulo
#else
#define VEC_MULE vec_mulo
#define VEC_MULO vec_mule
#endif

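/* In the shift-right case the full product of a 16-bit coefficient and a
 * 32-bit multiplier is needed. Each coefficient is duplicated into both
 * halfwords of a 32-bit lane (vec_mergeh/vec_mergel of dctv with itself)
 * and multiplied against the high and low halves of the table entry; the
 * partial products are recombined as (hi << 16) + lo. VEC_MULE/VEC_MULO
 * are swapped on little-endian because the halfword order within each
 * 32-bit lane is reversed there. */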
#define DEQUANT_SHR()                                           \
{                                                               \
    dctv = vec_ld(8*y, dct);                                    \
    dct1v = vec_mergeh(dctv, dctv);                             \
    dct2v = vec_mergel(dctv, dctv);                             \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                      \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                   \
                                                                \
    multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v);              \
    multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v);               \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);  \
    temp1v = vec_add(temp1v, fv);                               \
    temp1v = vec_sra(temp1v, i_qbitsv);                         \
                                                                \
    multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v);              \
    multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v);               \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);  \
    temp2v = vec_add(temp2v, fv);                               \
    temp2v = vec_sra(temp2v, i_qbitsv);                         \
                                                                \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);                \
    vec_st(dctv, 8*y, dct);                                     \
}

void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 4;

    vec_s16_t dctv, tmpv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv = vec_splats( (uint16_t)i_qbits );
        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        vec_s32_t fv = vec_splats( f );
        vec_u32_t i_qbitsv = vec_splats( (uint32_t)-i_qbits );
        vec_u32_t sixteenv = vec_splats( (uint32_t)16 );
        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHR();
    }
}

void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 6;

    vec_s16_t dctv, tmpv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv = vec_splats( (uint16_t)i_qbits );
        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        vec_s32_t fv = vec_splats( f );
        vec_u32_t i_qbitsv = vec_splats( (uint32_t)-i_qbits );
        vec_u32_t sixteenv = vec_splats( (uint32_t)16 );
        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHR();
    }
}

#endif // !HIGH_BIT_DEPTH