;*****************************************************************************
;* trellis-64.asm: x86_64 trellis quantization
;*****************************************************************************
;* Copyright (C) 2012-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

; This is a pretty straight-forward translation of the C code, except:
; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
;   4x parallel, handling 4 node_ctxs of the same coef (even if some of those
;   nodes are invalid).
; * Interprocedural register allocation. Eliminates argument-passing overhead
;   to trellis_coef* subroutines. Also reduces codesize.
; Optimizations that I tried, and rejected because they were not faster:
; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
;   Costs too much icache compared to the negligible speedup.
; * There are only 21 possible sets of live node_ctxs; we could keep track of
;   exactly which set we're in and feed that (along with abs_level) into a jump
;   table instead of the switch to select a trellis_coef subroutine. This would
;   eliminate all branches about which node_ctxs are live, but costs either a
;   bunch of icache or a bunch of call/ret, and the jump table itself is
;   unpredictable.
; * Separate versions of trellis_coef* depending on whether we're doing the 1st
;   or the 2nd of the two abs_level candidates. This would eliminate some
;   branches about if(score is better).
; * Special case more values of coef. I had a coef2 at some intermediate point
;   in the optimization process, but it didn't end up worthwhile in conjunction
;   with all the other optimizations.
; * Unroll or simd writeback. I don't know why this didn't help.
  48. %include "x86inc.asm"
  49. %include "x86util.asm"
  50. SECTION_RODATA
  51. pd_m16: times 4 dd -16
  52. sq_1: dq 1, 0
  53. pq_128: times 2 dq 128
  54. pq_ffffffff: times 2 dq 0xffffffff
  55. cextern pd_8
  56. cextern pd_0123
  57. cextern pd_4567
  58. cextern_common cabac_entropy
  59. cextern_common cabac_transition
  60. cextern cabac_size_unary
  61. cextern cabac_transition_unary
  62. cextern_common dct4_weight_tab
  63. cextern_common dct8_weight_tab
  64. cextern_common dct4_weight2_tab
  65. cextern_common dct8_weight2_tab
  66. cextern_common last_coeff_flag_offset_8x8
  67. cextern_common significant_coeff_flag_offset_8x8
  68. cextern_common coeff_flag_offset_chroma_422_dc
  69. SECTION .text
  70. %define TRELLIS_SCORE_BIAS 1<<60
  71. %define SIZEOF_NODE 16
  72. %define CABAC_SIZE_BITS 8
  73. %define LAMBDA_BITS 4
  74. %macro SQUARE 2 ; dst, tmp
  75. ; could use pmuldq here, to eliminate the abs. but that would involve
  76. ; templating a sse4 version of all of trellis, for negligible speedup.
  77. %if cpuflag(ssse3)
  78. pabsd m%1, m%1
  79. pmuludq m%1, m%1
  80. %elif HIGH_BIT_DEPTH
  81. ABSD m%2, m%1
  82. SWAP %1, %2
  83. pmuludq m%1, m%1
  84. %else
  85. pmuludq m%1, m%1
  86. pand m%1, [pq_ffffffff]
  87. %endif
  88. %endmacro
  89. %macro LOAD_DUP 2 ; dst, src
  90. %if cpuflag(ssse3)
  91. movddup %1, %2
  92. %else
  93. movd %1, %2
  94. punpcklqdq %1, %1
  95. %endif
  96. %endmacro
  97. ;-----------------------------------------------------------------------------
  98. ; int trellis_cabac_4x4_psy(
  99. ; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
  100. ; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
  101. ; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
  102. ; uint64_t level_state0, uint16_t level_state1,
  103. ; int b_ac, dctcoef *fenc_dct, int psy_trellis )
  104. ;-----------------------------------------------------------------------------
  105. %macro TRELLIS 4
  106. %define num_coefs %2
  107. %define dc %3
  108. %define psy %4
  109. cglobal %1, 4,15,9
  110. %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
  111. %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
  112. SUB rsp, pad
  113. DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
  114. %if WIN64
  115. %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
  116. %else
  117. %define level_statem rsp+stack_offset+32
  118. %endif
  119. %define b_acm r11m ; 4x4 only
  120. %define b_interlacedm r11m ; 8x8 only
  121. %define i_coefsm1 r11m ; dc only
  122. %define fenc_dctm r12m
  123. %define psy_trellism r13m
  124. %if num_coefs == 64
  125. shl dword b_interlacedm, 6
  126. %define dct_weight1_tab dct8_weight_tab
  127. %define dct_weight2_tab dct8_weight2_tab
  128. %else
  129. %define dct_weight1_tab dct4_weight_tab
  130. %define dct_weight2_tab dct4_weight2_tab
  131. %endif
  132. %define stack rsp
  133. %define last_nnzm [stack+0]
  134. %define zigzagm [stack+8]
  135. mov last_nnzm, iid
  136. mov zigzagm, zigzagq
  137. %if WIN64 == 0
  138. %define orig_coefsm [stack+16]
  139. %define quant_coefsm [stack+24]
  140. mov orig_coefsm, orig_coefsq
  141. mov quant_coefsm, quant_coefsq
  142. %endif
  143. %define unquant_mfm [stack+32]
  144. %define levelgt1_ctxm [stack+40]
  145. %define ssd stack+48
  146. %define cost_siglast stack+80
  147. %define level_tree stack+96
  148. ; trellis_node_t is layed out differently than C.
  149. ; struct-of-arrays rather than array-of-structs, for simd.
  150. %define nodes_curq r7
  151. %define nodes_prevq r8
  152. %define node_score(x) x*8
  153. %define node_level_idx(x) 64+x*4
  154. %define node_cabac_state(x) 96+x*4
  155. lea nodes_curq, [level_tree + level_tree_size]
  156. lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
  157. mov r6, TRELLIS_SCORE_BIAS
  158. mov [nodes_curq + node_score(0)], r6
  159. mov dword [nodes_curq + node_level_idx(0)], 0
  160. movd mm0, [level_statem + 0]
  161. punpcklbw mm0, [level_statem + 4]
  162. punpcklwd mm0, [level_statem + 8]
  163. %define level_state_packed mm0 ; version for copying into node.cabac_state
  164. pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
  165. movq [nodes_curq + node_score(1)], m7
  166. mova [nodes_curq + node_score(2)], m7
  167. %define levels_usedq r4
  168. %define levels_usedd r4d
  169. mov dword [level_tree], 0
  170. mov levels_usedd, 1
  171. %define abs_levelq r9
  172. %define abs_leveld r9d
  173. %define abs_coefq r14
  174. %define zigzagiq r5
  175. %define zigzagid r5d
  176. %if num_coefs == 8
  177. mov dword levelgt1_ctxm, 8
  178. %else
  179. mov dword levelgt1_ctxm, 9
  180. %endif
  181. %if psy
  182. LOAD_DUP m6, psy_trellism
  183. %define psy_trellis m6
  184. %elif dc
  185. LOAD_DUP m6, [unquant_mfq]
  186. paddd m6, m6
  187. %define unquant_mf m6
  188. %endif
  189. %ifdef PIC
  190. %if dc == 0
  191. mov unquant_mfm, unquant_mfq
  192. %endif
  193. ; Keep a single offset register to PICify all global constants.
  194. ; They're all relative to "beginning of this asm file's .text section",
  195. ; even tables that aren't in this file.
  196. ; (Any address in .text would work, this one was just convenient.)
  197. lea r0, [$$]
  198. %define GLOBAL +r0-$$
  199. %else
  200. %define GLOBAL
  201. %endif
  202. TRELLIS_LOOP 0 ; node_ctx 0..3
  203. TRELLIS_LOOP 1 ; node_ctx 1..7
  204. .writeback:
  205. ; int level = bnode->level_idx;
  206. ; for( int i = b_ac; i <= last_nnz; i++ )
  207. ; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
  208. ; level = level_tree[level].next;
  209. mov iid, last_nnzm
  210. add zigzagq, iiq
  211. neg iiq
  212. %if num_coefs == 16 && dc == 0
  213. mov r2d, b_acm
  214. add iiq, r2
  215. %endif
  216. %define dctq r10
  217. mov r0d, [nodes_curq + node_level_idx(0) + rax*4]
  218. .writeback_loop:
  219. movzx r2, byte [zigzagq + iiq]
  220. %if cpuflag(ssse3)
  221. movd m0, [level_tree + r0*4]
  222. movzx r0, word [level_tree + r0*4]
  223. psrld m0, 16
  224. movd m1, [dctq + r2*SIZEOF_DCTCOEF]
  225. %if HIGH_BIT_DEPTH
  226. psignd m0, m1
  227. movd [dctq + r2*SIZEOF_DCTCOEF], m0
  228. %else
  229. psignw m0, m1
  230. movd r4d, m0
  231. mov [dctq + r2*SIZEOF_DCTCOEF], r4w
  232. %endif
  233. %else
  234. mov r5d, [level_tree + r0*4]
  235. %if HIGH_BIT_DEPTH
  236. mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
  237. %else
  238. movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
  239. %endif
  240. movzx r0d, r5w
  241. sar r4d, 31
  242. shr r5d, 16
  243. xor r5d, r4d
  244. sub r5d, r4d
  245. %if HIGH_BIT_DEPTH
  246. mov [dctq + r2*SIZEOF_DCTCOEF], r5d
  247. %else
  248. mov [dctq + r2*SIZEOF_DCTCOEF], r5w
  249. %endif
  250. %endif
  251. inc iiq
  252. jle .writeback_loop
  253. mov eax, 1
  254. .return:
  255. ADD rsp, pad
  256. RET
  257. %if num_coefs == 16 && dc == 0
  258. .return_zero:
  259. pxor m0, m0
  260. mova [r10+ 0], m0
  261. mova [r10+16], m0
  262. %if HIGH_BIT_DEPTH
  263. mova [r10+32], m0
  264. mova [r10+48], m0
  265. %endif
  266. jmp .return
  267. %endif
  268. %endmacro ; TRELLIS
  269. %macro TRELLIS_LOOP 1 ; ctx_hi
  270. .i_loop%1:
  271. ; if( !quant_coefs[i] )
  272. mov r6, quant_coefsm
  273. %if HIGH_BIT_DEPTH
  274. mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
  275. %else
  276. movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
  277. %endif
  278. ; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
  279. ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
  280. mov r10, cabac_state_sigm
  281. %if num_coefs == 64
  282. mov r6d, b_interlacedm
  283. %ifdef PIC
  284. add r6d, iid
  285. movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
  286. %else
  287. movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
  288. %endif
  289. movzx r10, byte [r10 + r6]
  290. %elif num_coefs == 8
  291. movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
  292. movzx r10, byte [r10 + r13]
  293. %else
  294. movzx r10, byte [r10 + iiq]
  295. %endif
  296. test abs_leveld, abs_leveld
  297. jnz %%.nonzero_quant_coef
  298. %if %1 == 0
  299. ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
  300. ; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
  301. ; nodes_cur[0].score -= cost_sig0;
  302. movzx r10, word [cabac_entropy + r10*2 GLOBAL]
  303. imul r10, lambda2q
  304. shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
  305. sub [nodes_curq + node_score(0)], r10
  306. %endif
  307. ZERO_LEVEL_IDX %1, cur
  308. jmp .i_continue%1
  309. %%.nonzero_quant_coef:
  310. ; int sign_coef = orig_coefs[zigzag[i]];
  311. ; int abs_coef = abs( sign_coef );
  312. ; int q = abs( quant_coefs[i] );
  313. movzx zigzagid, byte [zigzagq+iiq]
  314. movd m0, abs_leveld
  315. mov r6, orig_coefsm
  316. %if HIGH_BIT_DEPTH
  317. LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
  318. %else
  319. LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
  320. psrad m1, 16 ; sign_coef
  321. %endif
  322. punpcklqdq m0, m0 ; quant_coef
  323. %if cpuflag(ssse3)
  324. pabsd m0, m0
  325. pabsd m2, m1 ; abs_coef
  326. %else
  327. pxor m8, m8
  328. pcmpgtd m8, m1 ; sign_mask
  329. pxor m0, m8
  330. pxor m2, m1, m8
  331. psubd m0, m8
  332. psubd m2, m8
  333. %endif
  334. psubd m0, [sq_1] ; abs_level
  335. movd abs_leveld, m0
  336. xchg nodes_curq, nodes_prevq
  337. ; if( i < num_coefs-1 )
  338. ; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
  339. ; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i
  340. ; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
  341. ; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
  342. ; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
  343. ; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
  344. %if %1 == 0
  345. %if dc && num_coefs != 8
  346. cmp iid, i_coefsm1
  347. %else
  348. cmp iid, num_coefs-1
  349. %endif
  350. je %%.zero_siglast
  351. %endif
  352. movzx r11, word [cabac_entropy + r10*2 GLOBAL]
  353. xor r10, 1
  354. movzx r12, word [cabac_entropy + r10*2 GLOBAL]
  355. mov [cost_siglast+0], r11d
  356. mov r10, cabac_state_lastm
  357. %if num_coefs == 64
  358. movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
  359. movzx r10, byte [r10 + r6]
  360. %elif num_coefs == 8
  361. movzx r10, byte [r10 + r13]
  362. %else
  363. movzx r10, byte [r10 + iiq]
  364. %endif
  365. movzx r11, word [cabac_entropy + r10*2 GLOBAL]
  366. add r11, r12
  367. mov [cost_siglast+4], r11d
  368. %if %1 == 0
  369. xor r10, 1
  370. movzx r10, word [cabac_entropy + r10*2 GLOBAL]
  371. add r10, r12
  372. mov [cost_siglast+8], r10d
  373. %endif
  374. %%.skip_siglast:
  375. ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
  376. ; int d = abs_coef - unquant_abs_level;
  377. ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
  378. %if dc
  379. pmuludq m0, unquant_mf
  380. %else
  381. %ifdef PIC
  382. mov r10, unquant_mfm
  383. LOAD_DUP m3, [r10 + zigzagiq*4]
  384. %else
  385. LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
  386. %endif
  387. pmuludq m0, m3
  388. %endif
  389. paddd m0, [pq_128]
  390. psrld m0, 8 ; unquant_abs_level
  391. %if psy || dc == 0
  392. mova m4, m0
  393. %endif
  394. psubd m0, m2
  395. SQUARE 0, 3
  396. %if dc
  397. psllq m0, 8
  398. %else
  399. LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
  400. pmuludq m0, m5
  401. %endif
  402. %if psy
  403. test iid, iid
  404. jz %%.dc_rounding
  405. ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
  406. ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
  407. ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
  408. ; ssd1[k] -= psy_weight * psy_value;
  409. mov r6, fenc_dctm
  410. %if HIGH_BIT_DEPTH
  411. LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
  412. %else
  413. LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
  414. psrad m3, 16 ; orig_coef
  415. %endif
  416. %if cpuflag(ssse3)
  417. psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
  418. %else
  419. PSIGN d, m4, m8
  420. %endif
  421. psubd m3, m1 ; predicted_coef
  422. paddd m4, m3
  423. %if cpuflag(ssse3)
  424. pabsd m4, m4
  425. %else
  426. ABSD m3, m4
  427. SWAP 4, 3
  428. %endif
  429. LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
  430. pmuludq m1, psy_trellis
  431. pmuludq m4, m1
  432. psubq m0, m4
  433. %if %1
  434. %%.dc_rounding:
  435. %endif
  436. %endif
  437. %if %1 == 0
  438. mova [ssd], m0
  439. %endif
  440. %if dc == 0 && %1 == 0
  441. test iid, iid
  442. jnz %%.skip_dc_rounding
  443. %%.dc_rounding:
  444. ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
  445. ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
  446. ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
  447. psrad m1, 31 ; sign_coef>>31
  448. paddd m4, [pd_8]
  449. paddd m4, m1
  450. pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
  451. psubd m4, m2 ; d
  452. SQUARE 4, 3
  453. pmuludq m4, m5
  454. mova [ssd], m4
  455. %%.skip_dc_rounding:
  456. %endif
  457. mova [ssd+16], m0
  458. %assign stack_offset_bak stack_offset
  459. cmp abs_leveld, 1
  460. jl %%.switch_coef0
  461. %if %1 == 0
  462. mov r10, [ssd] ; trellis_coef* args
  463. %endif
  464. movq r12, m0
  465. ; for( int j = 0; j < 8; j++ )
  466. ; nodes_cur[j].score = TRELLIS_SCORE_MAX;
  467. %if cpuflag(ssse3)
  468. mova [nodes_curq + node_score(0)], m7
  469. mova [nodes_curq + node_score(2)], m7
  470. %else ; avoid store-forwarding stalls on k8/k10
  471. %if %1 == 0
  472. movq [nodes_curq + node_score(0)], m7
  473. %endif
  474. movq [nodes_curq + node_score(1)], m7
  475. movq [nodes_curq + node_score(2)], m7
  476. movq [nodes_curq + node_score(3)], m7
  477. %endif
  478. mova [nodes_curq + node_score(4)], m7
  479. mova [nodes_curq + node_score(6)], m7
  480. je %%.switch_coef1
  481. %%.switch_coefn:
  482. call trellis_coefn.entry%1
  483. call trellis_coefn.entry%1b
  484. jmp .i_continue1
  485. %%.switch_coef1:
  486. call trellis_coef1.entry%1
  487. call trellis_coefn.entry%1b
  488. jmp .i_continue1
  489. %%.switch_coef0:
  490. call trellis_coef0_%1
  491. call trellis_coef1.entry%1b
  492. .i_continue%1:
  493. dec iid
  494. %if num_coefs == 16 && dc == 0
  495. cmp iid, b_acm
  496. %endif
  497. jge .i_loop%1
  498. call trellis_bnode_%1
  499. %if %1 == 0
  500. %if num_coefs == 16 && dc == 0
  501. jz .return_zero
  502. %else
  503. jz .return
  504. %endif
  505. jmp .writeback
  506. %%.zero_siglast:
  507. xor r6d, r6d
  508. mov [cost_siglast+0], r6
  509. mov [cost_siglast+8], r6d
  510. jmp %%.skip_siglast
  511. %endif
  512. %endmacro ; TRELLIS_LOOP
  513. ; just a synonym for %if
  514. %macro IF0 1+
  515. %endmacro
  516. %macro IF1 1+
  517. %1
  518. %endmacro
  519. %macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
  520. ; for( int j = 0; j < 8; j++ )
  521. ; nodes_cur[j].level_idx = levels_used;
  522. ; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
  523. ; levels_used++;
  524. add levels_usedd, 3
  525. and levels_usedd, ~3 ; allow aligned stores
  526. movd m0, levels_usedd
  527. pshufd m0, m0, 0
  528. IF%1 mova m1, m0
  529. paddd m0, [pd_0123]
  530. IF%1 paddd m1, [pd_4567]
  531. mova m2, [nodes_%2q + node_level_idx(0)]
  532. IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
  533. mova [nodes_curq + node_level_idx(0)], m0
  534. IF%1 mova [nodes_curq + node_level_idx(4)], m1
  535. mova [level_tree + (levels_usedq+0)*4], m2
  536. IF%1 mova [level_tree + (levels_usedq+4)*4], m3
  537. add levels_usedd, (1+%1)*4
  538. %endmacro
  539. INIT_XMM sse2
  540. TRELLIS trellis_cabac_4x4, 16, 0, 0
  541. TRELLIS trellis_cabac_8x8, 64, 0, 0
  542. TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
  543. TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
  544. TRELLIS trellis_cabac_dc, 16, 1, 0
  545. TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
  546. INIT_XMM ssse3
  547. TRELLIS trellis_cabac_4x4, 16, 0, 0
  548. TRELLIS trellis_cabac_8x8, 64, 0, 0
  549. TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
  550. TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
  551. TRELLIS trellis_cabac_dc, 16, 1, 0
  552. TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
  553. %define stack rsp+gprsize
  554. %define scoreq r14
  555. %define bitsq r13
  556. %define bitsd r13d
  557. INIT_XMM
  558. %macro clocal 1
  559. ALIGN 16
  560. global mangle(private_prefix %+ _%1)
  561. mangle(private_prefix %+ _%1):
  562. %1:
  563. %assign stack_offset stack_offset_bak+gprsize
  564. %endmacro
  565. %macro TRELLIS_BNODE 1 ; ctx_hi
  566. clocal trellis_bnode_%1
  567. ; int j = ctx_hi?1:0;
  568. ; trellis_node_t *bnode = &nodes_cur[j];
  569. ; while( ++j < (ctx_hi?8:4) )
  570. ; if( nodes_cur[j].score < bnode->score )
  571. ; bnode = &nodes_cur[j];
  572. %assign j %1
  573. mov rax, [nodes_curq + node_score(j)]
  574. lea rax, [rax*8 + j]
  575. %rep 3+3*%1
  576. %assign j j+1
  577. mov r11, [nodes_curq + node_score(j)]
  578. lea r11, [r11*8 + j]
  579. cmp rax, r11
  580. cmova rax, r11
  581. %endrep
  582. mov r10, dctm
  583. and eax, 7
  584. ret
  585. %endmacro ; TRELLIS_BNODE
  586. TRELLIS_BNODE 0
  587. TRELLIS_BNODE 1
  588. %macro TRELLIS_COEF0 1 ; ctx_hi
  589. clocal trellis_coef0_%1
  590. ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
  591. mov r11d, [cost_siglast+0]
  592. imul r11, lambda2q
  593. shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
  594. add r11, [ssd+16]
  595. %if %1 == 0
  596. ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
  597. mov scoreq, [nodes_prevq + node_score(0)]
  598. add scoreq, [ssd]
  599. sub scoreq, r11
  600. mov [nodes_curq + node_score(0)], scoreq
  601. %endif
  602. ; memcpy
  603. mov scoreq, [nodes_prevq + node_score(1)]
  604. mov [nodes_curq + node_score(1)], scoreq
  605. mova m1, [nodes_prevq + node_score(2)]
  606. mova [nodes_curq + node_score(2)], m1
  607. %if %1
  608. mova m1, [nodes_prevq + node_score(4)]
  609. mova [nodes_curq + node_score(4)], m1
  610. mova m1, [nodes_prevq + node_score(6)]
  611. mova [nodes_curq + node_score(6)], m1
  612. %endif
  613. mov r6d, [nodes_prevq + node_cabac_state(3)]
  614. mov [nodes_curq + node_cabac_state(3)], r6d
  615. %if %1
  616. mova m1, [nodes_prevq + node_cabac_state(4)]
  617. mova [nodes_curq + node_cabac_state(4)], m1
  618. %endif
  619. ZERO_LEVEL_IDX %1, prev
  620. ret
  621. %endmacro ; TRELLIS_COEF0
  622. TRELLIS_COEF0 0
  623. TRELLIS_COEF0 1
  624. %macro START_COEF 1 ; gt1
  625. ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
  626. mov scoreq, [nodes_prevq + node_score(j)]
  627. %if j > 0
  628. test scoreq, scoreq
  629. js .ctx %+ nextj_if_invalid
  630. %endif
  631. ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
  632. %if j >= 3
  633. movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
  634. movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
  635. %else
  636. movzx r6d, byte [level_statem + coeff_abs_level1_offs]
  637. %endif
  638. %if %1
  639. xor r6d, 1
  640. %endif
  641. movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
  642. ; n.score += ssd;
  643. ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
  644. %if j == 0
  645. add scoreq, r10
  646. add bitsd, [cost_siglast+8]
  647. %else
  648. add scoreq, r12
  649. add bitsd, [cost_siglast+4]
  650. %endif
  651. %endmacro ; START_COEF
  652. %macro END_COEF 1
  653. ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
  654. imul bitsq, lambda2q
  655. shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
  656. add scoreq, bitsq
  657. ; if( n.score < nodes_cur[node_ctx].score )
  658. ; SET_LEVEL( n, abs_level );
  659. ; nodes_cur[node_ctx] = n;
  660. cmp scoreq, [nodes_curq + node_score(node_ctx)]
  661. jae .ctx %+ nextj_if_valid
  662. mov [nodes_curq + node_score(node_ctx)], scoreq
  663. %if j == 2 || (j <= 3 && node_ctx == 4)
  664. ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
  665. movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
  666. %elif j >= 3
  667. ; if we have updated before, then copy cabac_state from the parent node
  668. mov r6d, [nodes_prevq + node_cabac_state(j)]
  669. mov [nodes_curq + node_cabac_state(node_ctx)], r6d
  670. %endif
  671. %if j >= 3 ; skip the transition if we're not going to reuse the context
  672. mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
  673. %endif
  674. %if %1 && node_ctx == 7
  675. mov r6d, levelgt1_ctxm
  676. mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
  677. %endif
  678. mov r6d, [nodes_prevq + node_level_idx(j)]
  679. %if %1
  680. mov r11d, abs_leveld
  681. shl r11d, 16
  682. or r6d, r11d
  683. %else
  684. or r6d, 1<<16
  685. %endif
  686. mov [level_tree + levels_usedq*4], r6d
  687. mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
  688. inc levels_usedd
  689. %endmacro ; END_COEF
  690. %macro COEF1 2
  691. %assign j %1
  692. %assign nextj_if_valid %1+1
  693. %assign nextj_if_invalid %2
  694. %if j < 4
  695. %assign coeff_abs_level1_offs j+1
  696. %else
  697. %assign coeff_abs_level1_offs 0
  698. %endif
  699. %if j < 3
  700. %assign node_ctx j+1
  701. %else
  702. %assign node_ctx j
  703. %endif
  704. .ctx %+ j:
  705. START_COEF 0
  706. add bitsd, 1 << CABAC_SIZE_BITS
  707. END_COEF 0
  708. %endmacro ; COEF1
  709. %macro COEFN 2
  710. %assign j %1
  711. %assign nextj_if_valid %2
  712. %assign nextj_if_invalid %2
  713. %if j < 4
  714. %assign coeff_abs_level1_offs j+1
  715. %assign coeff_abs_levelgt1_offs 5
  716. %else
  717. %assign coeff_abs_level1_offs 0
  718. %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
  719. %endif
  720. %if j < 4
  721. %assign node_ctx 4
  722. %elif j < 7
  723. %assign node_ctx j+1
  724. %else
  725. %assign node_ctx 7
  726. %endif
  727. .ctx %+ j:
  728. START_COEF 1
  729. ; if( abs_level >= 15 )
  730. ; bits += bs_size_ue_big(...)
  731. add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
  732. ; n.cabac_state[levelgt1_ctx]
  733. %if j == 7 ; && compiling support for 4:2:2
  734. mov r6d, levelgt1_ctxm
  735. %define coeff_abs_levelgt1_offs r6
  736. %endif
  737. %if j == 7
  738. movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
  739. %else
  740. movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
  741. %endif
  742. ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
  743. add r10d, r1d
  744. movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
  745. add bitsd, r6d
  746. %if node_ctx == 7
  747. movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL]
  748. %endif
  749. END_COEF 1
  750. %endmacro ; COEFN
  751. clocal trellis_coef1
  752. .entry0b: ; ctx_lo, larger of the two abs_level candidates
  753. mov r10, [ssd+8]
  754. sub r10, r11
  755. mov r12, [ssd+24]
  756. sub r12, r11
  757. .entry0: ; ctx_lo, smaller of the two abs_level candidates
  758. COEF1 0, 4
  759. COEF1 1, 4
  760. COEF1 2, 4
  761. COEF1 3, 4
  762. .ctx4:
  763. rep ret
  764. .entry1b: ; ctx_hi, larger of the two abs_level candidates
  765. mov r12, [ssd+24]
  766. sub r12, r11
  767. .entry1: ; ctx_hi, smaller of the two abs_level candidates
  768. trellis_coef1_hi:
  769. COEF1 1, 2
  770. COEF1 2, 3
  771. COEF1 3, 4
  772. COEF1 4, 5
  773. COEF1 5, 6
  774. COEF1 6, 7
  775. COEF1 7, 8
  776. .ctx8:
  777. rep ret
  778. %macro COEFN_PREFIX 1
  779. ; int prefix = X264_MIN( abs_level - 1, 14 );
  780. mov r1d, abs_leveld
  781. cmp abs_leveld, 15
  782. jge .level_suffix%1
  783. xor r5d, r5d
  784. .skip_level_suffix%1:
  785. shl r1d, 7
  786. %endmacro
  787. %macro COEFN_SUFFIX 1
  788. .level_suffix%1:
  789. ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
  790. lea r5d, [abs_levelq-14]
  791. bsr r5d, r5d
  792. shl r5d, CABAC_SIZE_BITS+1
  793. add r5d, 1<<CABAC_SIZE_BITS
  794. ; int prefix = X264_MIN( abs_level - 1, 14 );
  795. mov r1d, 15
  796. jmp .skip_level_suffix%1
  797. %endmacro
  798. clocal trellis_coefn
  799. .entry0b:
  800. mov r10, [ssd+8]
  801. mov r12, [ssd+24]
  802. inc abs_leveld
  803. .entry0:
  804. ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
  805. ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
  806. ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
  807. ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
  808. ; The C version has to be fully separate since C doesn't support multiple
  809. ; entrypoints. But return-on-first-failure isn't very important here (as
  810. ; opposed to coef1), so I might as well reduce codesize.
  811. COEFN_PREFIX 0
  812. COEFN 0, 1
  813. COEFN 1, 2
  814. COEFN 2, 3
  815. COEFN 3, 8
  816. .ctx8:
  817. mov zigzagq, zigzagm ; unspill since r1 was clobbered
  818. ret
  819. .entry1b:
  820. mov r12, [ssd+24]
  821. inc abs_leveld
  822. .entry1:
  823. COEFN_PREFIX 1
  824. COEFN 4, 5
  825. COEFN 5, 6
  826. COEFN 6, 7
  827. COEFN 7, 1
  828. jmp .ctx1
  829. COEFN_SUFFIX 0
  830. COEFN_SUFFIX 1