;*****************************************************************************
;* cabac-a.asm: x86 cabac
;*****************************************************************************
;* Copyright (C) 2008-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Holger Lubitz <holger@lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 64

%if ARCH_X86_64
%macro COEFF_LAST_TABLE 4-18 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%xdefine %%funccpu1 %2 ; last4
%xdefine %%funccpu2 %3 ; last64
%xdefine %%funccpu3 %4 ; last15/last16
coeff_last_%1:
%ifdef PIC
%xdefine %%base coeff_last_%1 ; offset relative to the start of the table
%else
%xdefine %%base 0 ; absolute address
%endif
%rep 14
%ifidn %5, 4
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base
%elifidn %5, 64
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu2) - %%base
%else
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu3) - %%base
%endif
%rotate 1
%endrep
dd 0, 0 ; 64-byte alignment padding
%endmacro
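
; Each COEFF_LAST_TABLE instantiation below emits a 14-entry dword table indexed by
; ctx_block_cat, selecting the coeff_last4/15/16/64 implementation for that block
; size and cpu level. Under PIC the entries are stored as offsets relative to the
; table itself; the base is re-added at load time (see the COEFF_LAST macro below).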
cextern coeff_last4_mmx2
cextern coeff_last4_lzcnt
%if HIGH_BIT_DEPTH
cextern coeff_last4_avx512
%endif
cextern coeff_last15_sse2
cextern coeff_last15_lzcnt
cextern coeff_last15_avx512
cextern coeff_last16_sse2
cextern coeff_last16_lzcnt
cextern coeff_last16_avx512
cextern coeff_last64_sse2
cextern coeff_last64_lzcnt
cextern coeff_last64_avx2
cextern coeff_last64_avx512

COEFF_LAST_TABLE sse2, mmx2, sse2, sse2
COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, lzcnt
COEFF_LAST_TABLE avx2, lzcnt, avx2, lzcnt
%if HIGH_BIT_DEPTH
COEFF_LAST_TABLE avx512, avx512, avx512, avx512
%else
COEFF_LAST_TABLE avx512, lzcnt, avx512, avx512
%endif
%endif

coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0
coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9
coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
db 4, 4, 4, 4, 5, 6, 7, 7
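
; Context tables for the level loops below: coeff_abs_level1_ctx and
; coeff_abs_levelgt1_ctx map the running node_ctx to the context index used for the
; "level > 1" and level-prefix decisions, and coeff_abs_level_transition holds the
; next node_ctx (first row after coding a level of 1, second row after a level > 1).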
SECTION .text

cextern_common cabac_range_lps
cextern_common cabac_transition
cextern_common cabac_renorm_shift
cextern_common cabac_entropy
cextern cabac_size_unary
cextern cabac_transition_unary
cextern_common significant_coeff_flag_offset
cextern_common significant_coeff_flag_offset_8x8
cextern_common last_coeff_flag_offset
cextern_common last_coeff_flag_offset_8x8
cextern_common coeff_abs_level_m1_offset
cextern_common count_cat_m1
cextern cabac_encode_ue_bypass

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

struc cb
.low: resd 1
.range: resd 1
.queue: resd 1
.bytes_outstanding: resd 1
.start: pointer 1
.p: pointer 1
.end: pointer 1
align 64, resb 1
.bits_encoded: resd 1
.state: resb 1024
endstruc
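
; The cb struc above mirrors the layout of x264_cabac_t (common/cabac.h); these
; offsets must stay in sync with the C struct. Roughly (a sketch, not the exact header):
;     typedef struct
;     {
;         int      i_low, i_range;
;         int      i_queue, i_bytes_outstanding;
;         uint8_t *p_start, *p, *p_end;
;         /* 64-byte aligned from here on */
;         int      f8_bits_encoded;
;         uint8_t  state[1024];
;     } x264_cabac_t;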
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
%ifdef PIC
%ifidn %4, 0
movzx %1, byte [%2+%3+r7-$$]
%else
lea %5, [r7+%4]
movzx %1, byte [%2+%3+%5-$$]
%endif
%else
movzx %1, byte [%2+%3+%4]
%endif
%endmacro
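
; LOAD_GLOBAL reads one byte from a global table in a position-independent way:
; under PIC the caller keeps the runtime address of $$ (the section start) in r7,
; so [base+off1+r7-$$] becomes a register base plus a link-time constant offset
; rather than an absolute address; otherwise it falls back to absolute addressing.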
%macro CABAC 1
; t3 must be ecx, since it's used for shift.
%if WIN64
DECLARE_REG_TMP 3,1,2,0,5,6,4,4
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
%else
DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%endif

cglobal cabac_encode_decision_%1, 1,7
movifnidn t1d, r1m
mov t5d, [r0+cb.range]
movzx t6d, byte [r0+cb.state+t1]
movifnidn t0, r0 ; WIN64
mov t4d, ~1
mov t3d, t5d
and t4d, t6d
shr t5d, 6
movifnidn t2d, r2m
%if WIN64
PUSH r7
%endif
%ifdef PIC
lea r7, [$$]
%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
and t6d, 1
sub t3d, t5d
cmp t6d, t2d
mov t6d, [t0+cb.low]
lea t2, [t6+t3]
cmovne t3d, t5d
cmovne t6d, t2d
mov [t0+cb.state+t1], t4b
;cabac_encode_renorm
mov t4d, t3d
%ifidn %1, bmi2
lzcnt t3d, t3d
sub t3d, 23
shlx t4d, t4d, t3d
shlx t6d, t6d, t3d
%else
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, t3
shl t4d, t3b
shl t6d, t3b
%endif
%if WIN64
POP r7
%endif
mov [t0+cb.range], t4d
add t3d, [t0+cb.queue]
jge cabac_putbyte_%1
.update_queue_low:
mov [t0+cb.low], t6d
mov [t0+cb.queue], t3d
RET
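
; Roughly the C logic of cabac_encode_decision above (a sketch in the spirit of
; x264_cabac_encode_decision in common/cabac.c, not the exact source):
;     int state     = cb->state[ctx];
;     int range_lps = x264_cabac_range_lps[state>>1][(cb->i_range>>6)&3];
;     cb->i_range  -= range_lps;
;     if( b != (state&1) )              /* coded bin is not the MPS */
;     {
;         cb->i_low  += cb->i_range;
;         cb->i_range = range_lps;
;     }
;     cb->state[ctx] = x264_cabac_transition[state][b];
;     /* renorm: shift range/low left until range >= 0x100, add the shift to
;      * cb->i_queue, and branch to cabac_putbyte once the queue goes non-negative. */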
cglobal cabac_encode_bypass_%1, 2,3
mov t7d, [r0+cb.low]
and r1d, [r0+cb.range]
lea t7d, [t7*2+r1]
movifnidn t0, r0 ; WIN64
mov t3d, [r0+cb.queue]
inc t3d
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
jge cabac_putbyte_%1
%else
jge .putbyte
%endif
mov [t0+cb.low], t7d
mov [t0+cb.queue], t3d
RET
%if ARCH_X86_64 == 0
.putbyte:
PROLOGUE 0,7
movifnidn t6d, t7d
jmp cabac_putbyte_%1
%endif

%ifnidn %1,bmi2
cglobal cabac_encode_terminal_%1, 1,3
sub dword [r0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
test dword [r0+cb.range], 0x100
je .renorm
RET
.renorm:
shl dword [r0+cb.low], 1
shl dword [r0+cb.range], 1
inc dword [r0+cb.queue]
jge .putbyte
RET
.putbyte:
PROLOGUE 0,7
movifnidn t0, r0 ; WIN64
mov t3d, [r0+cb.queue]
mov t6d, [t0+cb.low]
%endif

cabac_putbyte_%1:
; alive: t0=cb t3=queue t6=low
%if WIN64
DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
%ifidn %1, bmi2
add t3d, 10
shrx t2d, t6d, t3d
bzhi t6d, t6d, t3d
sub t3d, 18
%else
mov t1d, -1
add t3d, 10
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
not t1d
sub t3d, 18
and t6d, t1d
%endif
mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
jz .postpone
mov t1, [t0+cb.p]
add [t1-1], t2h
dec t2h
.loop_outstanding:
mov [t1], t2h
inc t1
dec t5d
jge .loop_outstanding
mov [t1-1], t2b
mov [t0+cb.p], t1
.postpone:
inc t5d
mov [t0+cb.bytes_outstanding], t5d
jmp mangle(private_prefix %+ _cabac_encode_decision_%1.update_queue_low)
%endmacro
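
; cabac_putbyte_* flushes one byte of cb->low once the queue counter goes
; non-negative, propagating any carry through pending 0xff bytes. Roughly
; (a sketch, not the exact C source):
;     int out = cb->i_low >> (cb->i_queue+10);
;     cb->i_low &= (0x400 << cb->i_queue) - 1;
;     cb->i_queue -= 8;
;     if( (out & 0xff) == 0xff )
;         cb->i_bytes_outstanding++;        /* 0xff may still be bumped by a later carry */
;     else
;     {
;         int carry = out >> 8;
;         cb->p[-1] += carry;               /* propagate the carry into the last byte */
;         for( int i = cb->i_bytes_outstanding; i > 0; i-- )
;             *(cb->p++) = carry - 1;       /* 0x00 if there was a carry, 0xff if not */
;         *(cb->p++) = out & 0xff;
;         cb->i_bytes_outstanding = 0;
;     }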
CABAC asm
CABAC bmi2

; %1 = label name
; %2 = node_ctx init?
%macro COEFF_ABS_LEVEL_GT1 2
%if %2
%define ctx 1
%else
movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
%define ctx r11
%endif
movzx r9d, byte [r8+ctx]
; if( coeff_abs > 1 )
cmp r1d, 1
jg .%1_gt1
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
lea r0d, [r0+r9+256]
mov [r8+ctx], r10b
%if %2
mov r2d, 1
%else
movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
%endif
jmp .%1_end
.%1_gt1:
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
xor r9d, 1
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r8+ctx], r10b
add r0d, r9d
%if %2
%define ctx 5
%else
movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
%define ctx r11
%endif
; if( coeff_abs < 15 )
cmp r1d, 15
jge .%1_escape
shl r1d, 7
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
movzx r9d, byte [r8+ctx]
add r9d, r1d
movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
mov [r8+ctx], r10b
add r0d, r9d
jmp .%1_gt1_end
.%1_escape:
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
movzx r9d, byte [r8+ctx]
movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
add r0d, r9d
mov [r8+ctx], r10b
sub r1d, 14
%if cpuflag(lzcnt)
lzcnt r9d, r1d
xor r9d, 0x1f
%else
bsr r9d, r1d
%endif
; bs_size_ue_big(coeff_abs-15)<<8
shl r9d, 9
; (ilog2(coeff_abs-14)+1) << 8
lea r0d, [r0+r9+256]
.%1_gt1_end:
%if %2
mov r2d, 4
%else
movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
%endif
.%1_end:
%endmacro
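
; COEFF_ABS_LEVEL_GT1 is the cost-only (RD) flavor of level coding: instead of
; emitting bins it accumulates their 8.8 fixed-point sizes (x264_cabac_entropy,
; x264_cabac_size_unary, plus an ilog2-based estimate for the bypass escape) into
; r0d, which holds cb->f8_bits_encoded, while updating the context states the same
; way the real encoding path would.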
%macro LOAD_DCTCOEF 1
%if HIGH_BIT_DEPTH
mov %1, [dct+r6*4]
%else
movzx %1, word [dct+r6*2]
%endif
%endmacro

%macro ABS_DCTCOEFS 2
%if HIGH_BIT_DEPTH
%define %%abs ABSD
%else
%define %%abs ABSW
%endif
%if mmsize == %2*SIZEOF_DCTCOEF
%%abs m0, [%1], m1
mova [rsp], m0
%elif mmsize == %2*SIZEOF_DCTCOEF/2
%%abs m0, [%1+0*mmsize], m2
%%abs m1, [%1+1*mmsize], m3
mova [rsp+0*mmsize], m0
mova [rsp+1*mmsize], m1
%else
%assign i 0
%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
%%abs m0, [%1+(4*i+0)*mmsize], m4
%%abs m1, [%1+(4*i+1)*mmsize], m5
%%abs m2, [%1+(4*i+2)*mmsize], m4
%%abs m3, [%1+(4*i+3)*mmsize], m5
mova [rsp+(4*i+0)*mmsize], m0
mova [rsp+(4*i+1)*mmsize], m1
mova [rsp+(4*i+2)*mmsize], m2
mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
%endif
%endmacro

%macro SIG_OFFSET 1
%if %1
movzx r11d, byte [r4+r6]
%endif
%endmacro

%macro LAST_OFFSET 1
%if %1
movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
%endif
%endmacro

%macro COEFF_LAST 2 ; table, ctx_block_cat
%ifdef PIC
lea r1, [%1 GLOBAL]
movsxd r6, [r1+4*%2]
add r6, r1
%else
movsxd r6, [%1+4*%2]
%endif
call r6
%endmacro
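
; COEFF_LAST fetches the per-ctx_block_cat entry from one of the coeff_last_*
; tables defined in the data section (re-adding the table base when the entries
; are PIC-relative offsets) and calls it; the callee returns the index of the
; last nonzero coefficient in eax (r6).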
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
; int ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------

;%1 = 8x8 mode
%macro CABAC_RESIDUAL_RD 2
%if %1
%define func cabac_block_residual_8x8_rd_internal
%define maxcoeffs 64
%define dct rsp
%else
%define func cabac_block_residual_rd_internal
%define maxcoeffs 16
%define dct r4
%endif
%ifdef PIC
cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
%else
cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
%define GLOBAL
%endif
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r4 = sig offset 8x8
%endif
add r1d, r2d
movzx r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig
movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last
movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level

; abs() all the coefficients; copy them to the stack to avoid
; changing the originals.
; overreading is okay; it's all valid aligned data anyways.
%if %1
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
ABS_DCTCOEFS r4, 16
xor r4, r0 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif

; for improved OOE performance, run coeff_last on the original coefficients.
COEFF_LAST %2, r2 ; coeff_last[ctx_block_cat]( dct )

; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded

; pre-add some values to simplify addressing
add r3, cb.state
add r5, r3
add r7, r3
add r8, r3 ; precalculate cabac state pointers

; if( last != count_cat_m1[ctx_block_cat] )
%if %1
cmp r6b, 63
%else
cmp r6b, [count_cat_m1+r2 GLOBAL]
%endif
je .skip_last_sigmap

; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
; so we'll use r11 for this.
%if %1
%define siglast_ctx r11
%else
%define siglast_ctx r6
%endif

; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
SIG_OFFSET %1
movzx r1d, byte [r5+siglast_ctx]
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
xor r1d, 1
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
mov [r5+siglast_ctx], r9b
add r0d, r1d
LAST_OFFSET %1
movzx r1d, byte [r7+siglast_ctx]
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
xor r1d, 1
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
mov [r7+siglast_ctx], r9b
add r0d, r1d
.skip_last_sigmap:
LOAD_DCTCOEF r1d
COEFF_ABS_LEVEL_GT1 last, 1

; for( int i = last-1 ; i >= 0; i-- )
dec r6d
jl .end
.coeff_loop:
LOAD_DCTCOEF r1d
; if( l[i] )
SIG_OFFSET %1
movzx r9d, byte [r5+siglast_ctx]
test r1d, r1d
jnz .coeff_nonzero
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r5+siglast_ctx], r10b
add r0d, r9d
dec r6d
jge .coeff_loop
jmp .end
.coeff_nonzero:
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
xor r9d, 1
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r5+siglast_ctx], r10b
add r0d, r9d
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
LAST_OFFSET %1
movzx r9d, byte [r7+siglast_ctx]
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r7+siglast_ctx], r10b
add r0d, r9d
COEFF_ABS_LEVEL_GT1 coeff, 0
dec r6d
jge .coeff_loop
.end:
mov [r3+cb.bits_encoded-cb.state], r0d
RET
%endmacro

%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx512
%endif
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
%endif

;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
; int ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------

%macro CALL_CABAC 0
%if cpuflag(bmi2)
call cabac_encode_decision_bmi2
%else
call cabac_encode_decision_asm
%endif
%if WIN64 ; move cabac back
mov r0, r3
%endif
%endmacro

; %1 = 8x8 mode
; %2 = dct register
; %3 = countcat
; %4 = name
%macro SIGMAP_LOOP 3-4
.sigmap_%4loop:
%if HIGH_BIT_DEPTH
mov %2, [dct+r10*4]
%else
movsx %2, word [dct+r10*2]
%endif
%if %1
movzx r1d, byte [sigoff_8x8 + r10]
add r1d, sigoffd
%else
lea r1d, [sigoffd + r10d]
%endif
test %2, %2
jz .sigmap_%4zero ; if( l[i] )
inc coeffidxd
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i];
mov r2d, 1
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
%if %1
movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
add r1d, lastoffd
%else
lea r1d, [lastoffd + r10d]
%endif
cmp r10d, lastm ; if( i == last )
je .sigmap_%4last
xor r2d, r2d
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
jmp .sigmap_%4loop_endcheck
.sigmap_%4zero:
xor r2d, r2d
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
.sigmap_%4loop_endcheck:
inc r10d
cmp r10d, %3
jne .sigmap_%4loop ; if( ++i == count_m1 )
%if HIGH_BIT_DEPTH
mov %2, [dct+r10*4]
%else
movsx %2, word [dct+r10*2]
%endif
inc coeffidxd
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]
jmp .sigmap_%4end
.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
mov r2d, 1
CALL_CABAC
.sigmap_%4end:
%if %1==0
jmp .level_loop_start
%endif
%endmacro
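
; Rough C equivalent of the significance-map pass above (a sketch; sig_off()/last_off()
; stand for the plain scan index in 4x4 mode and the 8x8 offset tables in 8x8 mode):
;     for( int i = 0; i < count_m1; i++ )
;     {
;         if( l[i] )
;         {
;             coeffs[++coeff_idx] = l[i];
;             x264_cabac_encode_decision( cb, ctx_sig + sig_off(i), 1 );
;             if( i == last )
;             {
;                 x264_cabac_encode_decision( cb, ctx_last + last_off(i), 1 );
;                 goto level_loop;             /* early out: no trailing store */
;             }
;             x264_cabac_encode_decision( cb, ctx_last + last_off(i), 0 );
;         }
;         else
;             x264_cabac_encode_decision( cb, ctx_sig + sig_off(i), 0 );
;     }
;     coeffs[++coeff_idx] = l[count_m1];       /* only reached when last == count_m1 */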
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15,0,-4*64
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
%define lastm [rsp+4*1]
%define GLOBAL +r7-$$
%else
%define lastm r7d
%define GLOBAL
%endif
shl r1d, 4

%define sigoffq r8
%define sigoffd r8d
%define lastoffq r9
%define lastoffd r9d
%define leveloffq r10
%define leveloffd r10d
%define leveloffm [rsp+4*0]
%define countcatd r11d
%define sigoff_8x8 r12
%define coeffidxq r13
%define coeffidxd r13d
%define dct r14
%define coeffs rsp+4*2

lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
add r1d, r2d
movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
mov coeffidxd, -1
mov dct, r0
mov leveloffm, leveloffd

COEFF_LAST %1, r2
mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
mov r0, r3

xor r10d, r10d
cmp countcatd, 63
je .sigmap_8x8
SIGMAP_LOOP 0, r12d, countcatd,
.sigmap_8x8:
SIGMAP_LOOP 1, r11d, 63, _8x8

.level_loop_start:
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
%define nodectxq r8
%define nodectxd r8d
mov leveloffd, leveloffm
xor nodectxd, nodectxd
.level_loop:
mov r9d, [coeffs+coeffidxq*4]
mov r11d, r9d
sar r11d, 31
add r9d, r11d
movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
xor r9d, r11d
add r1d, leveloffd
cmp r9d, 1
jg .level_gt1
xor r2d, r2d
CALL_CABAC
movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
jmp .level_sign
.level_gt1:
mov r2d, 1
CALL_CABAC
movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
add r14d, leveloffd
cmp r9d, 15
mov r12d, 15
cmovl r12d, r9d
sub r12d, 2
jz .level_eq2
.level_gt1_loop:
mov r1d, r14d
mov r2d, 1
CALL_CABAC
dec r12d
jg .level_gt1_loop
cmp r9d, 15
jge .level_bypass
.level_eq2:
mov r1d, r14d
xor r2d, r2d
CALL_CABAC
jmp .level_gt1_end
.level_bypass:
lea r2d, [r9d-15]
xor r1d, r1d
push r0
; we could avoid this if we implemented it in asm, but I don't feel like that
; right now.
%if UNIX64
push r7
push r8
%else
sub rsp, 40 ; shadow space and alignment
%endif
call cabac_encode_ue_bypass
%if UNIX64
pop r8
pop r7
%else
add rsp, 40
%endif
pop r0
.level_gt1_end:
movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
mov r1d, r11d
%if cpuflag(bmi2)
call cabac_encode_bypass_bmi2
%else
call cabac_encode_bypass_asm
%endif
%if WIN64
mov r0, r3
%endif
dec coeffidxd
jge .level_loop
RET
%endmacro
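
; Rough C equivalent of the .level_loop above (a sketch, not the exact C source):
;     for( int idx = coeff_idx; idx >= 0; idx-- )
;     {
;         int coeff = coeffs[idx], abs_coeff = abs( coeff );
;         x264_cabac_encode_decision( cb, ctx_level + coeff_abs_level1_ctx[node_ctx], abs_coeff > 1 );
;         if( abs_coeff > 1 )
;         {
;             int ctx = ctx_level + coeff_abs_levelgt1_ctx[node_ctx];
;             for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- )
;                 x264_cabac_encode_decision( cb, ctx, 1 );
;             if( abs_coeff < 15 )
;                 x264_cabac_encode_decision( cb, ctx, 0 );
;             else
;                 x264_cabac_encode_ue_bypass( cb, 0, abs_coeff - 15 );
;             node_ctx = coeff_abs_level_transition[8+node_ctx];
;         }
;         else
;             node_ctx = coeff_abs_level_transition[node_ctx];
;         x264_cabac_encode_bypass( cb, coeff < 0 );   /* sign bit */
;     }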
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM lzcnt
CABAC_RESIDUAL coeff_last_lzcnt
INIT_XMM avx2
CABAC_RESIDUAL coeff_last_avx2
INIT_XMM avx512
CABAC_RESIDUAL coeff_last_avx512
%endif