bitstream-a.asm

;*****************************************************************************
;* bitstream-a.asm: x86 bitstream functions
;*****************************************************************************
;* Copyright (C) 2010-2018 x264 project
;*
;* Authors: Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
;-----------------------------------------------------------------------------
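; A rough C-level sketch of the escaping rule implemented below, kept here as a
; comment for reference (illustrative only; the helper name nal_escape_ref and
; its exact control flow are assumptions, not x264's actual C fallback):
; whenever two consecutive 0x00 bytes would be followed by a byte <= 0x03, an
; emulation-prevention byte 0x03 is inserted before that byte.
;
;   static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
;   {
;       int zeros = 0;                    /* run length of trailing 0x00 bytes */
;       while( src < end )
;       {
;           if( zeros >= 2 && *src <= 0x03 )
;           {
;               *dst++ = 0x03;            /* break up 00 00 00/01/02/03 */
;               zeros = 0;
;           }
;           zeros = *src ? 0 : zeros+1;
;           *dst++ = *src++;
;       }
;       return dst;                       /* new end of dst, as the asm returns */
;   }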
%macro NAL_LOOP 2
%%escape:
    ; Detect false positive to avoid unnecessary escape loop
    xor      r3d, r3d
    cmp byte [r0+r1-1], 0
    setnz    r3b
    xor      k3, k4
    jnz .escape
    jmp %%continue
ALIGN 16
%1:
    mova [r0+r1+mmsize], m1
    pcmpeqb   m1, m0
    mova [r0+r1], m2
    pcmpeqb   m2, m0
    pmovmskb r3d, m1
    %2        m1, [r1+r2+3*mmsize]
    pmovmskb r4d, m2
    %2        m2, [r1+r2+2*mmsize]
    shl       k3, mmsize
    or        k3, k4
    lea       k4, [2*r3+1]
    and       k4, k3
    jnz %%escape
%%continue:
    add r1, 2*mmsize
    jl  %1
%endmacro
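
; How the vector loop in NAL_LOOP spots candidate escapes (a worked example,
; assuming mmsize == 16, i.e. 32 source bytes per iteration): pcmpeqb against
; the all-zero register m0 plus pmovmskb turns each chunk into a bitmask with
; bit i set iff byte i was 0x00; the two chunk masks are merged into k3. Then
;     k4 = (k3 << 1) | 1   ; "previous byte was zero", with bit 0 optimistically
;                          ; assuming the byte just before this block was zero
;     k4 &= k3             ; nonzero iff some zero byte follows another zero byte
; For bytes 41 00 00 7F at positions 0..3, k3 = 0110b, (k3 << 1) | 1 = 1101b and
; the AND gives 0100b != 0, so control leaves the vector loop; %%escape then
; re-checks the byte before the block to drop the optimistic bit-0 case, and the
; scalar escape loop applies the full "two zeros followed by a byte <= 3" test.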

%macro NAL_ESCAPE 0
%if mmsize == 32
    %xdefine k3 r3
    %xdefine k4 r4
%else
    %xdefine k3 r3d
    %xdefine k4 r4d
%endif

cglobal nal_escape, 3,5
    movzx r3d, byte [r1]
    sub   r1, r2          ; r1 = offset of current src pointer from end of src
    pxor  m0, m0
    mov [r0], r3b
    sub   r0, r1          ; r0 = projected end of dst, assuming no more escapes
    or    r3d, 0xffffff00 ; ignore data before src

    ; Start off by jumping into the escape loop in case there's an escape at the start,
    ; and handle a few more bytes in scalar code until dst is aligned.
    jmp .escape_loop

%if mmsize == 16
    NAL_LOOP .loop_aligned, mova
    jmp .ret
%endif
    NAL_LOOP .loop_unaligned, movu

.ret:
    movifnidn rax, r0
    RET

.escape:
    ; Skip bytes that are known to be valid
    and   k4, k3
    tzcnt k4, k4
    xor   r3d, r3d        ; the last two bytes are known to be zero
    add   r1, r4
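    ; Scalar fallback: r3d is kept as a 3-byte sliding window of the most recent
    ; source bytes, (prev2 << 16) | (prev1 << 8) | current. The test against
    ; 0xfffffc below is zero exactly when prev2 == 0, prev1 == 0 and the current
    ; byte is <= 3, e.g. for the window 0x000002: 0x000002 & 0xfffffc = 0, so an
    ; emulation-prevention 0x03 byte gets inserted before the 0x02.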
.escape_loop:
    inc   r1
    jge .ret
    movzx r4d, byte [r1+r2]
    shl   r3d, 8
    or    r3d, r4d
    test  r3d, 0xfffffc   ; if the last two bytes are 0 and the current byte is <=3
    jz .add_escape_byte
.escaped:
    lea   r4d, [r0+r1]
    mov [r0+r1], r3b
    test  r4d, mmsize-1   ; Do SIMD when dst is aligned
    jnz .escape_loop
    movu  m1, [r1+r2+mmsize]
    movu  m2, [r1+r2]
%if mmsize == 16
    lea   r4d, [r1+r2]
    test  r4d, mmsize-1
    jz .loop_aligned
%endif
    jmp .loop_unaligned

.add_escape_byte:
    mov byte [r0+r1], 3
    inc   r0
    or    r3d, 0x0300
    jmp .escaped
%endmacro

INIT_MMX mmx2
NAL_ESCAPE
INIT_XMM sse2
NAL_ESCAPE
%if ARCH_X86_64
INIT_YMM avx2
NAL_ESCAPE
%endif