asm.S 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. /*****************************************************************************
  2. * asm.S: AArch64 utility macros
  3. *****************************************************************************
  4. * Copyright (C) 2008-2018 x264 project
  5. *
  6. * Authors: Mans Rullgard <mans@mansr.com>
  7. * David Conrad <lessen42@gmail.com>
  8. * Janne Grunau <janne-x264@jannau.net>
  9. *
  10. * This program is free software; you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License as published by
  12. * the Free Software Foundation; either version 2 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * This program is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with this program; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
  23. *
  24. * This program is also available under a commercial proprietary license.
  25. * For more information, contact us at licensing@x264.com.
  26. *****************************************************************************/
  27. #include "config.h"
  // Two-level token pasting: JOIN macro-expands both arguments first, then
  // GLUE concatenates the expanded results with the ## operator.
  #define GLUE(lhs, rhs) lhs ## rhs
  #define JOIN(lhs, rhs) GLUE(lhs, rhs)
  // Symbol prefixes. With PREFIX defined, both the x264 base name and the
  // generic symbol prefix gain a leading underscore; otherwise BASE is the
  // plain x264_ stem and SYM_PREFIX is empty.
  #if defined(PREFIX)
  #   define BASE       _x264_
  #   define SYM_PREFIX _
  #else
  #   define BASE       x264_
  #   define SYM_PREFIX
  #endif
  // EXTERN_ASM is the prefix used for symbol names built by X() below.
  // Without BIT_DEPTH it is just BASE; with BIT_DEPTH it is BASE followed by
  // the BIT_DEPTH value and an underscore.
  #if !defined(BIT_DEPTH)
  #   define EXTERN_ASM BASE
  #else
  #   define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
  #endif
  // Symbol-name builders:
  //   X(sym)    - EXTERN_ASM prefix + sym
  //   X264(sym) - BASE prefix + sym
  //   EXT(sym)  - SYM_PREFIX + sym
  #define X(sym)    JOIN(EXTERN_ASM, sym)
  #define X264(sym) JOIN(BASE, sym)
  #define EXT(sym)  JOIN(SYM_PREFIX, sym)
  // Line-prefix switches. Each one expands to nothing when its feature is
  // present, so the directive written after it is assembled normally.
  // Otherwise it expands to a lone hash sign, which GNU as treats as the start
  // of a comment line, so the directive is dropped.

  // ELF: directives that only make sense for ELF object files.
  #if defined(__ELF__)
  #   define ELF
  #else
  #   define ELF  #
  #endif

  // MACH: directives that only make sense for Mach-O object files.
  #if defined(__MACH__)
  #   define MACH
  #else
  #   define MACH #
  #endif

  // FUNC: guards .func/.endfunc. HAVE_AS_FUNC is presumably supplied by
  // config.h - TODO confirm.
  #if HAVE_AS_FUNC
  #   define FUNC
  #else
  #   define FUNC #
  #endif
  // function name, export=0, align=2
  //   Opens a function in the .text section.
  //     name   - symbol name. When export is non-zero the EXTERN_ASM prefix
  //              is placed in front of it and the symbol is made .global.
  //     export - 0 (default) emits the bare, non-global label.
  //     align  - operand handed to .align (default 2).
  //   Defines a one-shot endfunc macro that closes the function: it records
  //   the symbol size (ELF only), emits .endfunc when the assembler supports
  //   it, and then purges itself so the next function can define it again.
  .macro  function name, export=0, align=2
      // Build the matching endfunc while name and export are still known.
      .macro endfunc
      .if \export
  ELF     .size   EXTERN_ASM\name, . - EXTERN_ASM\name
      .else
  ELF     .size   \name, . - \name
      .endif
  FUNC    .endfunc
          .purgem endfunc
      .endm

          .text
          .align  \align
      .if \export
          .global EXTERN_ASM\name
  ELF     .type   EXTERN_ASM\name, %function
  FUNC    .func   EXTERN_ASM\name
  EXTERN_ASM\name:
      .else
  ELF     .type   \name, %function
  FUNC    .func   \name
  \name:
      .endif
  .endm
  // const name, align=2
  //   Opens a read-only data object labelled name.
  //     name  - label placed at the start of the data.
  //     align - operand handed to .align (default 2).
  //   The section is chosen per object format: .rodata on ELF, .const_data on
  //   Mach-O, and .rdata on Windows (COFF). Without the Windows case no section
  //   directive would be emitted there and the data would be placed in
  //   whatever section was current, typically .text.
  //   Defines a one-shot endconst macro that records the object size (ELF
  //   only) and purges itself so the next const can define it again.
  .macro  const   name, align=2
      .macro endconst
  ELF     .size   \name, . - \name
          .purgem endconst
      .endm

  ELF     .section        .rodata
  MACH    .const_data
  #if defined(_WIN32)
          .section        .rdata
  #endif
          .align          \align
  \name:
  .endm
  // movrel rd, val
  //   Loads the address of symbol val into register rd.
  //     non-PIC      - literal load through the ldr pseudo-instruction
  //     PIC on Apple - adrp/add pair using the @PAGE and @PAGEOFF modifiers
  //     other PIC    - adrp/add pair using the :lo12: modifier
  .macro  movrel rd, val
  #if !defined(PIC)
          ldr     \rd, =\val
  #elif defined(__APPLE__)
          adrp    \rd, \val@PAGE
          add     \rd, \rd, \val@PAGEOFF
  #else
          adrp    \rd, \val
          add     \rd, \rd, :lo12:\val
  #endif
  .endm
  // NOTE(review): presumably the row strides of the decoded (FDEC) and encoded
  // (FENC) block caches; they are expected to match the C-side definitions -
  // TODO confirm against the C headers.
  #define FDEC_STRIDE 32
  #define FENC_STRIDE 16
  // SUMSUB_AB sum, sub, a, b
  //   sum = a + b, then sub = a - b.
  //   The add is issued first, so sum must not be the same register as a or b;
  //   otherwise the subtraction would read the already overwritten value.
  .macro  SUMSUB_AB sum, sub, a, b
          add     \sum, \a, \b
          sub     \sub, \a, \b
  .endm
  // unzip t1, t2, s1, s2
  //   De-interleaves the s1:s2 pair with uzp1 into t1 and uzp2 into t2.
  //   t1 is written before s1 and s2 are read the second time, so t1 must not
  //   alias either source.
  .macro  unzip t1, t2, s1, s2
          uzp1    \t1, \s1, \s2
          uzp2    \t2, \s1, \s2
  .endm
  // transpose t1, t2, s1, s2
  //   One transpose step: trn1 of s1 and s2 goes to t1, trn2 of s1 and s2 goes
  //   to t2. Operands are passed with their arrangement suffix attached.
  //   t2 may alias a source, because it is written last. t1 must not, because
  //   the sources are read again after it is written.
  .macro  transpose t1, t2, s1, s2
          trn1    \t1, \s1, \s2
          trn2    \t2, \s1, \s2
  .endm
  // transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
  //   In-place transpose of a 4x4 matrix of 16-bit elements held in the low
  //   64 bits of v0-v3 (one row per register). t0-t3 are scratch registers.
  //   Register names are passed without an arrangement suffix.
  .macro  transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
          // Stage 1: swap 32-bit pairs between rows 0/2 and rows 1/3.
          transpose   \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
          transpose   \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
          // Stage 2: swap 16-bit elements to finish the transpose.
          transpose   \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
          transpose   \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
  .endm
  // transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
  //   128-bit variant of transpose4x4.h: the low and the high 64-bit halves of
  //   v0-v3 are each transposed in place as an independent 4x4 matrix of
  //   16-bit elements. t0-t3 are scratch registers.
  //   Register names are passed without an arrangement suffix.
  .macro  transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
          // Stage 1: swap 32-bit pairs between rows 0/2 and rows 1/3.
          transpose   \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
          transpose   \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
          // Stage 2: swap 16-bit elements to finish the transpose.
          transpose   \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
          transpose   \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
  .endm
  // transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
  //   In-place transpose of an 8x8 matrix of 16-bit elements held in r0-r7
  //   (one row per 128-bit register). r8 and r9 are scratch registers.
  //   Register names are passed without an arrangement suffix.
  //   The instruction order is significant: several registers are reused as
  //   both source and destination between stages.
  .macro  transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
          // Stage 1: interleave 16-bit elements of adjacent row pairs.
          trn1    \r8\().8h, \r0\().8h, \r1\().8h
          trn2    \r9\().8h, \r0\().8h, \r1\().8h
          trn1    \r1\().8h, \r2\().8h, \r3\().8h
          trn2    \r3\().8h, \r2\().8h, \r3\().8h
          trn1    \r0\().8h, \r4\().8h, \r5\().8h
          trn2    \r5\().8h, \r4\().8h, \r5\().8h
          trn1    \r2\().8h, \r6\().8h, \r7\().8h
          trn2    \r7\().8h, \r6\().8h, \r7\().8h

          // Stage 2: interleave 32-bit pairs.
          trn1    \r4\().4s, \r0\().4s, \r2\().4s
          trn2    \r2\().4s, \r0\().4s, \r2\().4s
          trn1    \r6\().4s, \r5\().4s, \r7\().4s
          trn2    \r7\().4s, \r5\().4s, \r7\().4s
          trn1    \r5\().4s, \r9\().4s, \r3\().4s
          trn2    \r9\().4s, \r9\().4s, \r3\().4s
          trn1    \r3\().4s, \r8\().4s, \r1\().4s
          trn2    \r8\().4s, \r8\().4s, \r1\().4s

          // Stage 3: interleave 64-bit halves; every output row is final here.
          trn1    \r0\().2d, \r3\().2d, \r4\().2d
          trn2    \r4\().2d, \r3\().2d, \r4\().2d
          trn1    \r1\().2d, \r5\().2d, \r6\().2d
          trn2    \r5\().2d, \r5\().2d, \r6\().2d
          // trn2 comes first for this pair: r2 is a source of both
          // instructions and the destination of the trn1.
          trn2    \r6\().2d, \r8\().2d, \r2\().2d
          trn1    \r2\().2d, \r8\().2d, \r2\().2d
          trn1    \r3\().2d, \r9\().2d, \r7\().2d
          trn2    \r7\().2d, \r9\().2d, \r7\().2d
  .endm
  // transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
  //   In-place byte transpose of r0-r7. The low and the high 64-bit halves of
  //   the eight registers are each treated as an independent 8x8 byte matrix
  //   (one row per register). t0 and t1 are scratch registers.
  //   Register names are passed without an arrangement suffix.
  //   The instruction order is significant: several registers are reused as
  //   both source and destination between stages.
  .macro  transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
          // Stage 1: interleave bytes of adjacent row pairs.
          trn1    \t0\().16b, \r0\().16b, \r1\().16b
          trn2    \t1\().16b, \r0\().16b, \r1\().16b
          trn1    \r1\().16b, \r2\().16b, \r3\().16b
          trn2    \r3\().16b, \r2\().16b, \r3\().16b
          trn1    \r0\().16b, \r4\().16b, \r5\().16b
          trn2    \r5\().16b, \r4\().16b, \r5\().16b
          trn1    \r2\().16b, \r6\().16b, \r7\().16b
          trn2    \r7\().16b, \r6\().16b, \r7\().16b

          // Stage 2: interleave 16-bit pairs.
          trn1    \r4\().8h, \r0\().8h, \r2\().8h
          trn2    \r2\().8h, \r0\().8h, \r2\().8h
          trn1    \r6\().8h, \r5\().8h, \r7\().8h
          trn2    \r7\().8h, \r5\().8h, \r7\().8h
          trn1    \r5\().8h, \t1\().8h, \r3\().8h
          trn2    \t1\().8h, \t1\().8h, \r3\().8h
          trn1    \r3\().8h, \t0\().8h, \r1\().8h
          trn2    \t0\().8h, \t0\().8h, \r1\().8h

          // Stage 3: interleave 32-bit groups; every output row is final here.
          trn1    \r0\().4s, \r3\().4s, \r4\().4s
          trn2    \r4\().4s, \r3\().4s, \r4\().4s
          trn1    \r1\().4s, \r5\().4s, \r6\().4s
          trn2    \r5\().4s, \r5\().4s, \r6\().4s
          // trn2 comes first for this pair: r2 is a source of both
          // instructions and the destination of the trn1.
          trn2    \r6\().4s, \t0\().4s, \r2\().4s
          trn1    \r2\().4s, \t0\().4s, \r2\().4s
          trn1    \r3\().4s, \t1\().4s, \r7\().4s
          trn2    \r7\().4s, \t1\().4s, \r7\().4s
  .endm
  // transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
  //   In-place byte transpose of r0-r3: each aligned 4-byte column group of
  //   the four 128-bit rows forms an independent 4x4 byte matrix (four such
  //   blocks in total). t4-t7 are scratch registers.
  //   Register names are passed without an arrangement suffix.
  //   Written with the transpose helper; it expands to the same trn1/trn2
  //   sequence as spelling the instructions out.
  .macro  transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
          // Stage 1: interleave bytes of rows 0/1 and rows 2/3 into scratch.
          transpose   \t4\().16b, \t5\().16b, \r0\().16b, \r1\().16b
          transpose   \t6\().16b, \t7\().16b, \r2\().16b, \r3\().16b
          // Stage 2: interleave 16-bit pairs back into the row registers.
          transpose   \r0\().8h, \r2\().8h, \t4\().8h, \t6\().8h
          transpose   \r1\().8h, \r3\().8h, \t5\().8h, \t7\().8h
  .endm
  // transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
  //   64-bit variant of transpose_4x16.b: each aligned 4-byte column group of
  //   the four 8-byte rows r0-r3 forms an independent 4x4 byte matrix (two
  //   blocks in total), transposed in place. t4-t7 are scratch registers.
  //   Register names are passed without an arrangement suffix.
  //   Written with the transpose helper; it expands to the same trn1/trn2
  //   sequence as spelling the instructions out.
  .macro  transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
          // Stage 1: interleave bytes of rows 0/1 and rows 2/3 into scratch.
          transpose   \t4\().8b, \t5\().8b, \r0\().8b, \r1\().8b
          transpose   \t6\().8b, \t7\().8b, \r2\().8b, \r3\().8b
          // Stage 2: interleave 16-bit pairs back into the row registers.
          transpose   \r0\().4h, \r2\().4h, \t4\().4h, \t6\().4h
          transpose   \r1\().4h, \r3\().4h, \t5\().4h, \t7\().4h
  .endm