// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__arm__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.fpu neon
.code 32
#undef __thumb2__
.globl gcm_init_v8
.hidden gcm_init_v8
.type gcm_init_v8,%function
.align 4
gcm_init_v8:
    AARCH64_VALID_CALL_TARGET
    vld1.64 {q9},[r1]           @ load input H
    vmov.i8 q11,#0xe1
    vshl.i64 q11,q11,#57        @ 0xc2.0
    vext.8 q3,q9,q9,#8
    vshr.u64 q10,q11,#63
    vdup.32 q9,d18[1]
    vext.8 q8,q10,q11,#8        @ t0=0xc2....01
    vshr.u64 q10,q3,#63
    vshr.s32 q9,q9,#31          @ broadcast carry bit
    vand q10,q10,q8
    vshl.i64 q3,q3,#1
    vext.8 q10,q10,q10,#8
    vand q8,q8,q9
    vorr q3,q3,q10              @ H<<<=1
    veor q12,q3,q8              @ twisted H
    vst1.64 {q12},[r0]!         @ store Htable[0]

    @ calculate H^2
    vext.8 q8,q12,q12,#8        @ Karatsuba pre-processing
    .byte 0xa8,0x0e,0xa8,0xf2   @ pmull q0,q12,q12
    veor q8,q8,q12
    .byte 0xa9,0x4e,0xa9,0xf2   @ pmull2 q2,q12,q12
    .byte 0xa0,0x2e,0xa0,0xf2   @ pmull q1,q8,q8
    vext.8 q9,q0,q2,#8          @ Karatsuba post-processing
    veor q10,q0,q2
    veor q1,q1,q9
    veor q1,q1,q10
    .byte 0x26,0x4e,0xe0,0xf2   @ pmull q10,q0,q11 @ 1st phase
    vmov d4,d3                  @ Xh|Xm - 256-bit result
    vmov d3,d0                  @ Xm is rotated Xl
    veor q0,q1,q10
    vext.8 q10,q0,q0,#8         @ 2nd phase
    .byte 0x26,0x0e,0xa0,0xf2   @ pmull q0,q0,q11
    veor q10,q10,q2
    veor q14,q0,q10

    vext.8 q9,q14,q14,#8        @ Karatsuba pre-processing
    veor q9,q9,q14
    vext.8 q13,q8,q9,#8         @ pack Karatsuba pre-processed
    vst1.64 {q13,q14},[r0]!     @ store Htable[1..2]
    bx lr
.size gcm_init_v8,.-gcm_init_v8
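@ gcm_init_v8 leaves three values at Htable for the routines below:
@ Htable[0] is the "twisted" H (H<<1 with the overflow bit folded back
@ in via the 0xc2 constant, so the reduction can work on the rotated,
@ reflection-free representation), Htable[1] packs the Karatsuba
@ pre-processed halves (H.lo^H.hi alongside H^2.lo^H^2.hi), and
@ Htable[2] is H^2 for the two-block-aggregated loop in gcm_ghash_v8.
@ The raw .byte sequences throughout are the pmull/pmull2 encodings,
@ emitted directly so the file assembles even when the assembler lacks
@ crypto-extension support.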
.globl gcm_gmult_v8
.hidden gcm_gmult_v8
.type gcm_gmult_v8,%function
.align 4
gcm_gmult_v8:
    AARCH64_VALID_CALL_TARGET
    vld1.64 {q9},[r0]           @ load Xi
    vmov.i8 q11,#0xe1
    vld1.64 {q12,q13},[r1]      @ load twisted H, ...
    vshl.u64 q11,q11,#57
#ifndef __ARMEB__
    vrev64.8 q9,q9
#endif
    vext.8 q3,q9,q9,#8
    .byte 0x86,0x0e,0xa8,0xf2   @ pmull q0,q12,q3 @ H.lo·Xi.lo
    veor q9,q9,q3               @ Karatsuba pre-processing
    .byte 0x87,0x4e,0xa9,0xf2   @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
    .byte 0xa2,0x2e,0xaa,0xf2   @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
    vext.8 q9,q0,q2,#8          @ Karatsuba post-processing
    veor q10,q0,q2
    veor q1,q1,q9
    veor q1,q1,q10
    .byte 0x26,0x4e,0xe0,0xf2   @ pmull q10,q0,q11 @ 1st phase of reduction
    vmov d4,d3                  @ Xh|Xm - 256-bit result
    vmov d3,d0                  @ Xm is rotated Xl
    veor q0,q1,q10
    vext.8 q10,q0,q0,#8         @ 2nd phase of reduction
    .byte 0x26,0x0e,0xa0,0xf2   @ pmull q0,q0,q11
    veor q10,q10,q2
    veor q0,q0,q10
#ifndef __ARMEB__
    vrev64.8 q0,q0
#endif
    vext.8 q0,q0,q0,#8
    vst1.64 {q0},[r0]           @ write out Xi
    bx lr
.size gcm_gmult_v8,.-gcm_gmult_v8
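@ The sequence above is one GF(2^128) product Xi = Xi*H modulo
@ x^128+x^7+x^2+x+1: Karatsuba replaces the four 64x64 carry-less
@ multiplies with three pmull/pmull2 instructions, and the 256-bit
@ result is folded back to 128 bits in the two "phase" steps that
@ multiply by the 0xc2 reduction constant held in q11.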
.globl gcm_ghash_v8
.hidden gcm_ghash_v8
.type gcm_ghash_v8,%function
.align 4
gcm_ghash_v8:
    AARCH64_VALID_CALL_TARGET
    vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15}  @ 32-bit ABI says so
    vld1.64 {q0},[r0]           @ load [rotated] Xi
                                @ "[rotated]" means that
                                @ loaded value would have
                                @ to be rotated in order to
                                @ make it appear as in
                                @ algorithm specification
    subs r3,r3,#32              @ see if r3 is 32 or larger
    mov r12,#16                 @ r12 is used as post-
                                @ increment for input pointer;
                                @ as loop is modulo-scheduled
                                @ r12 is zeroed just in time
                                @ to preclude overstepping
                                @ inp[len], which means that
                                @ last block[s] are actually
                                @ loaded twice, but last
                                @ copy is not processed
    vld1.64 {q12,q13},[r1]!     @ load twisted H, ..., H^2
    vmov.i8 q11,#0xe1
    vld1.64 {q14},[r1]
    moveq r12,#0                @ is it time to zero r12?
    vext.8 q0,q0,q0,#8          @ rotate Xi
    vld1.64 {q8},[r2]!          @ load [rotated] I[0]
    vshl.u64 q11,q11,#57        @ compose 0xc2.0 constant
#ifndef __ARMEB__
    vrev64.8 q8,q8
    vrev64.8 q0,q0
#endif
    vext.8 q3,q8,q8,#8          @ rotate I[0]
    blo .Lodd_tail_v8           @ r3 was less than 32
    vld1.64 {q9},[r2],r12       @ load [rotated] I[1]
#ifndef __ARMEB__
    vrev64.8 q9,q9
#endif
    vext.8 q7,q9,q9,#8
    veor q3,q3,q0               @ I[i]^=Xi
    .byte 0x8e,0x8e,0xa8,0xf2   @ pmull q4,q12,q7 @ H·Ii+1
    veor q9,q9,q7               @ Karatsuba pre-processing
    .byte 0x8f,0xce,0xa9,0xf2   @ pmull2 q6,q12,q7
    b .Loop_mod2x_v8
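@ The loop below processes 32 bytes per iteration using 2x
@ aggregation: Xi = (Xi + I[i])*H^2 + I[i+1]*H, with all arithmetic
@ in GF(2^128), so only one reduction is paid per two blocks. Loads
@ of I[i+2] and I[i+3] are interleaved with the multiplies, and r12
@ drops to 0 near the end so the final block is re-read in place
@ rather than reading past inp[len].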
.align 4
.Loop_mod2x_v8:
    vext.8 q10,q3,q3,#8
    subs r3,r3,#32              @ is there more data?
    .byte 0x86,0x0e,0xac,0xf2   @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
    movlo r12,#0                @ is it time to zero r12?
    .byte 0xa2,0xae,0xaa,0xf2   @ pmull q5,q13,q9
    veor q10,q10,q3             @ Karatsuba pre-processing
    .byte 0x87,0x4e,0xad,0xf2   @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
    veor q0,q0,q4               @ accumulate
    .byte 0xa5,0x2e,0xab,0xf2   @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
    vld1.64 {q8},[r2],r12       @ load [rotated] I[i+2]
    veor q2,q2,q6
    moveq r12,#0                @ is it time to zero r12?
    veor q1,q1,q5
    vext.8 q9,q0,q2,#8          @ Karatsuba post-processing
    veor q10,q0,q2
    veor q1,q1,q9
    vld1.64 {q9},[r2],r12       @ load [rotated] I[i+3]
#ifndef __ARMEB__
    vrev64.8 q8,q8
#endif
    veor q1,q1,q10
    .byte 0x26,0x4e,0xe0,0xf2   @ pmull q10,q0,q11 @ 1st phase of reduction
#ifndef __ARMEB__
    vrev64.8 q9,q9
#endif
    vmov d4,d3                  @ Xh|Xm - 256-bit result
    vmov d3,d0                  @ Xm is rotated Xl
    vext.8 q7,q9,q9,#8
    vext.8 q3,q8,q8,#8
    veor q0,q1,q10
    .byte 0x8e,0x8e,0xa8,0xf2   @ pmull q4,q12,q7 @ H·Ii+1
    veor q3,q3,q2               @ accumulate q3 early
    vext.8 q10,q0,q0,#8         @ 2nd phase of reduction
    .byte 0x26,0x0e,0xa0,0xf2   @ pmull q0,q0,q11
    veor q3,q3,q10
    veor q9,q9,q7               @ Karatsuba pre-processing
    veor q3,q3,q0
    .byte 0x8f,0xce,0xa9,0xf2   @ pmull2 q6,q12,q7
    bhs .Loop_mod2x_v8          @ there was at least 32 more bytes

    veor q2,q2,q10
    vext.8 q3,q8,q8,#8          @ re-construct q3
    adds r3,r3,#32              @ re-construct r3
    veor q0,q0,q2               @ re-construct q0
    beq .Ldone_v8               @ is r3 zero?
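@ Odd tail: exactly one 16-byte block remains, so fall through and
@ fold it in with a single, un-aggregated multiply by H (the same
@ sequence as gcm_gmult_v8).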
.Lodd_tail_v8:
    vext.8 q10,q0,q0,#8
    veor q3,q3,q0               @ inp^=Xi
    veor q9,q8,q10              @ q9 is rotated inp^Xi
    .byte 0x86,0x0e,0xa8,0xf2   @ pmull q0,q12,q3 @ H.lo·Xi.lo
    veor q9,q9,q3               @ Karatsuba pre-processing
    .byte 0x87,0x4e,0xa9,0xf2   @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
    .byte 0xa2,0x2e,0xaa,0xf2   @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
    vext.8 q9,q0,q2,#8          @ Karatsuba post-processing
    veor q10,q0,q2
    veor q1,q1,q9
    veor q1,q1,q10
    .byte 0x26,0x4e,0xe0,0xf2   @ pmull q10,q0,q11 @ 1st phase of reduction
    vmov d4,d3                  @ Xh|Xm - 256-bit result
    vmov d3,d0                  @ Xm is rotated Xl
    veor q0,q1,q10
    vext.8 q10,q0,q0,#8         @ 2nd phase of reduction
    .byte 0x26,0x0e,0xa0,0xf2   @ pmull q0,q0,q11
    veor q10,q10,q2
    veor q0,q0,q10

.Ldone_v8:
#ifndef __ARMEB__
    vrev64.8 q0,q0
#endif
    vext.8 q0,q0,q0,#8
    vst1.64 {q0},[r0]           @ write out Xi
    vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}  @ 32-bit ABI says so
    bx lr
.size gcm_ghash_v8,.-gcm_ghash_v8
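@ For reference, the C prototypes these entry points are called
@ through (a sketch matching the BoringSSL GCM glue code; u128 is a
@ pair of 64-bit words there, and len must be a multiple of the
@ 16-byte block size):
@
@   void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
@   void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
@   void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
@                     const uint8_t *inp, size_t len);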
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0  @ "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align 2
.align 2
#endif
#endif
#endif  // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits