ghashv8-armx32.S

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.code 32
#undef __thumb2__
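@ GHASH for ARMv8 Crypto Extensions, 32-bit (AArch32) mode. Three entry
@ points follow, taking their arguments in r0-r3 per the AAPCS. The
@ BoringSSL callers declare them roughly as follows (shown here only
@ for reference; u128 is a pair of 64-bit halves):
@
@   void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
@   void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
@   void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
@                     const uint8_t *inp, size_t len);
@
@ gcm_init_v8 derives the key schedule from the hash key H,
@ gcm_gmult_v8 multiplies one Xi block by H in GF(2^128), and
@ gcm_ghash_v8 folds len bytes of input into Xi.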
.globl _gcm_init_v8
.private_extern _gcm_init_v8
#ifdef __thumb2__
.thumb_func _gcm_init_v8
#endif
.align 4
_gcm_init_v8:
vld1.64 {q9},[r1] @ load input H
vmov.i8 q11,#0xe1
vshl.i64 q11,q11,#57 @ 0xc2.0
vext.8 q3,q9,q9,#8
vshr.u64 q10,q11,#63
vdup.32 q9,d18[1]
vext.8 q8,q10,q11,#8 @ t0=0xc2....01
vshr.u64 q10,q3,#63
vshr.s32 q9,q9,#31 @ broadcast carry bit
vand q10,q10,q8
vshl.i64 q3,q3,#1
vext.8 q10,q10,q10,#8
vand q8,q8,q9
vorr q3,q3,q10 @ H<<<=1
veor q12,q3,q8 @ twisted H
vst1.64 {q12},[r0]! @ store Htable[0]
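@ The stored key is "twisted": the shift/carry sequence above in effect
@ multiplies H by x in GF(2^128) (in the bit-reflected GHASH
@ representation), folding the carried-out bit back in via the
@ 0xc2....01 constant. This is what lets both reduction phases below
@ make do with the single constant kept in q11.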
@ calculate H^2
vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
veor q8,q8,q12
.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q14,q0,q10
vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
veor q9,q9,q14
vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
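@ Htable layout on exit: Htable[0] holds the twisted H, Htable[1] the
@ packed Karatsuba factors (H.lo^H.hi in one 64-bit lane, the
@ corresponding xor for H^2 in the other), Htable[2] the twisted H^2.
@ gcm_gmult_v8 and gcm_ghash_v8 below consume exactly this layout.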
bx lr
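@ gcm_gmult_v8 computes Xi = (Xi · H) mod P over GF(2^128), with
@ P = x^128 + x^7 + x^2 + x + 1: one Karatsuba multiply followed by a
@ two-phase reduction, the same pattern the bulk loop in gcm_ghash_v8
@ uses.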
.globl _gcm_gmult_v8
.private_extern _gcm_gmult_v8
#ifdef __thumb2__
.thumb_func _gcm_gmult_v8
#endif
.align 4
_gcm_gmult_v8:
vld1.64 {q9},[r0] @ load Xi
vmov.i8 q11,#0xe1
vld1.64 {q12,q13},[r1] @ load twisted H, ...
vshl.u64 q11,q11,#57
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q3,q9,q9,#8
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
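@ Karatsuba: the 128x128-bit carry-less product is assembled from just
@ three 64x64 pmulls, q0 = H.lo·Xi.lo, q2 = H.hi·Xi.hi and
@ q1 = (H.lo+H.hi)·(Xi.lo+Xi.hi); the post-processing below recovers
@ the middle term as q1 ^= q0 ^ q2 and splices it between q0 and q2.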
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
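@ Reduce the 256-bit product modulo P in two phases, each a pmull by
@ the 0xc2....01 constant in q11 plus a rotate-and-xor; the twisted
@ key from gcm_init_v8 is what makes this short form work.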
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
bx lr
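@ gcm_ghash_v8 folds len bytes from inp into Xi (len is a multiple of
@ the 16-byte block size). The main loop is modulo-scheduled and
@ consumes two blocks per iteration in the aggregated form
@   Xi = ((Xi ^ I[i])·H^2 ^ I[i+1]·H) mod P,
@ paying for one reduction per pair of blocks; a single trailing block
@ falls through to Lodd_tail_v8.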
.globl _gcm_ghash_v8
.private_extern _gcm_ghash_v8
#ifdef __thumb2__
.thumb_func _gcm_ghash_v8
#endif
.align 4
_gcm_ghash_v8:
vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
vld1.64 {q0},[r0] @ load [rotated] Xi
@ "[rotated]" means that the
@ loaded value would have
@ to be rotated in order to
@ appear as in the
@ algorithm specification
subs r3,r3,#32 @ see if r3 is 32 or larger
mov r12,#16 @ r12 is used as the post-
@ increment for the input pointer;
@ as the loop is modulo-scheduled
@ r12 is zeroed just in time
@ to preclude overstepping
@ inp[len], which means that the
@ last block[s] are actually
@ loaded twice, but the last
@ copy is not processed
vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
vmov.i8 q11,#0xe1
vld1.64 {q14},[r1]
moveq r12,#0 @ is it time to zero r12?
vext.8 q0,q0,q0,#8 @ rotate Xi
vld1.64 {q8},[r2]! @ load [rotated] I[0]
vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
#ifndef __ARMEB__
vrev64.8 q8,q8
vrev64.8 q0,q0
#endif
vext.8 q3,q8,q8,#8 @ rotate I[0]
blo Lodd_tail_v8 @ r3 was less than 32
vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vext.8 q7,q9,q9,#8
veor q3,q3,q0 @ I[i]^=Xi
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q9,q9,q7 @ Karatsuba pre-processing
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
b Loop_mod2x_v8
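@ Loop invariant: q4/q6 (and q9) already hold the partial products of
@ I[i+1]·H started above, so each pass overlaps the H^2 multiply of
@ the running value with the reduction and with the loads of the next
@ two input blocks.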
.align 4
Loop_mod2x_v8:
vext.8 q10,q3,q3,#8
subs r3,r3,#32 @ is there more data?
.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
movlo r12,#0 @ is it time to zero r12?
.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
veor q10,q10,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
veor q0,q0,q4 @ accumulate
.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
veor q2,q2,q6
moveq r12,#0 @ is it time to zero r12?
veor q1,q1,q5
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
#ifndef __ARMEB__
vrev64.8 q8,q8
#endif
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
#ifndef __ARMEB__
vrev64.8 q9,q9
#endif
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
vext.8 q7,q9,q9,#8
vext.8 q3,q8,q8,#8
veor q0,q1,q10
.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
veor q3,q3,q2 @ accumulate q3 early
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q3,q3,q10
veor q9,q9,q7 @ Karatsuba pre-processing
veor q3,q3,q0
.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
bhs Loop_mod2x_v8 @ there were at least 32 more bytes
veor q2,q2,q10
vext.8 q3,q8,q8,#8 @ re-construct q3
adds r3,r3,#32 @ re-construct r3
veor q0,q0,q2 @ re-construct q0
beq Ldone_v8 @ is r3 zero?
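@ Odd tail: exactly one block remains. Fold it into Xi and multiply by
@ H once, via the same Karatsuba-plus-reduction sequence as
@ gcm_gmult_v8.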
Lodd_tail_v8:
vext.8 q10,q0,q0,#8
veor q3,q3,q0 @ inp^=Xi
veor q9,q8,q10 @ q9 is rotated inp^Xi
.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
veor q9,q9,q3 @ Karatsuba pre-processing
.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
veor q10,q0,q2
veor q1,q1,q9
veor q1,q1,q10
.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
vmov d4,d3 @ Xh|Xm - 256-bit result
vmov d3,d0 @ Xm is rotated Xl
veor q0,q1,q10
vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
veor q10,q10,q2
veor q0,q0,q10
Ldone_v8:
#ifndef __ARMEB__
vrev64.8 q0,q0
#endif
vext.8 q0,q0,q0,#8
vst1.64 {q0},[r0] @ write out Xi
vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
bx lr
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM