// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text
.globl gcm_init_neon
.hidden gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
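// Arguments: x0 points at Htable (output), x1 at the hash key H. As in
// gcm_init_v8, H is shifted left by one bit and reduced by the bit-reflected
// GHASH polynomial x^128 + x^7 + x^2 + x + 1 (hence the 0xc2 constant); the
// resulting "twisted H" is stored as Htable[0] for the routines below.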
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
shl v19.2d, v19.2d, #57 // 0xc2.0
ext v3.16b, v17.16b, v17.16b, #8
ushr v18.2d, v19.2d, #63
dup v17.4s, v17.s[1]
ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
ushr v18.2d, v3.2d, #63
sshr v17.4s, v17.4s, #31 // broadcast carry bit
and v18.16b, v18.16b, v16.16b
shl v3.2d, v3.2d, #1
ext v18.16b, v18.16b, v18.16b, #8
and v16.16b, v16.16b, v17.16b
orr v3.16b, v3.16b, v18.16b // H<<<=1
eor v5.16b, v3.16b, v16.16b // twisted H
st1 {v5.2d}, [x0] // store Htable[0]
ret
.size gcm_init_neon,.-gcm_init_neon
.globl gcm_gmult_neon
.hidden gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
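// Arguments: x0 points at Xi (the 16-byte GHASH accumulator, updated in
// place), x1 at Htable, whose first entry is the twisted H computed by
// gcm_init_neon.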
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, .Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v3.16b, v3.16b // byteswap Xi
ext v3.16b, v3.16b, v3.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
mov x3, #16
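// x3 is the byte count consumed by the shared multiply below; setting it to
// 16 makes the .Lgmult_neon/.Loop_neon body run exactly once, computing a
// single Xi * H.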
b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.globl gcm_ghash_neon
.hidden gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
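// Arguments: x0 points at Xi (updated in place), x1 at Htable (twisted H
// from gcm_init_neon), x2 at the input, x3 holds the length in bytes; each
// .Loop_neon iteration folds one 16-byte block into Xi.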
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, .Lmasks // load constants
add x9, x9, :lo12:.Lmasks
ld1 {v24.2d, v25.2d}, [x9]
rev64 v0.16b, v0.16b // byteswap Xi
ext v0.16b, v0.16b, v0.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
.Loop_neon:
ld1 {v3.16b}, [x2], #16 // load inp
rev64 v3.16b, v3.16b // byteswap inp
ext v3.16b, v3.16b, v3.16b, #8
eor v3.16b, v3.16b, v0.16b // inp ^= Xi
.Lgmult_neon:
// Split the input into v3 and v4. (The upper halves are unused,
// so it is okay to leave them alone.)
ins v4.d[0], v3.d[1]
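// The 128x128-bit carry-less multiply is performed Karatsuba-style as three
// 64x64-bit multiplies: v5*v3 (giving Xl in v0), v7*(v3^v4) with v7 = v5^v6
// (the middle term, in v1), and v6*v4 (giving Xh in v2). Each 64x64 multiply
// is in turn built from 8x8-bit pmull instructions.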
ext v16.8b, v5.8b, v5.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v0.8b, v3.8b, v3.8b, #1 // B1
pmull v0.8h, v5.8b, v0.8b // E = A*B1
ext v17.8b, v5.8b, v5.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v5.8b, v19.8b // G = A*B2
ext v18.8b, v5.8b, v5.8b, #3 // A3
eor v16.16b, v16.16b, v0.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v0.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v0.8h, v5.8b, v0.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v0.16b // N = I + J
pmull v19.8h, v5.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
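// The zip1/zip2 pairs below pack the low and high halves of two partial
// products into one 128-bit register each, so a single eor and a single and
// (with the masks from .Lmasks) handle two 64-bit lanes at once before the
// halves are unzipped again.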
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v0.8h, v5.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v0.16b, v0.16b, v16.16b
eor v0.16b, v0.16b, v18.16b
eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
ext v16.8b, v7.8b, v7.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v1.8b, v3.8b, v3.8b, #1 // B1
pmull v1.8h, v7.8b, v1.8b // E = A*B1
ext v17.8b, v7.8b, v7.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v7.8b, v19.8b // G = A*B2
ext v18.8b, v7.8b, v7.8b, #3 // A3
eor v16.16b, v16.16b, v1.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v1.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v1.8h, v7.8b, v1.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v1.16b // N = I + J
pmull v19.8h, v7.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v1.8h, v7.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v1.16b, v1.16b, v16.16b
eor v1.16b, v1.16b, v18.16b
ext v16.8b, v6.8b, v6.8b, #1 // A1
pmull v16.8h, v16.8b, v4.8b // F = A1*B
ext v2.8b, v4.8b, v4.8b, #1 // B1
pmull v2.8h, v6.8b, v2.8b // E = A*B1
ext v17.8b, v6.8b, v6.8b, #2 // A2
pmull v17.8h, v17.8b, v4.8b // H = A2*B
ext v19.8b, v4.8b, v4.8b, #2 // B2
pmull v19.8h, v6.8b, v19.8b // G = A*B2
ext v18.8b, v6.8b, v6.8b, #3 // A3
eor v16.16b, v16.16b, v2.16b // L = E + F
pmull v18.8h, v18.8b, v4.8b // J = A3*B
ext v2.8b, v4.8b, v4.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v2.8h, v6.8b, v2.8b // I = A*B3
// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v4.8b, v4.8b, #4 // B4
eor v18.16b, v18.16b, v2.16b // N = I + J
pmull v19.8h, v6.8b, v19.8b // K = A*B4
// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v2.8h, v6.8b, v4.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v2.16b, v2.16b, v16.16b
eor v2.16b, v2.16b, v18.16b
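// At this point v0 = v5*v3 (Xl), v2 = v6*v4 (Xh) and v1 = v7*(v3^v4). The
// Karatsuba post-processing below recovers the middle 128 bits
// (v1 ^= v0 ^ v2) and merges them into the 256-bit product Xh:Xl.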
ext v16.16b, v0.16b, v2.16b, #8
eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
eor v1.16b, v1.16b, v2.16b
eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins v2.d[0], v1.d[1]
// equivalent of reduction_avx from ghash-x86_64.pl
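// The two-phase reduction folds the 256-bit product modulo the bit-reflected
// GHASH polynomial x^128 + x^7 + x^2 + x + 1: the first phase uses left
// shifts by 57/62/63 (the x^7, x^2 and x terms), the second the matching
// right shifts by 1, 2 and 7.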
shl v17.2d, v0.2d, #57 // 1st phase
shl v18.2d, v0.2d, #62
eor v18.16b, v18.16b, v17.16b //
shl v17.2d, v0.2d, #63
eor v18.16b, v18.16b, v17.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor v18.16b, v18.16b, v1.16b
ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
ushr v18.2d, v0.2d, #1 // 2nd phase
eor v2.16b, v2.16b, v0.16b
eor v0.16b, v0.16b, v18.16b //
ushr v18.2d, v18.2d, #6
ushr v0.2d, v0.2d, #1 //
eor v0.16b, v0.16b, v2.16b //
eor v0.16b, v0.16b, v18.16b //
subs x3, x3, #16
bne .Loop_neon
rev64 v0.16b, v0.16b // byteswap Xi and write
ext v0.16b, v0.16b, v0.16b, #8
st1 {v0.16b}, [x0]
ret
.size gcm_ghash_neon,.-gcm_ghash_neon
.section .rodata
.align 4
.Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
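// Loaded pairwise above as v24 = {k48, k32} and v25 = {k16, k0}; the all-zero
// k0 lane is what clears t3's upper half (the vmov.i64 in the comments above).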
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
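// The .byte sequence above encodes the string
// "GHASH for ARMv8, derived from ARMv4 version by <appro@openssl.org>".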
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits