// ghash-neon-armv8.S

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl gcm_init_neon
.def gcm_init_neon
	.type 32
.endef
.align 4
gcm_init_neon:
	AARCH64_VALID_CALL_TARGET
	// This function is adapted from gcm_init_v8. xC2 is t3.
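	// As in gcm_init_v8, H is pre-multiplied by x: the value is shifted
	// left by one bit ("H<<<=1" below) and, when the bit shifted out of
	// the top is set, the 0xc2....01 constant encoding the low terms of
	// the GHASH polynomial is xored back in. The result is the "twisted H"
	// that gcm_gmult_neon and gcm_ghash_neon load from Htable[0].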
	ld1 {v17.2d}, [x1] // load H
	movi v19.16b, #0xe1
	shl v19.2d, v19.2d, #57 // 0xc2.0
	ext v3.16b, v17.16b, v17.16b, #8
	ushr v18.2d, v19.2d, #63
	dup v17.4s, v17.s[1]
	ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
	ushr v18.2d, v3.2d, #63
	sshr v17.4s, v17.4s, #31 // broadcast carry bit
	and v18.16b, v18.16b, v16.16b
	shl v3.2d, v3.2d, #1
	ext v18.16b, v18.16b, v18.16b, #8
	and v16.16b, v16.16b, v17.16b
	orr v3.16b, v3.16b, v18.16b // H<<<=1
	eor v5.16b, v3.16b, v16.16b // twisted H
	st1 {v5.2d}, [x0] // store Htable[0]
	ret

.globl gcm_gmult_neon
.def gcm_gmult_neon
	.type 32
.endef
.align 4
gcm_gmult_neon:
	AARCH64_VALID_CALL_TARGET
	ld1 {v3.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, Lmasks // load constants
	add x9, x9, :lo12:Lmasks
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v3.16b, v3.16b // byteswap Xi
	ext v3.16b, v3.16b, v3.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
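	// A single multiplication is performed by entering the shared
	// Lgmult_neon body with the remaining length x3 preset to 16, so the
	// subs/bne at the bottom of Loop_neon falls through after one pass.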
	mov x3, #16
	b Lgmult_neon

.globl gcm_ghash_neon
.def gcm_ghash_neon
	.type 32
.endef
.align 4
gcm_ghash_neon:
	AARCH64_VALID_CALL_TARGET
	ld1 {v0.16b}, [x0] // load Xi
	ld1 {v5.1d}, [x1], #8 // load twisted H
	ld1 {v6.1d}, [x1]
	adrp x9, Lmasks // load constants
	add x9, x9, :lo12:Lmasks
	ld1 {v24.2d, v25.2d}, [x9]
	rev64 v0.16b, v0.16b // byteswap Xi
	ext v0.16b, v0.16b, v0.16b, #8
	eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
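	// Main loop: x0 points to Xi, x1 to the twisted H written by
	// gcm_init_neon, x2 to the input and x3 to the remaining length in
	// bytes. Each iteration folds one 16-byte block into Xi and then
	// multiplies by H.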
Loop_neon:
	ld1 {v3.16b}, [x2], #16 // load inp
	rev64 v3.16b, v3.16b // byteswap inp
	ext v3.16b, v3.16b, v3.16b, #8
	eor v3.16b, v3.16b, v0.16b // inp ^= Xi
Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins v4.d[0], v3.d[1]
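	// The 128x128-bit product is built Karatsuba-style from three
	// 64x64-bit carry-less multiplications: v5*v3, v6*v4 and, for the
	// middle term, v7*(v3^v4), where v7 = v5^v6 was prepared above as
	// "Karatsuba pre-processing".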
	ext v16.8b, v5.8b, v5.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v0.8b, v3.8b, v3.8b, #1 // B1
	pmull v0.8h, v5.8b, v0.8b // E = A*B1
	ext v17.8b, v5.8b, v5.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v5.8b, v19.8b // G = A*B2
	ext v18.8b, v5.8b, v5.8b, #3 // A3
	eor v16.16b, v16.16b, v0.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v0.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v0.8h, v5.8b, v0.8b // I = A*B3
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
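	// Only an 8-bit polynomial multiplier is available here, so the
	// 64x64-bit product is assembled from the byte-rotated partial
	// products: the high halves of L, M, N and K are masked with
	// k48/k32/k16/k0 (held in v24/v25), and the results are shifted into
	// place by 8, 16, 24 and 32 bits (the ext #15/#14/#13/#12 steps below)
	// before being folded into D = A*B.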
	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v0.16b // N = I + J
	pmull v19.8h, v5.8b, v19.8b // K = A*B4
	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d
	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v0.8h, v5.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v0.16b, v0.16b, v16.16b
	eor v0.16b, v0.16b, v18.16b
	eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
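	// v0 now holds the first 64x64-bit product (v5*v3). The same sequence
	// is repeated below for the middle Karatsuba term, multiplying
	// v7 = v5^v6 by v3^v4 into v1.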
	ext v16.8b, v7.8b, v7.8b, #1 // A1
	pmull v16.8h, v16.8b, v3.8b // F = A1*B
	ext v1.8b, v3.8b, v3.8b, #1 // B1
	pmull v1.8h, v7.8b, v1.8b // E = A*B1
	ext v17.8b, v7.8b, v7.8b, #2 // A2
	pmull v17.8h, v17.8b, v3.8b // H = A2*B
	ext v19.8b, v3.8b, v3.8b, #2 // B2
	pmull v19.8h, v7.8b, v19.8b // G = A*B2
	ext v18.8b, v7.8b, v7.8b, #3 // A3
	eor v16.16b, v16.16b, v1.16b // L = E + F
	pmull v18.8h, v18.8b, v3.8b // J = A3*B
	ext v1.8b, v3.8b, v3.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v1.8h, v7.8b, v1.8b // I = A*B3
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
	ext v19.8b, v3.8b, v3.8b, #4 // B4
	eor v18.16b, v18.16b, v1.16b // N = I + J
	pmull v19.8h, v7.8b, v19.8b // K = A*B4
	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d
	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v1.8h, v7.8b, v3.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v1.16b, v1.16b, v16.16b
	eor v1.16b, v1.16b, v18.16b
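	// v1 now holds the middle Karatsuba term. The third and final
	// 64x64-bit product, v6*v4, is computed into v2 below.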
	ext v16.8b, v6.8b, v6.8b, #1 // A1
	pmull v16.8h, v16.8b, v4.8b // F = A1*B
	ext v2.8b, v4.8b, v4.8b, #1 // B1
	pmull v2.8h, v6.8b, v2.8b // E = A*B1
	ext v17.8b, v6.8b, v6.8b, #2 // A2
	pmull v17.8h, v17.8b, v4.8b // H = A2*B
	ext v19.8b, v4.8b, v4.8b, #2 // B2
	pmull v19.8h, v6.8b, v19.8b // G = A*B2
	ext v18.8b, v6.8b, v6.8b, #3 // A3
	eor v16.16b, v16.16b, v2.16b // L = E + F
	pmull v18.8h, v18.8b, v4.8b // J = A3*B
	ext v2.8b, v4.8b, v4.8b, #3 // B3
	eor v17.16b, v17.16b, v19.16b // M = G + H
	pmull v2.8h, v6.8b, v2.8b // I = A*B3
	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
	// vand $t0#hi, $t0#hi, $k48
	// veor $t0#lo, $t0#lo, $t0#hi
	//
	// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
	// vand $t1#hi, $t1#hi, $k32
	// veor $t1#lo, $t1#lo, $t1#hi
	//
	// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
	// vand $t2#hi, $t2#hi, $k16
	// veor $t2#lo, $t2#lo, $t2#hi
	//
	// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
	// vmov.i64 $t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.
	ext v19.8b, v4.8b, v4.8b, #4 // B4
	eor v18.16b, v18.16b, v2.16b // N = I + J
	pmull v19.8h, v6.8b, v19.8b // K = A*B4
	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1 v20.2d, v16.2d, v17.2d
	zip1 v22.2d, v18.2d, v19.2d
	zip2 v21.2d, v16.2d, v17.2d
	zip2 v23.2d, v18.2d, v19.2d
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	and v21.16b, v21.16b, v24.16b
	and v23.16b, v23.16b, v25.16b
	eor v20.16b, v20.16b, v21.16b
	eor v22.16b, v22.16b, v23.16b
	zip1 v16.2d, v20.2d, v21.2d
	zip1 v18.2d, v22.2d, v23.2d
	zip2 v17.2d, v20.2d, v21.2d
	zip2 v19.2d, v22.2d, v23.2d
	ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
	ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
	pmull v2.8h, v6.8b, v4.8b // D = A*B
	ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
	ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
	eor v16.16b, v16.16b, v17.16b
	eor v18.16b, v18.16b, v19.16b
	eor v2.16b, v2.16b, v16.16b
	eor v2.16b, v2.16b, v18.16b
	ext v16.16b, v0.16b, v2.16b, #8
	eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
	eor v1.16b, v1.16b, v2.16b
	eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
	ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins v2.d[0], v1.d[1]
	// equivalent of reduction_avx from ghash-x86_64.pl
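	// The 256-bit Xh|Xl product is reduced modulo the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 in two phases; the shl #63/#62/#57 amounts
	// correspond to the x, x^2 and x^7 terms of the modulus, and the ushr
	// steps of the 2nd phase fold the high half back into the low half.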
	shl v17.2d, v0.2d, #57 // 1st phase
	shl v18.2d, v0.2d, #62
	eor v18.16b, v18.16b, v17.16b //
	shl v17.2d, v0.2d, #63
	eor v18.16b, v18.16b, v17.16b //
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor v18.16b, v18.16b, v1.16b
	ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
	ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
	ushr v18.2d, v0.2d, #1 // 2nd phase
	eor v2.16b, v2.16b, v0.16b
	eor v0.16b, v0.16b, v18.16b //
	ushr v18.2d, v18.2d, #6
	ushr v0.2d, v0.2d, #1 //
	eor v0.16b, v0.16b, v2.16b //
	eor v0.16b, v0.16b, v18.16b //
	subs x3, x3, #16
	bne Loop_neon
	rev64 v0.16b, v0.16b // byteswap Xi and write
	ext v0.16b, v0.16b, v0.16b, #8
	st1 {v0.16b}, [x0]
	ret

.section .rodata
.align 4
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
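// Loaded pairwise by gcm_gmult_neon/gcm_ghash_neon: v24 = {k48, k32},
// v25 = {k16, k0}.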
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
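// The .byte string above spells "GHASH for ARMv8, derived from ARMv4
// version by <appro@openssl.org>", NUL-terminated.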
.align 2
.align 2
#endif
#endif // !OPENSSL_NO_ASM