// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
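// Register usage (inferred from the register comments below, per AAPCS64):
// gcm_init_neon takes (Htable, H) in x0/x1, gcm_gmult_neon takes
// (Xi, Htable) in x0/x1, and gcm_ghash_neon takes (Xi, Htable, inp, len)
// in x0-x3.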

.globl _gcm_init_neon
.private_extern _gcm_init_neon
.align 4
_gcm_init_neon:
AARCH64_VALID_CALL_TARGET
// This function is adapted from gcm_init_v8. xC2 is t3.
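// In effect: the constant assembled in v16 (t0 = 0xc2...01) is the
// bit-reflected form of the GHASH reduction polynomial
// x^128 + x^7 + x^2 + x + 1.  H is shifted left by one bit and t0 is
// xored in when a bit is carried out, so Htable[0] receives the
// "twisted" H expected by gcm_gmult_neon and gcm_ghash_neon below.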
ld1 {v17.2d}, [x1] // load H
movi v19.16b, #0xe1
shl v19.2d, v19.2d, #57 // 0xc2.0
ext v3.16b, v17.16b, v17.16b, #8
ushr v18.2d, v19.2d, #63
dup v17.4s, v17.s[1]
ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
ushr v18.2d, v3.2d, #63
sshr v17.4s, v17.4s, #31 // broadcast carry bit
and v18.16b, v18.16b, v16.16b
shl v3.2d, v3.2d, #1
ext v18.16b, v18.16b, v18.16b, #8
and v16.16b, v16.16b, v17.16b
orr v3.16b, v3.16b, v18.16b // H<<<=1
eor v5.16b, v3.16b, v16.16b // twisted H
st1 {v5.2d}, [x0] // store Htable[0]
ret

.globl _gcm_gmult_neon
.private_extern _gcm_gmult_neon
.align 4
_gcm_gmult_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v3.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, Lmasks@PAGE // load constants
add x9, x9, Lmasks@PAGEOFF
ld1 {v24.2d, v25.2d}, [x9]
rev64 v3.16b, v3.16b // byteswap Xi
ext v3.16b, v3.16b, v3.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
mov x3, #16
b Lgmult_neon
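// Note: v5/v6 hold the two 64-bit halves of the twisted H from Htable[0]
// and v7 = v5 ^ v6 is the precomputed Karatsuba term.  With x3 = 16 the
// shared Lgmult_neon/Loop_neon body below runs exactly once, so a single
// gmult is simply a one-block ghash without the input load and xor.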

.globl _gcm_ghash_neon
.private_extern _gcm_ghash_neon
.align 4
_gcm_ghash_neon:
AARCH64_VALID_CALL_TARGET
ld1 {v0.16b}, [x0] // load Xi
ld1 {v5.1d}, [x1], #8 // load twisted H
ld1 {v6.1d}, [x1]
adrp x9, Lmasks@PAGE // load constants
add x9, x9, Lmasks@PAGEOFF
ld1 {v24.2d, v25.2d}, [x9]
rev64 v0.16b, v0.16b // byteswap Xi
ext v0.16b, v0.16b, v0.16b, #8
eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
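// Each Loop_neon iteration absorbs one 16-byte block of input:
// Xi = (Xi ^ inp) * H in GF(2^128), with x3 counting the remaining bytes
// (expected to be a multiple of 16).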
Loop_neon:
ld1 {v3.16b}, [x2], #16 // load inp
rev64 v3.16b, v3.16b // byteswap inp
ext v3.16b, v3.16b, v3.16b, #8
eor v3.16b, v3.16b, v0.16b // inp ^= Xi

Lgmult_neon:
// Split the input into v3 and v4. (The upper halves are unused,
// so it is okay to leave them alone.)
ins v4.d[0], v3.d[1]
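// The 128x128-bit product Xi*H is built Karatsuba-style from three
// 64x64-bit carry-less multiplies: v0 = v5*v3 (low halves),
// v1 = v7*(v3^v4) (the middle term from the xored halves) and
// v2 = v6*v4 (high halves), with the middle term fixed up afterwards
// ("Karatsuba post-processing").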
ext v16.8b, v5.8b, v5.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v0.8b, v3.8b, v3.8b, #1 // B1
pmull v0.8h, v5.8b, v0.8b // E = A*B1
ext v17.8b, v5.8b, v5.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v5.8b, v19.8b // G = A*B2
ext v18.8b, v5.8b, v5.8b, #3 // A3
eor v16.16b, v16.16b, v0.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v0.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v0.8h, v5.8b, v0.8b // I = A*B3

// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
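// Concretely: pmull on .8b operands yields eight independent 8x8-bit
// carry-less products, one per byte lane, so the 64x64-bit product is
// assembled from the lane-aligned diagonal D = A*B plus the byte-rotated
// cross terms L, M, N, K shifted left by 8, 16, 24 and 32 bits below.
// The k48/k32/k16 masks (loaded into v24/v25 from Lmasks) relocate the
// lanes that wrapped around the 8-byte rotation back into the low half,
// where those partial products belong.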
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v0.16b // N = I + J
pmull v19.8h, v5.8b, v19.8b // K = A*B4

// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v0.8h, v5.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v0.16b, v0.16b, v16.16b
eor v0.16b, v0.16b, v18.16b
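// v0 (Xl) now holds the full 128-bit carry-less product of the low halves.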

eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
ext v16.8b, v7.8b, v7.8b, #1 // A1
pmull v16.8h, v16.8b, v3.8b // F = A1*B
ext v1.8b, v3.8b, v3.8b, #1 // B1
pmull v1.8h, v7.8b, v1.8b // E = A*B1
ext v17.8b, v7.8b, v7.8b, #2 // A2
pmull v17.8h, v17.8b, v3.8b // H = A2*B
ext v19.8b, v3.8b, v3.8b, #2 // B2
pmull v19.8h, v7.8b, v19.8b // G = A*B2
ext v18.8b, v7.8b, v7.8b, #3 // A3
eor v16.16b, v16.16b, v1.16b // L = E + F
pmull v18.8h, v18.8b, v3.8b // J = A3*B
ext v1.8b, v3.8b, v3.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v1.8h, v7.8b, v1.8b // I = A*B3

// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v3.8b, v3.8b, #4 // B4
eor v18.16b, v18.16b, v1.16b // N = I + J
pmull v19.8h, v7.8b, v19.8b // K = A*B4

// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v1.8h, v7.8b, v3.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v1.16b, v1.16b, v16.16b
eor v1.16b, v1.16b, v18.16b
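// v1 (Xm) now holds the Karatsuba middle product of the xored halves.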

ext v16.8b, v6.8b, v6.8b, #1 // A1
pmull v16.8h, v16.8b, v4.8b // F = A1*B
ext v2.8b, v4.8b, v4.8b, #1 // B1
pmull v2.8h, v6.8b, v2.8b // E = A*B1
ext v17.8b, v6.8b, v6.8b, #2 // A2
pmull v17.8h, v17.8b, v4.8b // H = A2*B
ext v19.8b, v4.8b, v4.8b, #2 // B2
pmull v19.8h, v6.8b, v19.8b // G = A*B2
ext v18.8b, v6.8b, v6.8b, #3 // A3
eor v16.16b, v16.16b, v2.16b // L = E + F
pmull v18.8h, v18.8b, v4.8b // J = A3*B
ext v2.8b, v4.8b, v4.8b, #3 // B3
eor v17.16b, v17.16b, v19.16b // M = G + H
pmull v2.8h, v6.8b, v2.8b // I = A*B3

// Here we diverge from the 32-bit version. It computes the following
// (instructions reordered for clarity):
//
// veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
// vand $t0#hi, $t0#hi, $k48
// veor $t0#lo, $t0#lo, $t0#hi
//
// veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
// vand $t1#hi, $t1#hi, $k32
// veor $t1#lo, $t1#lo, $t1#hi
//
// veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
// vand $t2#hi, $t2#hi, $k16
// veor $t2#lo, $t2#lo, $t2#hi
//
// veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
// vmov.i64 $t3#hi, #0
//
// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
// upper halves of SIMD registers, so we must split each half into
// separate registers. To compensate, we pair computations up and
// parallelize.
ext v19.8b, v4.8b, v4.8b, #4 // B4
eor v18.16b, v18.16b, v2.16b // N = I + J
pmull v19.8h, v6.8b, v19.8b // K = A*B4

// This can probably be scheduled more efficiently. For now, we just
// pair up independent instructions.
zip1 v20.2d, v16.2d, v17.2d
zip1 v22.2d, v18.2d, v19.2d
zip2 v21.2d, v16.2d, v17.2d
zip2 v23.2d, v18.2d, v19.2d
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
and v21.16b, v21.16b, v24.16b
and v23.16b, v23.16b, v25.16b
eor v20.16b, v20.16b, v21.16b
eor v22.16b, v22.16b, v23.16b
zip1 v16.2d, v20.2d, v21.2d
zip1 v18.2d, v22.2d, v23.2d
zip2 v17.2d, v20.2d, v21.2d
zip2 v19.2d, v22.2d, v23.2d
ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
pmull v2.8h, v6.8b, v4.8b // D = A*B
ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
eor v16.16b, v16.16b, v17.16b
eor v18.16b, v18.16b, v19.16b
eor v2.16b, v2.16b, v16.16b
eor v2.16b, v2.16b, v18.16b
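// v2 (Xh) now holds the 128-bit carry-less product of the high halves.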

ext v16.16b, v0.16b, v2.16b, #8
eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
eor v1.16b, v1.16b, v2.16b
eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
// This is a no-op due to the ins instruction below.
// ins v2.d[0], v1.d[1]

// equivalent of reduction_avx from ghash-x86_64.pl
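// In essence the reduction folds the upper 128 bits of the product back
// into the lower 128 bits modulo the GHASH polynomial
// x^128 + x^7 + x^2 + x + 1.  With the bit-reflected representation used
// here, the x, x^2 and x^7 terms of the modulus become the left shifts by
// 63, 62 and 57 in the first phase and the (composed) right shifts by
// 1, 2 and 7 in the second.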
shl v17.2d, v0.2d, #57 // 1st phase
shl v18.2d, v0.2d, #62
eor v18.16b, v18.16b, v17.16b //
shl v17.2d, v0.2d, #63
eor v18.16b, v18.16b, v17.16b //
// Note Xm contains {Xl.d[1], Xh.d[0]}.
eor v18.16b, v18.16b, v1.16b
ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
ushr v18.2d, v0.2d, #1 // 2nd phase
eor v2.16b, v2.16b, v0.16b
eor v0.16b, v0.16b, v18.16b //
ushr v18.2d, v18.2d, #6
ushr v0.2d, v0.2d, #1 //
eor v0.16b, v0.16b, v2.16b //
eor v0.16b, v0.16b, v18.16b //

subs x3, x3, #16
bne Loop_neon

rev64 v0.16b, v0.16b // byteswap Xi and write
ext v0.16b, v0.16b, v0.16b, #8
st1 {v0.16b}, [x0]
ret

.section __TEXT,__const
.align 4
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
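// Loaded above as v24 = {k48, k32} and v25 = {k16, k0}.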

// The .byte string below spells "GHASH for ARMv8, derived from ARMv4
// version by <appro@openssl.org>" followed by a NUL terminator.
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM