// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.arch armv8-a+crypto
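// gcm_init_v8 precomputes the GHASH key schedule. Judging by the loads and
// stores below, x0 receives the output Htable and x1 points to the 128-bit
// hash key H: the routine derives the "twisted" H, then H^2, H^3 and H^4,
// interleaved with their Karatsuba pre-processed halves, into Htable[0..5].
// (The argument roles are inferred from the inline comments, not stated in
// this generated file; the C prototype is usually along the lines of
// gcm_init_v8(u128 Htable[], const uint64_t H[2]).)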
.globl gcm_init_v8
.def gcm_init_v8
.type 32
.endef
.align 4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1 {v17.2d},[x1] //load input H
	movi v19.16b,#0xe1
	shl v19.2d,v19.2d,#57 //0xc2.0
	ext v3.16b,v17.16b,v17.16b,#8
	ushr v18.2d,v19.2d,#63
	dup v17.4s,v17.s[1]
	ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
	ushr v18.2d,v3.2d,#63
	sshr v17.4s,v17.4s,#31 //broadcast carry bit
	and v18.16b,v18.16b,v16.16b
	shl v3.2d,v3.2d,#1
	ext v18.16b,v18.16b,v18.16b,#8
	and v16.16b,v16.16b,v17.16b
	orr v3.16b,v3.16b,v18.16b //H<<<=1
	eor v20.16b,v3.16b,v16.16b //twisted H
	st1 {v20.2d},[x0],#16 //store Htable[0]
	//calculate H^2
	ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
	pmull v0.1q,v20.1d,v20.1d
	eor v16.16b,v16.16b,v20.16b
	pmull2 v2.1q,v20.2d,v20.2d
	pmull v1.1q,v16.1d,v16.1d
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v22.16b,v0.16b,v18.16b
	ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
	eor v17.16b,v17.16b,v22.16b
	ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
	st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
	//calculate H^3 and H^4
	pmull v0.1q,v20.1d, v22.1d
	pmull v5.1q,v22.1d,v22.1d
	pmull2 v2.1q,v20.2d, v22.2d
	pmull2 v7.1q,v22.2d,v22.2d
	pmull v1.1q,v16.1d,v17.1d
	pmull v6.1q,v17.1d,v17.1d
	ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	ext v17.16b,v5.16b,v7.16b,#8
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v16.16b
	eor v4.16b,v5.16b,v7.16b
	eor v6.16b,v6.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase
	eor v6.16b,v6.16b,v4.16b
	pmull v4.1q,v5.1d,v19.1d
	ins v2.d[0],v1.d[1]
	ins v7.d[0],v6.d[1]
	ins v1.d[1],v0.d[0]
	ins v6.d[1],v5.d[0]
	eor v0.16b,v1.16b,v18.16b
	eor v5.16b,v6.16b,v4.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
	ext v4.16b,v5.16b,v5.16b,#8
	pmull v0.1q,v0.1d,v19.1d
	pmull v5.1q,v5.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v4.16b,v4.16b,v7.16b
	eor v20.16b, v0.16b,v18.16b //H^3
	eor v22.16b,v5.16b,v4.16b //H^4
	ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
	ext v17.16b,v22.16b,v22.16b,#8
	eor v16.16b,v16.16b,v20.16b
	eor v17.16b,v17.16b,v22.16b
	ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
	st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
	ret
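// gcm_gmult_v8 multiplies the current hash value Xi by H in GF(2^128).
// Per the code below, x0 points to Xi (read and written back in place) and
// x1 to the Htable produced by gcm_init_v8; the product is formed with a
// Karatsuba-style PMULL/PMULL2 multiply and then reduced modulo the GHASH
// polynomial using the 0xc2... constant in v19. (The roles of x0/x1 are
// inferred from the "load Xi" and "write out Xi" comments.)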
.globl gcm_gmult_v8
.def gcm_gmult_v8
.type 32
.endef
.align 4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1 {v17.2d},[x0] //load Xi
	movi v19.16b,#0xe1
	ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
	shl v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64 v17.16b,v17.16b
#endif
	ext v3.16b,v17.16b,v17.16b,#8
	pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
	eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
	pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
	pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b
#endif
	ext v0.16b,v0.16b,v0.16b,#8
	st1 {v0.2d},[x0] //write out Xi
	ret
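// gcm_ghash_v8 folds a whole buffer of data into Xi. From the register use
// below: x0 = Xi (in/out), x1 = Htable, x2 = input pointer, x3 = byte count
// (consumed in 16-byte blocks). Buffers of 64 bytes or more are diverted to
// the 4-block routine at Lgcm_ghash_v8_4x; shorter input is handled here two
// blocks at a time, with a single-block odd tail. (Argument roles are
// inferred from the code, not stated in this file.)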
.globl gcm_ghash_v8
.def gcm_ghash_v8
.type 32
.endef
.align 4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp x3,#64
	b.hs Lgcm_ghash_v8_4x
	ld1 {v0.2d},[x0] //load [rotated] Xi
	//"[rotated]" means that
	//loaded value would have
	//to be rotated in order to
	//make it appear as in
	//algorithm specification
	subs x3,x3,#32 //see if x3 is 32 or larger
	mov x12,#16 //x12 is used as post-
	//increment for input pointer;
	//as loop is modulo-scheduled
	//x12 is zeroed just in time
	//to preclude overstepping
	//inp[len], which means that
	//last block[s] are actually
	//loaded twice, but last
	//copy is not processed
	ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
	movi v19.16b,#0xe1
	ld1 {v22.2d},[x1]
	csel x12,xzr,x12,eq //is it time to zero x12?
	ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
	ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
	shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64 v16.16b,v16.16b
	rev64 v0.16b,v0.16b
#endif
	ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
	b.lo Lodd_tail_v8 //x3 was less than 32
	ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
	rev64 v17.16b,v17.16b
#endif
	ext v7.16b,v17.16b,v17.16b,#8
	eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
	pmull v4.1q,v20.1d,v7.1d //H·Ii+1
	eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
	pmull2 v6.1q,v20.2d,v7.2d
	b Loop_mod2x_v8
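// Loop_mod2x_v8 processes two 16-byte blocks per iteration: the block that
// has been folded into Xi is multiplied by H^2 and the following block by H,
// and the two products are summed before a single reduction. The loop is
// modulo-scheduled (see the note on x12 above), so the next pair's partial
// products are started while the current reduction is still in flight.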
.align 4
Loop_mod2x_v8:
	ext v18.16b,v3.16b,v3.16b,#8
	subs x3,x3,#32 //is there more data?
	pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
	csel x12,xzr,x12,lo //is it time to zero x12?
	pmull v5.1q,v21.1d,v17.1d
	eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
	pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
	eor v0.16b,v0.16b,v4.16b //accumulate
	pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
	eor v2.16b,v2.16b,v6.16b
	csel x12,xzr,x12,eq //is it time to zero x12?
	eor v1.16b,v1.16b,v5.16b
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64 v16.16b,v16.16b
#endif
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
	rev64 v17.16b,v17.16b
#endif
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	ext v7.16b,v17.16b,v17.16b,#8
	ext v3.16b,v16.16b,v16.16b,#8
	eor v0.16b,v1.16b,v18.16b
	pmull v4.1q,v20.1d,v7.1d //H·Ii+1
	eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v3.16b,v3.16b,v18.16b
	eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
	eor v3.16b,v3.16b,v0.16b
	pmull2 v6.1q,v20.2d,v7.2d
	b.hs Loop_mod2x_v8 //there was at least 32 more bytes
	eor v2.16b,v2.16b,v18.16b
	ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
	adds x3,x3,#32 //re-construct x3
	eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
	b.eq Ldone_v8 //is x3 zero?
Lodd_tail_v8:
	ext v18.16b,v0.16b,v0.16b,#8
	eor v3.16b,v3.16b,v0.16b //inp^=Xi
	eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
	pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
	eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
	pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
	pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
Ldone_v8:
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b
#endif
	ext v0.16b,v0.16b,v0.16b,#8
	st1 {v0.2d},[x0] //write out Xi
	ret
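// gcm_ghash_v8_4x is the wide path taken when at least 64 bytes remain.
// It keeps the twisted powers H..H^4 (and their packed Karatsuba halves) in
// v20-v22/v26-v28 and hashes four blocks per iteration, deferring the
// reduction until the four partial products have been accumulated.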
.def gcm_ghash_v8_4x
.type 32
.endef
.align 4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
	ld1 {v0.2d},[x0] //load [rotated] Xi
	ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
	movi v19.16b,#0xe1
	ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
	shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
	ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b
	rev64 v5.16b,v5.16b
	rev64 v6.16b,v6.16b
	rev64 v7.16b,v7.16b
	rev64 v4.16b,v4.16b
#endif
	ext v25.16b,v7.16b,v7.16b,#8
	ext v24.16b,v6.16b,v6.16b,#8
	ext v23.16b,v5.16b,v5.16b,#8
	pmull v29.1q,v20.1d,v25.1d //H·Ii+3
	eor v7.16b,v7.16b,v25.16b
	pmull2 v31.1q,v20.2d,v25.2d
	pmull v30.1q,v21.1d,v7.1d
	pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
	eor v6.16b,v6.16b,v24.16b
	pmull2 v24.1q,v22.2d,v24.2d
	pmull2 v6.1q,v21.2d,v6.2d
	eor v29.16b,v29.16b,v16.16b
	eor v31.16b,v31.16b,v24.16b
	eor v30.16b,v30.16b,v6.16b
	pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
	eor v5.16b,v5.16b,v23.16b
	pmull2 v23.1q,v26.2d,v23.2d
	pmull v5.1q,v27.1d,v5.1d
	eor v29.16b,v29.16b,v7.16b
	eor v31.16b,v31.16b,v23.16b
	eor v30.16b,v30.16b,v5.16b
	subs x3,x3,#128
	b.lo Ltail4x
	b Loop4x
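// Loop4x consumes 64 bytes per iteration: the running Xi is folded into the
// oldest block and multiplied by H^4, the three newer blocks by H^3, H^2 and
// H respectively, and the sum is reduced once per iteration while the next
// four blocks' partial products are already being computed.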
.align 4
Loop4x:
	eor v16.16b,v4.16b,v0.16b
	ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
	rev64 v5.16b,v5.16b
	rev64 v6.16b,v6.16b
	rev64 v7.16b,v7.16b
	rev64 v4.16b,v4.16b
#endif
	pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v28.2d,v3.2d
	ext v25.16b,v7.16b,v7.16b,#8
	pmull2 v1.1q,v27.2d,v16.2d
	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	ext v24.16b,v6.16b,v6.16b,#8
	eor v1.16b,v1.16b,v30.16b
	ext v23.16b,v5.16b,v5.16b,#8
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	pmull v29.1q,v20.1d,v25.1d //H·Ii+3
	eor v7.16b,v7.16b,v25.16b
	eor v1.16b,v1.16b,v17.16b
	pmull2 v31.1q,v20.2d,v25.2d
	eor v1.16b,v1.16b,v18.16b
	pmull v30.1q,v21.1d,v7.1d
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
	eor v6.16b,v6.16b,v24.16b
	pmull2 v24.1q,v22.2d,v24.2d
	eor v0.16b,v1.16b,v18.16b
	pmull2 v6.1q,v21.2d,v6.2d
	eor v29.16b,v29.16b,v16.16b
	eor v31.16b,v31.16b,v24.16b
	eor v30.16b,v30.16b,v6.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
	eor v5.16b,v5.16b,v23.16b
	eor v18.16b,v18.16b,v2.16b
	pmull2 v23.1q,v26.2d,v23.2d
	pmull v5.1q,v27.1d,v5.1d
	eor v0.16b,v0.16b,v18.16b
	eor v29.16b,v29.16b,v7.16b
	eor v31.16b,v31.16b,v23.16b
	ext v0.16b,v0.16b,v0.16b,#8
	eor v30.16b,v30.16b,v5.16b
	subs x3,x3,#64
	b.hs Loop4x
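// Ltail4x finishes the last full group of four blocks: v29-v31 already hold
// the partial products for three of them, and the running Xi is folded into
// the remaining block and multiplied by H^4 here. Afterwards x3 tells how
// many bytes are still unprocessed (0, 16, 32 or 48), handled by Ldone4x,
// Lone, Ltwo or Lthree respectively.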
Ltail4x:
	eor v16.16b,v4.16b,v0.16b
	ext v3.16b,v16.16b,v16.16b,#8
	pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v28.2d,v3.2d
	pmull2 v1.1q,v27.2d,v16.2d
	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	eor v1.16b,v1.16b,v30.16b
	adds x3,x3,#64
	b.eq Ldone4x
	cmp x3,#32
	b.lo Lone
	b.eq Ltwo
Lthree:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v4.2d,v5.2d,v6.2d},[x2]
	eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64 v5.16b,v5.16b
	rev64 v6.16b,v6.16b
	rev64 v4.16b,v4.16b
#endif
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	ext v24.16b,v6.16b,v6.16b,#8
	ext v23.16b,v5.16b,v5.16b,#8
	eor v0.16b,v1.16b,v18.16b
	pmull v29.1q,v20.1d,v24.1d //H·Ii+2
	eor v6.16b,v6.16b,v24.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	pmull2 v31.1q,v20.2d,v24.2d
	pmull v30.1q,v21.1d,v6.1d
	eor v0.16b,v0.16b,v18.16b
	pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
	eor v5.16b,v5.16b,v23.16b
	ext v0.16b,v0.16b,v0.16b,#8
	pmull2 v23.1q,v22.2d,v23.2d
	eor v16.16b,v4.16b,v0.16b
	pmull2 v5.1q,v21.2d,v5.2d
	ext v3.16b,v16.16b,v16.16b,#8
	eor v29.16b,v29.16b,v7.16b
	eor v31.16b,v31.16b,v23.16b
	eor v30.16b,v30.16b,v5.16b
	pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v26.2d,v3.2d
	pmull v1.1q,v27.1d,v16.1d
	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	eor v1.16b,v1.16b,v30.16b
	b Ldone4x
.align 4
Ltwo:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v4.2d,v5.2d},[x2]
	eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64 v5.16b,v5.16b
	rev64 v4.16b,v4.16b
#endif
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	ext v23.16b,v5.16b,v5.16b,#8
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
	ext v0.16b,v0.16b,v0.16b,#8
	pmull v29.1q,v20.1d,v23.1d //H·Ii+1
	eor v5.16b,v5.16b,v23.16b
	eor v16.16b,v4.16b,v0.16b
	ext v3.16b,v16.16b,v16.16b,#8
	pmull2 v31.1q,v20.2d,v23.2d
	pmull v30.1q,v21.1d,v5.1d
	pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v22.2d,v3.2d
	pmull2 v1.1q,v21.2d,v16.2d
	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	eor v1.16b,v1.16b,v30.16b
	b Ldone4x
.align 4
Lone:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v4.2d},[x2]
	eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64 v4.16b,v4.16b
#endif
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
	ext v0.16b,v0.16b,v0.16b,#8
	eor v16.16b,v4.16b,v0.16b
	ext v3.16b,v16.16b,v16.16b,#8
	pmull v0.1q,v20.1d,v3.1d
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v20.2d,v3.2d
	pmull v1.1q,v21.1d,v16.1d
Ldone4x:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
	ext v0.16b,v0.16b,v0.16b,#8
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b
#endif
	st1 {v0.2d},[x0] //write out Xi
	ret
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif
#endif
#endif // !OPENSSL_NO_ASM