// ghashv8-armx64.S — GHASH for AArch64 using the ARMv8 PMULL/PMULL2
// polynomial-multiply instructions (Apple/Mach-O symbol flavor).
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
// MemorySanitizer cannot instrument assembly, so force OPENSSL_NO_ASM
// (compiling this file to nothing) when MSan is enabled.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>   // optional symbol-prefixing for vendored builds
#endif
#include <openssl/arm_arch.h>               // AARCH64_VALID_CALL_TARGET, __ARM_MAX_ARCH__
#if __ARM_MAX_ARCH__>=7
.text
//----------------------------------------------------------------------
// void gcm_init_v8(u128 Htable[], const uint64_t H[2])
//   x0 = Htable (out): precomputed key material for gmult/ghash below
//   x1 = H      (in):  the GCM hash key
// Stores, in order: twisted H (Htable[0]), packed Karatsuba helper and
// H^2 (Htable[1..2]), then H^3, helper, H^4 (Htable[3..5]).
// "Twisting" (the H<<<=1 step below) folds the field constant into H so
// the reduction in the multiply routines needs only two PMULLs by 0xc2..
// Clobbers: v0-v7, v16-v22 (all caller-saved NEON regs); no stack use.
//----------------------------------------------------------------------
.globl _gcm_init_v8
.private_extern _gcm_init_v8
.align 4
_gcm_init_v8:
AARCH64_VALID_CALL_TARGET
	ld1 {v17.2d},[x1] //load input H
	movi v19.16b,#0xe1
	shl v19.2d,v19.2d,#57 //0xc2.0 — the GHASH reduction polynomial constant
	ext v3.16b,v17.16b,v17.16b,#8
	ushr v18.2d,v19.2d,#63
	dup v17.4s,v17.s[1]
	ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
	ushr v18.2d,v3.2d,#63
	sshr v17.4s,v17.4s,#31 //broadcast carry bit
	and v18.16b,v18.16b,v16.16b
	shl v3.2d,v3.2d,#1
	ext v18.16b,v18.16b,v18.16b,#8
	and v16.16b,v16.16b,v17.16b
	orr v3.16b,v3.16b,v18.16b //H<<<=1 (128-bit rotate-left by one)
	eor v20.16b,v3.16b,v16.16b //twisted H
	st1 {v20.2d},[x0],#16 //store Htable[0]

	//calculate H^2 = twisted H squared (Karatsuba: 3 PMULLs + reduction)
	ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
	pmull v0.1q,v20.1d,v20.1d
	eor v16.16b,v16.16b,v20.16b
	pmull2 v2.1q,v20.2d,v20.2d
	pmull v1.1q,v16.1d,v16.1d
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of modular reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v22.16b,v0.16b,v18.16b //v22 = H^2
	ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
	eor v17.16b,v17.16b,v22.16b
	ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
	st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]

	//calculate H^3 (= H*H^2) and H^4 (= H^2*H^2), interleaved two-at-a-time:
	//v0/v2/v1 accumulate H^3, v5/v7/v6 accumulate H^4
	pmull v0.1q,v20.1d, v22.1d
	pmull v5.1q,v22.1d,v22.1d
	pmull2 v2.1q,v20.2d, v22.2d
	pmull2 v7.1q,v22.2d,v22.2d
	pmull v1.1q,v16.1d,v17.1d
	pmull v6.1q,v17.1d,v17.1d
	ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	ext v17.16b,v5.16b,v7.16b,#8
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v16.16b
	eor v4.16b,v5.16b,v7.16b
	eor v6.16b,v6.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	eor v6.16b,v6.16b,v4.16b
	pmull v4.1q,v5.1d,v19.1d
	ins v2.d[0],v1.d[1]
	ins v7.d[0],v6.d[1]
	ins v1.d[1],v0.d[0]
	ins v6.d[1],v5.d[0]
	eor v0.16b,v1.16b,v18.16b
	eor v5.16b,v6.16b,v4.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	ext v4.16b,v5.16b,v5.16b,#8
	pmull v0.1q,v0.1d,v19.1d
	pmull v5.1q,v5.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v4.16b,v4.16b,v7.16b
	eor v20.16b, v0.16b,v18.16b //H^3
	eor v22.16b,v5.16b,v4.16b //H^4
	ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
	ext v17.16b,v22.16b,v22.16b,#8
	eor v16.16b,v16.16b,v20.16b
	eor v17.16b,v17.16b,v22.16b
	ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
	st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
	ret
//----------------------------------------------------------------------
// void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[])
//   x0 = Xi     (in/out): hash state; multiplied by H in GF(2^128)
//   x1 = Htable (in):     table written by gcm_init_v8 (uses Htable[0..1])
// Single-block GHASH multiply: one Karatsuba multiply by twisted H
// followed by the two-phase reduction with the 0xc2.. constant.
// Clobbers: v0-v3, v16-v21 (caller-saved NEON regs); no stack use.
//----------------------------------------------------------------------
.globl _gcm_gmult_v8
.private_extern _gcm_gmult_v8
.align 4
_gcm_gmult_v8:
AARCH64_VALID_CALL_TARGET
	ld1 {v17.2d},[x0] //load Xi
	movi v19.16b,#0xe1
	ld1 {v20.2d,v21.2d},[x1] //load twisted H and packed Karatsuba helper
	shl v19.2d,v19.2d,#57 //compose 0xc2.0 reduction constant
#ifndef __ARMEB__
	rev64 v17.16b,v17.16b //byte-swap Xi on little-endian targets
#endif
	ext v3.16b,v17.16b,v17.16b,#8
	pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
	eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
	pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
	pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b //swap back to memory byte order
#endif
	ext v0.16b,v0.16b,v0.16b,#8
	st1 {v0.2d},[x0] //write out Xi
	ret
//----------------------------------------------------------------------
// void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[], const uint8_t *inp,
//                   size_t len)
//   x0 = Xi (in/out), x1 = Htable, x2 = inp, x3 = len (bytes, multiple of 16
//        as far as this code's arithmetic shows — TODO confirm caller contract)
// Hashes len bytes of input into Xi. Inputs of 64+ bytes are dispatched to
// the 4-blocks-at-a-time path below; otherwise a modulo-scheduled 2x loop
// (powers H and H^2) handles pairs of blocks, with a single-block odd tail.
// Clobbers: v0-v7, v16-v22, x12 (all caller-saved); no stack use.
//----------------------------------------------------------------------
.globl _gcm_ghash_v8
.private_extern _gcm_ghash_v8
.align 4
_gcm_ghash_v8:
AARCH64_VALID_CALL_TARGET
	cmp x3,#64
	b.hs Lgcm_ghash_v8_4x //>=64 bytes: take the 4x stitched path
	ld1 {v0.2d},[x0] //load [rotated] Xi
	//"[rotated]" means that
	//loaded value would have
	//to be rotated in order to
	//make it appear as in
	//algorithm specification
	subs x3,x3,#32 //see if x3 is 32 or larger
	mov x12,#16 //x12 is used as post-
	//increment for input pointer;
	//as loop is modulo-scheduled
	//x12 is zeroed just in time
	//to preclude overstepping
	//inp[len], which means that
	//last block[s] are actually
	//loaded twice, but last
	//copy is not processed
	ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
	movi v19.16b,#0xe1
	ld1 {v22.2d},[x1]
	csel x12,xzr,x12,eq //is it time to zero x12?
	ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
	ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
	shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64 v16.16b,v16.16b
	rev64 v0.16b,v0.16b
#endif
	ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
	b.lo Lodd_tail_v8 //x3 was less than 32: single block only
	ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
#ifndef __ARMEB__
	rev64 v17.16b,v17.16b
#endif
	ext v7.16b,v17.16b,v17.16b,#8
	eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
	pmull v4.1q,v20.1d,v7.1d //H·Ii+1 (started early; folded in inside the loop)
	eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
	pmull2 v6.1q,v20.2d,v7.2d
	b Loop_mod2x_v8

.align 4
// Main loop: processes 32 bytes/iteration using (Xi+I[i])·H^2 + I[i+1]·H,
// software-pipelined so the next pair's loads/multiplies overlap the
// current pair's reduction.
Loop_mod2x_v8:
	ext v18.16b,v3.16b,v3.16b,#8
	subs x3,x3,#32 //is there more data?
	pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
	csel x12,xzr,x12,lo //is it time to zero x12?

	pmull v5.1q,v21.1d,v17.1d
	eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
	pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
	eor v0.16b,v0.16b,v4.16b //accumulate
	pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]

	eor v2.16b,v2.16b,v6.16b
	csel x12,xzr,x12,eq //is it time to zero x12?
	eor v1.16b,v1.16b,v5.16b

	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64 v16.16b,v16.16b
#endif
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
#ifndef __ARMEB__
	rev64 v17.16b,v17.16b
#endif
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	ext v7.16b,v17.16b,v17.16b,#8
	ext v3.16b,v16.16b,v16.16b,#8
	eor v0.16b,v1.16b,v18.16b
	pmull v4.1q,v20.1d,v7.1d //H·Ii+1
	eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early

	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v3.16b,v3.16b,v18.16b
	eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
	eor v3.16b,v3.16b,v0.16b
	pmull2 v6.1q,v20.2d,v7.2d
	b.hs Loop_mod2x_v8 //there was at least 32 more bytes

	//fell through: undo the speculative accumulate done "early" above
	eor v2.16b,v2.16b,v18.16b
	ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
	adds x3,x3,#32 //re-construct x3
	eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
	b.eq Ldone_v8 //is x3 zero?

// One remaining 16-byte block: plain single multiply by twisted H.
Lodd_tail_v8:
	ext v18.16b,v0.16b,v0.16b,#8
	eor v3.16b,v3.16b,v0.16b //inp^=Xi
	eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi

	pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
	eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
	pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
	pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b
	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b
	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b //back to memory byte order
#endif
	ext v0.16b,v0.16b,v0.16b,#8
	st1 {v0.2d},[x0] //write out Xi
	ret
//----------------------------------------------------------------------
// 4x-stitched GHASH path, entered from gcm_ghash_v8 when len >= 64.
// Same register contract as gcm_ghash_v8 (x0=Xi, x1=Htable, x2=inp,
// x3=len); processes 4 blocks per iteration using powers H..H^4
// (Htable[0..5]), accumulating the three Karatsuba partial products for
// I[i+1..i+3] in v29/v30/v31 while (Xi+I[i])·H^4 is reduced.
// Tails of 48/32/16 bytes are handled by Lthree/Ltwo/Lone.
// Clobbers: v0-v7, v16-v31 including callee-saved v8-v15? — no: only
// v0-v7 and v16-v31 are touched, both caller-saved under AAPCS64.
//----------------------------------------------------------------------
.align 4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
	ld1 {v0.2d},[x0] //load [rotated] Xi
	ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
	movi v19.16b,#0xe1
	ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
	shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant

	ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 //first 4 input blocks
#ifndef __ARMEB__
	rev64 v0.16b,v0.16b
	rev64 v5.16b,v5.16b
	rev64 v6.16b,v6.16b
	rev64 v7.16b,v7.16b
	rev64 v4.16b,v4.16b
#endif
	ext v25.16b,v7.16b,v7.16b,#8
	ext v24.16b,v6.16b,v6.16b,#8
	ext v23.16b,v5.16b,v5.16b,#8

	//pre-compute partial products for blocks 1..3 (folded in inside Loop4x)
	pmull v29.1q,v20.1d,v25.1d //H·Ii+3
	eor v7.16b,v7.16b,v25.16b
	pmull2 v31.1q,v20.2d,v25.2d
	pmull v30.1q,v21.1d,v7.1d
	pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
	eor v6.16b,v6.16b,v24.16b
	pmull2 v24.1q,v22.2d,v24.2d
	pmull2 v6.1q,v21.2d,v6.2d
	eor v29.16b,v29.16b,v16.16b
	eor v31.16b,v31.16b,v24.16b
	eor v30.16b,v30.16b,v6.16b
	pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
	eor v5.16b,v5.16b,v23.16b
	pmull2 v23.1q,v26.2d,v23.2d
	pmull v5.1q,v27.1d,v5.1d
	eor v29.16b,v29.16b,v7.16b
	eor v31.16b,v31.16b,v23.16b
	eor v30.16b,v30.16b,v5.16b

	subs x3,x3,#128 //need a full next batch of 4 to stay in Loop4x
	b.lo Ltail4x

	b Loop4x

.align 4
// Each iteration: reduce (Xi+I[i])·H^4 + accumulated I[i+1..3] products,
// while loading and pre-multiplying the next 4 blocks.
Loop4x:
	eor v16.16b,v4.16b,v0.16b
	ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
	rev64 v5.16b,v5.16b
	rev64 v6.16b,v6.16b
	rev64 v7.16b,v7.16b
	rev64 v4.16b,v4.16b
#endif

	pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v28.2d,v3.2d
	ext v25.16b,v7.16b,v7.16b,#8
	pmull2 v1.1q,v27.2d,v16.2d

	eor v0.16b,v0.16b,v29.16b //fold in accumulated lower blocks
	eor v2.16b,v2.16b,v31.16b
	ext v24.16b,v6.16b,v6.16b,#8
	eor v1.16b,v1.16b,v30.16b
	ext v23.16b,v5.16b,v5.16b,#8

	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	pmull v29.1q,v20.1d,v25.1d //H·Ii+3
	eor v7.16b,v7.16b,v25.16b
	eor v1.16b,v1.16b,v17.16b
	pmull2 v31.1q,v20.2d,v25.2d
	eor v1.16b,v1.16b,v18.16b
	pmull v30.1q,v21.1d,v7.1d

	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
	eor v6.16b,v6.16b,v24.16b
	pmull2 v24.1q,v22.2d,v24.2d
	eor v0.16b,v1.16b,v18.16b
	pmull2 v6.1q,v21.2d,v6.2d

	eor v29.16b,v29.16b,v16.16b
	eor v31.16b,v31.16b,v24.16b
	eor v30.16b,v30.16b,v6.16b

	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
	eor v5.16b,v5.16b,v23.16b
	eor v18.16b,v18.16b,v2.16b
	pmull2 v23.1q,v26.2d,v23.2d
	pmull v5.1q,v27.1d,v5.1d

	eor v0.16b,v0.16b,v18.16b
	eor v29.16b,v29.16b,v7.16b
	eor v31.16b,v31.16b,v23.16b
	ext v0.16b,v0.16b,v0.16b,#8
	eor v30.16b,v30.16b,v5.16b

	subs x3,x3,#64
	b.hs Loop4x

// Finish the last in-flight batch: multiply (Xi+I[i]) by H^4 and fold in
// the accumulated I[i+1..3] products, then dispatch on remaining length.
Ltail4x:
	eor v16.16b,v4.16b,v0.16b
	ext v3.16b,v16.16b,v16.16b,#8

	pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v28.2d,v3.2d
	pmull2 v1.1q,v27.2d,v16.2d

	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	eor v1.16b,v1.16b,v30.16b

	adds x3,x3,#64
	b.eq Ldone4x //no leftover input

	cmp x3,#32
	b.lo Lone //16 bytes left
	b.eq Ltwo //32 bytes left

// 48 bytes left: reduce current product, then 3 more blocks via H..H^3.
Lthree:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v4.2d,v5.2d,v6.2d},[x2]
	eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64 v5.16b,v5.16b
	rev64 v6.16b,v6.16b
	rev64 v4.16b,v4.16b
#endif

	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	ext v24.16b,v6.16b,v6.16b,#8
	ext v23.16b,v5.16b,v5.16b,#8
	eor v0.16b,v1.16b,v18.16b

	pmull v29.1q,v20.1d,v24.1d //H·Ii+2
	eor v6.16b,v6.16b,v24.16b

	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	pmull2 v31.1q,v20.2d,v24.2d
	pmull v30.1q,v21.1d,v6.1d
	eor v0.16b,v0.16b,v18.16b
	pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
	eor v5.16b,v5.16b,v23.16b
	ext v0.16b,v0.16b,v0.16b,#8

	pmull2 v23.1q,v22.2d,v23.2d
	eor v16.16b,v4.16b,v0.16b
	pmull2 v5.1q,v21.2d,v5.2d
	ext v3.16b,v16.16b,v16.16b,#8

	eor v29.16b,v29.16b,v7.16b
	eor v31.16b,v31.16b,v23.16b
	eor v30.16b,v30.16b,v5.16b

	pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v26.2d,v3.2d
	pmull v1.1q,v27.1d,v16.1d

	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	eor v1.16b,v1.16b,v30.16b
	b Ldone4x

.align 4
// 32 bytes left: reduce current product, then 2 more blocks via H, H^2.
Ltwo:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v4.2d,v5.2d},[x2]
	eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64 v5.16b,v5.16b
	rev64 v4.16b,v4.16b
#endif

	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	ext v23.16b,v5.16b,v5.16b,#8
	eor v0.16b,v1.16b,v18.16b

	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
	ext v0.16b,v0.16b,v0.16b,#8

	pmull v29.1q,v20.1d,v23.1d //H·Ii+1
	eor v5.16b,v5.16b,v23.16b

	eor v16.16b,v4.16b,v0.16b
	ext v3.16b,v16.16b,v16.16b,#8

	pmull2 v31.1q,v20.2d,v23.2d
	pmull v30.1q,v21.1d,v5.1d

	pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v22.2d,v3.2d
	pmull2 v1.1q,v21.2d,v16.2d

	eor v0.16b,v0.16b,v29.16b
	eor v2.16b,v2.16b,v31.16b
	eor v1.16b,v1.16b,v30.16b
	b Ldone4x

.align 4
// 16 bytes left: reduce current product, then one more block via H.
Lone:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	ld1 {v4.2d},[x2]
	eor v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64 v4.16b,v4.16b
#endif

	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b

	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
	ext v0.16b,v0.16b,v0.16b,#8

	eor v16.16b,v4.16b,v0.16b
	ext v3.16b,v16.16b,v16.16b,#8

	pmull v0.1q,v20.1d,v3.1d //H·(Xi+Ii)
	eor v16.16b,v16.16b,v3.16b
	pmull2 v2.1q,v20.2d,v3.2d
	pmull v1.1q,v21.1d,v16.1d

// Final Karatsuba recombination + reduction, then store Xi.
Ldone4x:
	ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
	eor v18.16b,v0.16b,v2.16b
	eor v1.16b,v1.16b,v17.16b
	eor v1.16b,v1.16b,v18.16b

	pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
	ins v2.d[0],v1.d[1]
	ins v1.d[1],v0.d[0]
	eor v0.16b,v1.16b,v18.16b

	ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
	pmull v0.1q,v0.1d,v19.1d
	eor v18.16b,v18.16b,v2.16b
	eor v0.16b,v0.16b,v18.16b
	ext v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
	rev64 v0.16b,v0.16b
#endif
	st1 {v0.2d},[x0] //write out Xi
	ret
// Identification banner: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>\0"
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // __ARM_MAX_ARCH__>=7
#endif // !OPENSSL_NO_ASM