chacha-armv8.S 40 KB


  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(__aarch64__)
  11. #if defined(BORINGSSL_PREFIX)
  12. #include <boringssl_prefix_symbols_asm.h>
  13. #endif
  14. #include <openssl/arm_arch.h>
  15. .hidden OPENSSL_armcap_P
  16. .section .rodata
  17. .align 5
  // ChaCha constant words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 packed two per quad
  // (low word first); as little-endian words they spell the ASCII string "expand 32-byte k".
  18. .Lsigma:
  19. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  // Four 32-bit words {1,0,0,0}, placed 16 bytes after .Lsigma. The NEON code reaches it by
  // post-incrementing the .Lsigma pointer and loads it into v31 as the block-counter step.
  20. .Lone:
  21. .long 1,0,0,0
  // NUL-terminated ASCII identification string: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  22. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  23. .align 2
  24. .text
  25. .globl ChaCha20_ctr32
  26. .hidden ChaCha20_ctr32
  27. .type ChaCha20_ctr32,%function
  28. .align 5
  // ChaCha20_ctr32: XORs the input with ChaCha20 keystream, 64 bytes per outer-loop pass.
  // Register arguments, as used by the code below:
  //   x0 = output pointer, x1 = input pointer, x2 = length in bytes,
  //   x3 = pointer to the 32-byte key, x4 = pointer to the 16-byte counter block.
  // NOTE(review): presumably matches the C prototype
  //   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
  //                       const uint32_t key[8], const uint32_t counter[4])
  // -- confirm against the declaring header.
  // Lengths of 192 bytes or more are handed to ChaCha20_neon when the ARMV7_NEON bit is set
  // in OPENSSL_armcap_P.
  // Scalar state layout: sigma in x22/x23, key in x24-x27, counter block in x28/x30 (x30 is
  // reused as data once the link register has been saved). The 16 working words live in
  // w5-w17 and w19-w21; x18 is never touched.
  29. ChaCha20_ctr32:
  30. AARCH64_VALID_CALL_TARGET
  31. cbz x2,.Labort // zero length: return without touching anything
  32. #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
  33. adrp x5,:pg_hi21_nc:OPENSSL_armcap_P
  34. #else
  35. adrp x5,OPENSSL_armcap_P
  36. #endif
  37. cmp x2,#192
  38. b.lo .Lshort // fewer than 192 bytes: stay on the scalar path
  39. ldr w17,[x5,:lo12:OPENSSL_armcap_P]
  40. tst w17,#ARMV7_NEON
  41. b.ne ChaCha20_neon // NEON bit set: continue in the vector implementation (x0-x4 unchanged)
  42. .Lshort:
  43. AARCH64_SIGN_LINK_REGISTER
  44. stp x29,x30,[sp,#-96]! // 96-byte frame: x29/x30 plus callee-saved x19-x28
  45. add x29,sp,#0
  46. adrp x5,.Lsigma
  47. add x5,x5,:lo12:.Lsigma
  48. stp x19,x20,[sp,#16]
  49. stp x21,x22,[sp,#32]
  50. stp x23,x24,[sp,#48]
  51. stp x25,x26,[sp,#64]
  52. stp x27,x28,[sp,#80]
  53. sub sp,sp,#64 // 64-byte scratch block, written only by the byte-wise tail
  54. ldp x22,x23,[x5] // load sigma
  55. ldp x24,x25,[x3] // load key
  56. ldp x26,x27,[x3,#16]
  57. ldp x28,x30,[x4] // load counter
  // Big-endian builds only: swap the two 32-bit halves of every loaded 64-bit register.
  58. #ifdef __ARMEB__
  59. ror x24,x24,#32
  60. ror x25,x25,#32
  61. ror x26,x26,#32
  62. ror x27,x27,#32
  63. ror x28,x28,#32
  64. ror x30,x30,#32
  65. #endif
  // One pass of .Loop_outer produces one 64-byte keystream block.
  66. .Loop_outer:
  67. mov w5,w22 // unpack key block
  68. lsr x6,x22,#32
  69. mov w7,w23
  70. lsr x8,x23,#32
  71. mov w9,w24
  72. lsr x10,x24,#32
  73. mov w11,w25
  74. lsr x12,x25,#32
  75. mov w13,w26
  76. lsr x14,x26,#32
  77. mov w15,w27
  78. lsr x16,x27,#32
  79. mov w17,w28
  80. lsr x19,x28,#32
  81. mov w20,w30
  82. lsr x21,x30,#32
  83. mov x4,#10 // 10 passes of .Loop, each a column round plus a diagonal round
  84. subs x2,x2,#64 // flags stay live until b.lo .Ltail / b.hi .Loop_outer; nothing in between sets them
  85. .Loop:
  86. sub x4,x4,#1
  // Column round: quarter-rounds on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21).
  // ror by #16/#20/#24/#25 is a left-rotation by 16/12/8/7.
  87. add w5,w5,w9
  88. add w6,w6,w10
  89. add w7,w7,w11
  90. add w8,w8,w12
  91. eor w17,w17,w5
  92. eor w19,w19,w6
  93. eor w20,w20,w7
  94. eor w21,w21,w8
  95. ror w17,w17,#16
  96. ror w19,w19,#16
  97. ror w20,w20,#16
  98. ror w21,w21,#16
  99. add w13,w13,w17
  100. add w14,w14,w19
  101. add w15,w15,w20
  102. add w16,w16,w21
  103. eor w9,w9,w13
  104. eor w10,w10,w14
  105. eor w11,w11,w15
  106. eor w12,w12,w16
  107. ror w9,w9,#20
  108. ror w10,w10,#20
  109. ror w11,w11,#20
  110. ror w12,w12,#20
  111. add w5,w5,w9
  112. add w6,w6,w10
  113. add w7,w7,w11
  114. add w8,w8,w12
  115. eor w17,w17,w5
  116. eor w19,w19,w6
  117. eor w20,w20,w7
  118. eor w21,w21,w8
  119. ror w17,w17,#24
  120. ror w19,w19,#24
  121. ror w20,w20,#24
  122. ror w21,w21,#24
  123. add w13,w13,w17
  124. add w14,w14,w19
  125. add w15,w15,w20
  126. add w16,w16,w21
  127. eor w9,w9,w13
  128. eor w10,w10,w14
  129. eor w11,w11,w15
  130. eor w12,w12,w16
  131. ror w9,w9,#25
  132. ror w10,w10,#25
  133. ror w11,w11,#25
  134. ror w12,w12,#25
  // Diagonal round: quarter-rounds on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
  135. add w5,w5,w10
  136. add w6,w6,w11
  137. add w7,w7,w12
  138. add w8,w8,w9
  139. eor w21,w21,w5
  140. eor w17,w17,w6
  141. eor w19,w19,w7
  142. eor w20,w20,w8
  143. ror w21,w21,#16
  144. ror w17,w17,#16
  145. ror w19,w19,#16
  146. ror w20,w20,#16
  147. add w15,w15,w21
  148. add w16,w16,w17
  149. add w13,w13,w19
  150. add w14,w14,w20
  151. eor w10,w10,w15
  152. eor w11,w11,w16
  153. eor w12,w12,w13
  154. eor w9,w9,w14
  155. ror w10,w10,#20
  156. ror w11,w11,#20
  157. ror w12,w12,#20
  158. ror w9,w9,#20
  159. add w5,w5,w10
  160. add w6,w6,w11
  161. add w7,w7,w12
  162. add w8,w8,w9
  163. eor w21,w21,w5
  164. eor w17,w17,w6
  165. eor w19,w19,w7
  166. eor w20,w20,w8
  167. ror w21,w21,#24
  168. ror w17,w17,#24
  169. ror w19,w19,#24
  170. ror w20,w20,#24
  171. add w15,w15,w21
  172. add w16,w16,w17
  173. add w13,w13,w19
  174. add w14,w14,w20
  175. eor w10,w10,w15
  176. eor w11,w11,w16
  177. eor w12,w12,w13
  178. eor w9,w9,w14
  179. ror w10,w10,#25
  180. ror w11,w11,#25
  181. ror w12,w12,#25
  182. ror w9,w9,#25
  183. cbnz x4,.Loop
  // Add the original input words back in; the x-register forms add the high half of each pair.
  184. add w5,w5,w22 // accumulate key block
  185. add x6,x6,x22,lsr#32
  186. add w7,w7,w23
  187. add x8,x8,x23,lsr#32
  188. add w9,w9,w24
  189. add x10,x10,x24,lsr#32
  190. add w11,w11,w25
  191. add x12,x12,x25,lsr#32
  192. add w13,w13,w26
  193. add x14,x14,x26,lsr#32
  194. add w15,w15,w27
  195. add x16,x16,x27,lsr#32
  196. add w17,w17,w28
  197. add x19,x19,x28,lsr#32
  198. add w20,w20,w30
  199. add x21,x21,x30,lsr#32
  200. b.lo .Ltail // fewer than 64 bytes were left (borrow from the subs at the top of the pass)
  // Full block: merge word pairs into 64-bit registers while loading 64 input bytes into the freed ones.
  201. add x5,x5,x6,lsl#32 // pack
  202. add x7,x7,x8,lsl#32
  203. ldp x6,x8,[x1,#0] // load input
  204. add x9,x9,x10,lsl#32
  205. add x11,x11,x12,lsl#32
  206. ldp x10,x12,[x1,#16]
  207. add x13,x13,x14,lsl#32
  208. add x15,x15,x16,lsl#32
  209. ldp x14,x16,[x1,#32]
  210. add x17,x17,x19,lsl#32
  211. add x20,x20,x21,lsl#32
  212. ldp x19,x21,[x1,#48]
  213. add x1,x1,#64
  214. #ifdef __ARMEB__
  215. rev x5,x5
  216. rev x7,x7
  217. rev x9,x9
  218. rev x11,x11
  219. rev x13,x13
  220. rev x15,x15
  221. rev x17,x17
  222. rev x20,x20
  223. #endif
  224. eor x5,x5,x6
  225. eor x7,x7,x8
  226. eor x9,x9,x10
  227. eor x11,x11,x12
  228. eor x13,x13,x14
  229. eor x15,x15,x16
  230. eor x17,x17,x19
  231. eor x20,x20,x21
  232. stp x5,x7,[x0,#0] // store output
  233. add x28,x28,#1 // increment counter
  234. stp x9,x11,[x0,#16]
  235. stp x13,x15,[x0,#32]
  236. stp x17,x20,[x0,#48]
  237. add x0,x0,#64
  238. b.hi .Loop_outer // input remains (x2 was still above zero after the subs)
  // Epilogue: drop the scratch block and restore the saved registers through x29.
  239. ldp x19,x20,[x29,#16]
  240. add sp,sp,#64
  241. ldp x21,x22,[x29,#32]
  242. ldp x23,x24,[x29,#48]
  243. ldp x25,x26,[x29,#64]
  244. ldp x27,x28,[x29,#80]
  245. ldp x29,x30,[sp],#96
  246. AARCH64_VALIDATE_LINK_REGISTER
  247. .Labort:
  248. ret
  249. .align 4
  250. .Ltail:
  251. add x2,x2,#64 // undo the subs: x2 = number of bytes still to process
  // .Less_than_64 is also a branch target of ChaCha20_neon (.Ltail_neon), which arrives with the
  // same frame, the same 64-byte scratch block and the scalar block still unpacked in w5-w21.
  252. .Less_than_64:
  253. sub x0,x0,#1 // with the add below: x0 = out + len - 1 (the store uses the already-incremented index)
  254. add x1,x1,x2 // x1 = in + len
  255. add x0,x0,x2
  256. add x4,sp,x2 // x4 = scratch + len
  257. neg x2,x2 // x2 = -len, counted up to zero by .Loop_tail
  258. add x5,x5,x6,lsl#32 // pack
  259. add x7,x7,x8,lsl#32
  260. add x9,x9,x10,lsl#32
  261. add x11,x11,x12,lsl#32
  262. add x13,x13,x14,lsl#32
  263. add x15,x15,x16,lsl#32
  264. add x17,x17,x19,lsl#32
  265. add x20,x20,x21,lsl#32
  266. #ifdef __ARMEB__
  267. rev x5,x5
  268. rev x7,x7
  269. rev x9,x9
  270. rev x11,x11
  271. rev x13,x13
  272. rev x15,x15
  273. rev x17,x17
  274. rev x20,x20
  275. #endif
  276. stp x5,x7,[sp,#0] // spill the keystream block to the stack scratch
  277. stp x9,x11,[sp,#16]
  278. stp x13,x15,[sp,#32]
  279. stp x17,x20,[sp,#48]
  // Byte-at-a-time XOR of the remaining input with the spilled keystream.
  280. .Loop_tail:
  281. ldrb w10,[x1,x2]
  282. ldrb w11,[x4,x2]
  283. add x2,x2,#1
  284. eor w10,w10,w11
  285. strb w10,[x0,x2]
  286. cbnz x2,.Loop_tail
  287. stp xzr,xzr,[sp,#0] // overwrite the spilled keystream with zeros before returning
  288. stp xzr,xzr,[sp,#16]
  289. stp xzr,xzr,[sp,#32]
  290. stp xzr,xzr,[sp,#48]
  291. ldp x19,x20,[x29,#16]
  292. add sp,sp,#64
  293. ldp x21,x22,[x29,#32]
  294. ldp x23,x24,[x29,#48]
  295. ldp x25,x26,[x29,#64]
  296. ldp x27,x28,[x29,#80]
  297. ldp x29,x30,[sp],#96
  298. AARCH64_VALIDATE_LINK_REGISTER
  299. ret
  300. .size ChaCha20_ctr32,.-ChaCha20_ctr32
  301. .type ChaCha20_neon,%function
  302. .align 5
  // ChaCha20_neon: NEON-assisted path, reached from ChaCha20_ctr32 by a conditional branch with
  // x0-x4 still holding that function's arguments (out, in, length, key pointer, counter pointer).
  // Each outer pass produces 256 bytes of keystream: one block in scalar registers (same layout
  // as ChaCha20_ctr32) interleaved with three blocks held in v0-v3, v4-v7 and v16-v19, one
  // 4-word row per vector.
  // Lengths of 512 bytes or more branch on to .L512_or_more_neon inside ChaCha20_512_neon.
  // Loop-invariant vectors: v24 = sigma, v25/v26 = key, v27/v28/v29 = counter block +1/+2/+3,
  // v31 = counter step. v20-v23 are temporaries.
  303. ChaCha20_neon:
  304. AARCH64_SIGN_LINK_REGISTER
  305. stp x29,x30,[sp,#-96]!
  306. add x29,sp,#0
  307. adrp x5,.Lsigma
  308. add x5,x5,:lo12:.Lsigma
  309. stp x19,x20,[sp,#16]
  310. stp x21,x22,[sp,#32]
  311. stp x23,x24,[sp,#48]
  312. stp x25,x26,[sp,#64]
  313. stp x27,x28,[sp,#80]
  314. cmp x2,#512
  315. b.hs .L512_or_more_neon // 512 bytes or more: continue in ChaCha20_512_neon, whose prologue is identical
  316. sub sp,sp,#64 // 64-byte scratch block, written only by the byte-wise tail
  317. ldp x22,x23,[x5] // load sigma
  318. ld1 {v24.4s},[x5],#16 // vector copy of sigma; also advances x5 from .Lsigma to .Lone
  319. ldp x24,x25,[x3] // load key
  320. ldp x26,x27,[x3,#16]
  321. ld1 {v25.4s,v26.4s},[x3]
  322. ldp x28,x30,[x4] // load counter
  323. ld1 {v27.4s},[x4]
  324. ld1 {v31.4s},[x5] // v31 = {1,0,0,0}
  325. #ifdef __ARMEB__
  326. rev64 v24.4s,v24.4s
  327. ror x24,x24,#32
  328. ror x25,x25,#32
  329. ror x26,x26,#32
  330. ror x27,x27,#32
  331. ror x28,x28,#32
  332. ror x30,x30,#32
  333. #endif
  // The scalar block keeps the caller's counter; the three vector blocks use counter+1, +2 and +3.
  334. add v27.4s,v27.4s,v31.4s // += 1
  335. add v28.4s,v27.4s,v31.4s
  336. add v29.4s,v28.4s,v31.4s
  337. shl v31.4s,v31.4s,#2 // 1 -> 4
  // One pass of .Loop_outer_neon: reload the working copies of all four blocks from the invariants.
  338. .Loop_outer_neon:
  339. mov w5,w22 // unpack key block
  340. lsr x6,x22,#32
  341. mov v0.16b,v24.16b
  342. mov w7,w23
  343. lsr x8,x23,#32
  344. mov v4.16b,v24.16b
  345. mov w9,w24
  346. lsr x10,x24,#32
  347. mov v16.16b,v24.16b
  348. mov w11,w25
  349. mov v1.16b,v25.16b
  350. lsr x12,x25,#32
  351. mov v5.16b,v25.16b
  352. mov w13,w26
  353. mov v17.16b,v25.16b
  354. lsr x14,x26,#32
  355. mov v3.16b,v27.16b
  356. mov w15,w27
  357. mov v7.16b,v28.16b
  358. lsr x16,x27,#32
  359. mov v19.16b,v29.16b
  360. mov w17,w28
  361. mov v2.16b,v26.16b
  362. lsr x19,x28,#32
  363. mov v6.16b,v26.16b
  364. mov w20,w30
  365. mov v18.16b,v26.16b
  366. lsr x21,x30,#32
  367. mov x4,#10 // 10 passes of .Loop_neon, each a column round plus a diagonal round
  368. subs x2,x2,#256 // flags stay live until b.lo .Ltail_neon / b.hi .Loop_outer_neon; nothing in between sets them
  // Inner loop: scalar and vector instruction streams are interleaved line by line.
  // Vector rotations: rev32 on .8h lanes swaps the 16-bit halves of each word (rotate by 16);
  // each ushr/sli pair through v20-v22 is a left-rotation by 12, 8 or 7.
  369. .Loop_neon:
  370. sub x4,x4,#1
  371. add v0.4s,v0.4s,v1.4s
  372. add w5,w5,w9
  373. add v4.4s,v4.4s,v5.4s
  374. add w6,w6,w10
  375. add v16.4s,v16.4s,v17.4s
  376. add w7,w7,w11
  377. eor v3.16b,v3.16b,v0.16b
  378. add w8,w8,w12
  379. eor v7.16b,v7.16b,v4.16b
  380. eor w17,w17,w5
  381. eor v19.16b,v19.16b,v16.16b
  382. eor w19,w19,w6
  383. rev32 v3.8h,v3.8h
  384. eor w20,w20,w7
  385. rev32 v7.8h,v7.8h
  386. eor w21,w21,w8
  387. rev32 v19.8h,v19.8h
  388. ror w17,w17,#16
  389. add v2.4s,v2.4s,v3.4s
  390. ror w19,w19,#16
  391. add v6.4s,v6.4s,v7.4s
  392. ror w20,w20,#16
  393. add v18.4s,v18.4s,v19.4s
  394. ror w21,w21,#16
  395. eor v20.16b,v1.16b,v2.16b
  396. add w13,w13,w17
  397. eor v21.16b,v5.16b,v6.16b
  398. add w14,w14,w19
  399. eor v22.16b,v17.16b,v18.16b
  400. add w15,w15,w20
  401. ushr v1.4s,v20.4s,#20
  402. add w16,w16,w21
  403. ushr v5.4s,v21.4s,#20
  404. eor w9,w9,w13
  405. ushr v17.4s,v22.4s,#20
  406. eor w10,w10,w14
  407. sli v1.4s,v20.4s,#12
  408. eor w11,w11,w15
  409. sli v5.4s,v21.4s,#12
  410. eor w12,w12,w16
  411. sli v17.4s,v22.4s,#12
  412. ror w9,w9,#20
  413. add v0.4s,v0.4s,v1.4s
  414. ror w10,w10,#20
  415. add v4.4s,v4.4s,v5.4s
  416. ror w11,w11,#20
  417. add v16.4s,v16.4s,v17.4s
  418. ror w12,w12,#20
  419. eor v20.16b,v3.16b,v0.16b
  420. add w5,w5,w9
  421. eor v21.16b,v7.16b,v4.16b
  422. add w6,w6,w10
  423. eor v22.16b,v19.16b,v16.16b
  424. add w7,w7,w11
  425. ushr v3.4s,v20.4s,#24
  426. add w8,w8,w12
  427. ushr v7.4s,v21.4s,#24
  428. eor w17,w17,w5
  429. ushr v19.4s,v22.4s,#24
  430. eor w19,w19,w6
  431. sli v3.4s,v20.4s,#8
  432. eor w20,w20,w7
  433. sli v7.4s,v21.4s,#8
  434. eor w21,w21,w8
  435. sli v19.4s,v22.4s,#8
  436. ror w17,w17,#24
  437. add v2.4s,v2.4s,v3.4s
  438. ror w19,w19,#24
  439. add v6.4s,v6.4s,v7.4s
  440. ror w20,w20,#24
  441. add v18.4s,v18.4s,v19.4s
  442. ror w21,w21,#24
  443. eor v20.16b,v1.16b,v2.16b
  444. add w13,w13,w17
  445. eor v21.16b,v5.16b,v6.16b
  446. add w14,w14,w19
  447. eor v22.16b,v17.16b,v18.16b
  448. add w15,w15,w20
  449. ushr v1.4s,v20.4s,#25
  450. add w16,w16,w21
  451. ushr v5.4s,v21.4s,#25
  452. eor w9,w9,w13
  453. ushr v17.4s,v22.4s,#25
  454. eor w10,w10,w14
  455. sli v1.4s,v20.4s,#7
  456. eor w11,w11,w15
  457. sli v5.4s,v21.4s,#7
  458. eor w12,w12,w16
  459. sli v17.4s,v22.4s,#7
  460. ror w9,w9,#25
  // ext: rotate the second, third and fourth row vectors by one, two and three lanes so the
  // same row-wise vector code performs the diagonal round.
  461. ext v2.16b,v2.16b,v2.16b,#8
  462. ror w10,w10,#25
  463. ext v6.16b,v6.16b,v6.16b,#8
  464. ror w11,w11,#25
  465. ext v18.16b,v18.16b,v18.16b,#8
  466. ror w12,w12,#25
  467. ext v3.16b,v3.16b,v3.16b,#12
  468. ext v7.16b,v7.16b,v7.16b,#12
  469. ext v19.16b,v19.16b,v19.16b,#12
  470. ext v1.16b,v1.16b,v1.16b,#4
  471. ext v5.16b,v5.16b,v5.16b,#4
  472. ext v17.16b,v17.16b,v17.16b,#4
  // Diagonal round (the scalar side switches to the w5/w10/w15/w21 grouping).
  473. add v0.4s,v0.4s,v1.4s
  474. add w5,w5,w10
  475. add v4.4s,v4.4s,v5.4s
  476. add w6,w6,w11
  477. add v16.4s,v16.4s,v17.4s
  478. add w7,w7,w12
  479. eor v3.16b,v3.16b,v0.16b
  480. add w8,w8,w9
  481. eor v7.16b,v7.16b,v4.16b
  482. eor w21,w21,w5
  483. eor v19.16b,v19.16b,v16.16b
  484. eor w17,w17,w6
  485. rev32 v3.8h,v3.8h
  486. eor w19,w19,w7
  487. rev32 v7.8h,v7.8h
  488. eor w20,w20,w8
  489. rev32 v19.8h,v19.8h
  490. ror w21,w21,#16
  491. add v2.4s,v2.4s,v3.4s
  492. ror w17,w17,#16
  493. add v6.4s,v6.4s,v7.4s
  494. ror w19,w19,#16
  495. add v18.4s,v18.4s,v19.4s
  496. ror w20,w20,#16
  497. eor v20.16b,v1.16b,v2.16b
  498. add w15,w15,w21
  499. eor v21.16b,v5.16b,v6.16b
  500. add w16,w16,w17
  501. eor v22.16b,v17.16b,v18.16b
  502. add w13,w13,w19
  503. ushr v1.4s,v20.4s,#20
  504. add w14,w14,w20
  505. ushr v5.4s,v21.4s,#20
  506. eor w10,w10,w15
  507. ushr v17.4s,v22.4s,#20
  508. eor w11,w11,w16
  509. sli v1.4s,v20.4s,#12
  510. eor w12,w12,w13
  511. sli v5.4s,v21.4s,#12
  512. eor w9,w9,w14
  513. sli v17.4s,v22.4s,#12
  514. ror w10,w10,#20
  515. add v0.4s,v0.4s,v1.4s
  516. ror w11,w11,#20
  517. add v4.4s,v4.4s,v5.4s
  518. ror w12,w12,#20
  519. add v16.4s,v16.4s,v17.4s
  520. ror w9,w9,#20
  521. eor v20.16b,v3.16b,v0.16b
  522. add w5,w5,w10
  523. eor v21.16b,v7.16b,v4.16b
  524. add w6,w6,w11
  525. eor v22.16b,v19.16b,v16.16b
  526. add w7,w7,w12
  527. ushr v3.4s,v20.4s,#24
  528. add w8,w8,w9
  529. ushr v7.4s,v21.4s,#24
  530. eor w21,w21,w5
  531. ushr v19.4s,v22.4s,#24
  532. eor w17,w17,w6
  533. sli v3.4s,v20.4s,#8
  534. eor w19,w19,w7
  535. sli v7.4s,v21.4s,#8
  536. eor w20,w20,w8
  537. sli v19.4s,v22.4s,#8
  538. ror w21,w21,#24
  539. add v2.4s,v2.4s,v3.4s
  540. ror w17,w17,#24
  541. add v6.4s,v6.4s,v7.4s
  542. ror w19,w19,#24
  543. add v18.4s,v18.4s,v19.4s
  544. ror w20,w20,#24
  545. eor v20.16b,v1.16b,v2.16b
  546. add w15,w15,w21
  547. eor v21.16b,v5.16b,v6.16b
  548. add w16,w16,w17
  549. eor v22.16b,v17.16b,v18.16b
  550. add w13,w13,w19
  551. ushr v1.4s,v20.4s,#25
  552. add w14,w14,w20
  553. ushr v5.4s,v21.4s,#25
  554. eor w10,w10,w15
  555. ushr v17.4s,v22.4s,#25
  556. eor w11,w11,w16
  557. sli v1.4s,v20.4s,#7
  558. eor w12,w12,w13
  559. sli v5.4s,v21.4s,#7
  560. eor w9,w9,w14
  561. sli v17.4s,v22.4s,#7
  562. ror w10,w10,#25
  // ext: rotate the row vectors back to their column-round positions.
  563. ext v2.16b,v2.16b,v2.16b,#8
  564. ror w11,w11,#25
  565. ext v6.16b,v6.16b,v6.16b,#8
  566. ror w12,w12,#25
  567. ext v18.16b,v18.16b,v18.16b,#8
  568. ror w9,w9,#25
  569. ext v3.16b,v3.16b,v3.16b,#4
  570. ext v7.16b,v7.16b,v7.16b,#4
  571. ext v19.16b,v19.16b,v19.16b,#4
  572. ext v1.16b,v1.16b,v1.16b,#12
  573. ext v5.16b,v5.16b,v5.16b,#12
  574. ext v17.16b,v17.16b,v17.16b,#12
  575. cbnz x4,.Loop_neon
  // Add the original input back into all four blocks (each vector block gets its own counter row).
  576. add w5,w5,w22 // accumulate key block
  577. add v0.4s,v0.4s,v24.4s
  578. add x6,x6,x22,lsr#32
  579. add v4.4s,v4.4s,v24.4s
  580. add w7,w7,w23
  581. add v16.4s,v16.4s,v24.4s
  582. add x8,x8,x23,lsr#32
  583. add v2.4s,v2.4s,v26.4s
  584. add w9,w9,w24
  585. add v6.4s,v6.4s,v26.4s
  586. add x10,x10,x24,lsr#32
  587. add v18.4s,v18.4s,v26.4s
  588. add w11,w11,w25
  589. add v3.4s,v3.4s,v27.4s
  590. add x12,x12,x25,lsr#32
  591. add w13,w13,w26
  592. add v7.4s,v7.4s,v28.4s
  593. add x14,x14,x26,lsr#32
  594. add w15,w15,w27
  595. add v19.4s,v19.4s,v29.4s
  596. add x16,x16,x27,lsr#32
  597. add w17,w17,w28
  598. add v1.4s,v1.4s,v25.4s
  599. add x19,x19,x28,lsr#32
  600. add w20,w20,w30
  601. add v5.4s,v5.4s,v25.4s
  602. add x21,x21,x30,lsr#32
  603. add v17.4s,v17.4s,v25.4s
  604. b.lo .Ltail_neon // fewer than 256 bytes were left (borrow from the subs at the top of the pass)
  // Full 256 bytes: the scalar block covers input bytes 0-63, then v0-v3, v4-v7 and v16-v19
  // cover the following three 64-byte chunks.
  605. add x5,x5,x6,lsl#32 // pack
  606. add x7,x7,x8,lsl#32
  607. ldp x6,x8,[x1,#0] // load input
  608. add x9,x9,x10,lsl#32
  609. add x11,x11,x12,lsl#32
  610. ldp x10,x12,[x1,#16]
  611. add x13,x13,x14,lsl#32
  612. add x15,x15,x16,lsl#32
  613. ldp x14,x16,[x1,#32]
  614. add x17,x17,x19,lsl#32
  615. add x20,x20,x21,lsl#32
  616. ldp x19,x21,[x1,#48]
  617. add x1,x1,#64
  618. #ifdef __ARMEB__
  619. rev x5,x5
  620. rev x7,x7
  621. rev x9,x9
  622. rev x11,x11
  623. rev x13,x13
  624. rev x15,x15
  625. rev x17,x17
  626. rev x20,x20
  627. #endif
  628. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  629. eor x5,x5,x6
  630. eor x7,x7,x8
  631. eor x9,x9,x10
  632. eor x11,x11,x12
  633. eor x13,x13,x14
  634. eor v0.16b,v0.16b,v20.16b
  635. eor x15,x15,x16
  636. eor v1.16b,v1.16b,v21.16b
  637. eor x17,x17,x19
  638. eor v2.16b,v2.16b,v22.16b
  639. eor x20,x20,x21
  640. eor v3.16b,v3.16b,v23.16b
  641. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  642. stp x5,x7,[x0,#0] // store output
  643. add x28,x28,#4 // increment counter
  644. stp x9,x11,[x0,#16]
  645. add v27.4s,v27.4s,v31.4s // += 4
  646. stp x13,x15,[x0,#32]
  647. add v28.4s,v28.4s,v31.4s
  648. stp x17,x20,[x0,#48]
  649. add v29.4s,v29.4s,v31.4s
  650. add x0,x0,#64
  651. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  652. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 are free again: reuse them for the last 64 input bytes
  653. eor v4.16b,v4.16b,v20.16b
  654. eor v5.16b,v5.16b,v21.16b
  655. eor v6.16b,v6.16b,v22.16b
  656. eor v7.16b,v7.16b,v23.16b
  657. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  658. eor v16.16b,v16.16b,v0.16b
  659. eor v17.16b,v17.16b,v1.16b
  660. eor v18.16b,v18.16b,v2.16b
  661. eor v19.16b,v19.16b,v3.16b
  662. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  663. b.hi .Loop_outer_neon // input remains (x2 was still above zero after the subs)
  664. ldp x19,x20,[x29,#16]
  665. add sp,sp,#64
  666. ldp x21,x22,[x29,#32]
  667. ldp x23,x24,[x29,#48]
  668. ldp x25,x26,[x29,#64]
  669. ldp x27,x28,[x29,#80]
  670. ldp x29,x30,[sp],#96
  671. AARCH64_VALIDATE_LINK_REGISTER
  672. ret
  // Tail: fewer than 256 bytes remain. Whole 64-byte chunks are consumed block by block; a final
  // partial chunk is XORed byte-wise against a keystream block spilled to the stack scratch.
  673. .Ltail_neon:
  674. add x2,x2,#256 // undo the subs: x2 = number of bytes still to process
  675. cmp x2,#64
  676. b.lo .Less_than_64 // under one block: finish in ChaCha20_ctr32's shared tail using the scalar block
  677. add x5,x5,x6,lsl#32 // pack
  678. add x7,x7,x8,lsl#32
  679. ldp x6,x8,[x1,#0] // load input
  680. add x9,x9,x10,lsl#32
  681. add x11,x11,x12,lsl#32
  682. ldp x10,x12,[x1,#16]
  683. add x13,x13,x14,lsl#32
  684. add x15,x15,x16,lsl#32
  685. ldp x14,x16,[x1,#32]
  686. add x17,x17,x19,lsl#32
  687. add x20,x20,x21,lsl#32
  688. ldp x19,x21,[x1,#48]
  689. add x1,x1,#64
  690. #ifdef __ARMEB__
  691. rev x5,x5
  692. rev x7,x7
  693. rev x9,x9
  694. rev x11,x11
  695. rev x13,x13
  696. rev x15,x15
  697. rev x17,x17
  698. rev x20,x20
  699. #endif
  700. eor x5,x5,x6
  701. eor x7,x7,x8
  702. eor x9,x9,x10
  703. eor x11,x11,x12
  704. eor x13,x13,x14
  705. eor x15,x15,x16
  706. eor x17,x17,x19
  707. eor x20,x20,x21
  708. stp x5,x7,[x0,#0] // store output
  709. add x28,x28,#4 // increment counter
  710. stp x9,x11,[x0,#16]
  711. stp x13,x15,[x0,#32]
  712. stp x17,x20,[x0,#48]
  713. add x0,x0,#64
  714. b.eq .Ldone_neon // exactly 64 bytes were left (flags still from the cmp x2,#64 above)
  715. sub x2,x2,#64
  716. cmp x2,#64
  717. b.lo .Less_than_128 // partial second chunk: its keystream is v0-v3
  718. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  719. eor v0.16b,v0.16b,v20.16b
  720. eor v1.16b,v1.16b,v21.16b
  721. eor v2.16b,v2.16b,v22.16b
  722. eor v3.16b,v3.16b,v23.16b
  723. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  724. b.eq .Ldone_neon // exactly 128 bytes were left in total
  725. sub x2,x2,#64
  726. cmp x2,#64
  727. b.lo .Less_than_192 // partial third chunk: its keystream is v4-v7
  728. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  729. eor v4.16b,v4.16b,v20.16b
  730. eor v5.16b,v5.16b,v21.16b
  731. eor v6.16b,v6.16b,v22.16b
  732. eor v7.16b,v7.16b,v23.16b
  733. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  734. b.eq .Ldone_neon // exactly 192 bytes were left in total
  735. sub x2,x2,#64
  736. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] // partial fourth chunk: spill the last vector block
  737. b .Last_neon
  738. .Less_than_128:
  739. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  740. b .Last_neon
  741. .Less_than_192:
  742. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  743. b .Last_neon
  744. .align 4
  // Byte-wise finish: x2 = remaining byte count, keystream block already at [sp].
  745. .Last_neon:
  746. sub x0,x0,#1 // with the add below: x0 = out + len - 1 (the store uses the already-incremented index)
  747. add x1,x1,x2 // x1 = in + len
  748. add x0,x0,x2
  749. add x4,sp,x2 // x4 = scratch + len
  750. neg x2,x2 // x2 = -len, counted up to zero by .Loop_tail_neon
  751. .Loop_tail_neon:
  752. ldrb w10,[x1,x2]
  753. ldrb w11,[x4,x2]
  754. add x2,x2,#1
  755. eor w10,w10,w11
  756. strb w10,[x0,x2]
  757. cbnz x2,.Loop_tail_neon
  758. stp xzr,xzr,[sp,#0] // overwrite the spilled keystream with zeros before returning
  759. stp xzr,xzr,[sp,#16]
  760. stp xzr,xzr,[sp,#32]
  761. stp xzr,xzr,[sp,#48]
  // Common exit: drop the scratch block and restore the saved registers through x29.
  762. .Ldone_neon:
  763. ldp x19,x20,[x29,#16]
  764. add sp,sp,#64
  765. ldp x21,x22,[x29,#32]
  766. ldp x23,x24,[x29,#48]
  767. ldp x25,x26,[x29,#64]
  768. ldp x27,x28,[x29,#80]
  769. ldp x29,x30,[sp],#96
  770. AARCH64_VALIDATE_LINK_REGISTER
  771. ret
  772. .size ChaCha20_neon,.-ChaCha20_neon
  773. .type ChaCha20_512_neon,%function
  774. .align 5
  775. ChaCha20_512_neon:
  776. AARCH64_SIGN_LINK_REGISTER
  777. stp x29,x30,[sp,#-96]!
  778. add x29,sp,#0
  779. adrp x5,.Lsigma
  780. add x5,x5,:lo12:.Lsigma
  781. stp x19,x20,[sp,#16]
  782. stp x21,x22,[sp,#32]
  783. stp x23,x24,[sp,#48]
  784. stp x25,x26,[sp,#64]
  785. stp x27,x28,[sp,#80]
  786. .L512_or_more_neon:
  787. sub sp,sp,#128+64
  788. ldp x22,x23,[x5] // load sigma
  789. ld1 {v24.4s},[x5],#16
  790. ldp x24,x25,[x3] // load key
  791. ldp x26,x27,[x3,#16]
  792. ld1 {v25.4s,v26.4s},[x3]
  793. ldp x28,x30,[x4] // load counter
  794. ld1 {v27.4s},[x4]
  795. ld1 {v31.4s},[x5]
  796. #ifdef __ARMEB__
  797. rev64 v24.4s,v24.4s
  798. ror x24,x24,#32
  799. ror x25,x25,#32
  800. ror x26,x26,#32
  801. ror x27,x27,#32
  802. ror x28,x28,#32
  803. ror x30,x30,#32
  804. #endif
  805. add v27.4s,v27.4s,v31.4s // += 1
  806. stp q24,q25,[sp,#0] // off-load key block, invariant part
  807. add v27.4s,v27.4s,v31.4s // not typo
  808. str q26,[sp,#32]
  809. add v28.4s,v27.4s,v31.4s
  810. add v29.4s,v28.4s,v31.4s
  811. add v30.4s,v29.4s,v31.4s
  812. shl v31.4s,v31.4s,#2 // 1 -> 4
  813. stp d8,d9,[sp,#128+0] // meet ABI requirements
  814. stp d10,d11,[sp,#128+16]
  815. stp d12,d13,[sp,#128+32]
  816. stp d14,d15,[sp,#128+48]
  817. sub x2,x2,#512 // not typo
  818. .Loop_outer_512_neon:
  819. mov v0.16b,v24.16b
  820. mov v4.16b,v24.16b
  821. mov v8.16b,v24.16b
  822. mov v12.16b,v24.16b
  823. mov v16.16b,v24.16b
  824. mov v20.16b,v24.16b
  825. mov v1.16b,v25.16b
  826. mov w5,w22 // unpack key block
  827. mov v5.16b,v25.16b
  828. lsr x6,x22,#32
  829. mov v9.16b,v25.16b
  830. mov w7,w23
  831. mov v13.16b,v25.16b
  832. lsr x8,x23,#32
  833. mov v17.16b,v25.16b
  834. mov w9,w24
  835. mov v21.16b,v25.16b
  836. lsr x10,x24,#32
  837. mov v3.16b,v27.16b
  838. mov w11,w25
  839. mov v7.16b,v28.16b
  840. lsr x12,x25,#32
  841. mov v11.16b,v29.16b
  842. mov w13,w26
  843. mov v15.16b,v30.16b
  844. lsr x14,x26,#32
  845. mov v2.16b,v26.16b
  846. mov w15,w27
  847. mov v6.16b,v26.16b
  848. lsr x16,x27,#32
  849. add v19.4s,v3.4s,v31.4s // +4
  850. mov w17,w28
  851. add v23.4s,v7.4s,v31.4s // +4
  852. lsr x19,x28,#32
  853. mov v10.16b,v26.16b
  854. mov w20,w30
  855. mov v14.16b,v26.16b
  856. lsr x21,x30,#32
  857. mov v18.16b,v26.16b
  858. stp q27,q28,[sp,#48] // off-load key block, variable part
  859. mov v22.16b,v26.16b
  860. str q29,[sp,#80]
  861. mov x4,#5
  862. subs x2,x2,#512
  863. .Loop_upper_neon:
  864. sub x4,x4,#1
  865. add v0.4s,v0.4s,v1.4s
  866. add w5,w5,w9
  867. add v4.4s,v4.4s,v5.4s
  868. add w6,w6,w10
  869. add v8.4s,v8.4s,v9.4s
  870. add w7,w7,w11
  871. add v12.4s,v12.4s,v13.4s
  872. add w8,w8,w12
  873. add v16.4s,v16.4s,v17.4s
  874. eor w17,w17,w5
  875. add v20.4s,v20.4s,v21.4s
  876. eor w19,w19,w6
  877. eor v3.16b,v3.16b,v0.16b
  878. eor w20,w20,w7
  879. eor v7.16b,v7.16b,v4.16b
  880. eor w21,w21,w8
  881. eor v11.16b,v11.16b,v8.16b
  882. ror w17,w17,#16
  883. eor v15.16b,v15.16b,v12.16b
  884. ror w19,w19,#16
  885. eor v19.16b,v19.16b,v16.16b
  886. ror w20,w20,#16
  887. eor v23.16b,v23.16b,v20.16b
  888. ror w21,w21,#16
  889. rev32 v3.8h,v3.8h
  890. add w13,w13,w17
  891. rev32 v7.8h,v7.8h
  892. add w14,w14,w19
  893. rev32 v11.8h,v11.8h
  894. add w15,w15,w20
  895. rev32 v15.8h,v15.8h
  896. add w16,w16,w21
  897. rev32 v19.8h,v19.8h
  898. eor w9,w9,w13
  899. rev32 v23.8h,v23.8h
  900. eor w10,w10,w14
  901. add v2.4s,v2.4s,v3.4s
  902. eor w11,w11,w15
  903. add v6.4s,v6.4s,v7.4s
  904. eor w12,w12,w16
  905. add v10.4s,v10.4s,v11.4s
  906. ror w9,w9,#20
  907. add v14.4s,v14.4s,v15.4s
  908. ror w10,w10,#20
  909. add v18.4s,v18.4s,v19.4s
  910. ror w11,w11,#20
  911. add v22.4s,v22.4s,v23.4s
  912. ror w12,w12,#20
  913. eor v24.16b,v1.16b,v2.16b
  914. add w5,w5,w9
  915. eor v25.16b,v5.16b,v6.16b
  916. add w6,w6,w10
  917. eor v26.16b,v9.16b,v10.16b
  918. add w7,w7,w11
  919. eor v27.16b,v13.16b,v14.16b
  920. add w8,w8,w12
  921. eor v28.16b,v17.16b,v18.16b
  922. eor w17,w17,w5
  923. eor v29.16b,v21.16b,v22.16b
  924. eor w19,w19,w6
  925. ushr v1.4s,v24.4s,#20
  926. eor w20,w20,w7
  927. ushr v5.4s,v25.4s,#20
  928. eor w21,w21,w8
  929. ushr v9.4s,v26.4s,#20
  930. ror w17,w17,#24
  931. ushr v13.4s,v27.4s,#20
  932. ror w19,w19,#24
  933. ushr v17.4s,v28.4s,#20
  934. ror w20,w20,#24
  935. ushr v21.4s,v29.4s,#20
  936. ror w21,w21,#24
  937. sli v1.4s,v24.4s,#12
  938. add w13,w13,w17
  939. sli v5.4s,v25.4s,#12
  940. add w14,w14,w19
  941. sli v9.4s,v26.4s,#12
  942. add w15,w15,w20
  943. sli v13.4s,v27.4s,#12
  944. add w16,w16,w21
  945. sli v17.4s,v28.4s,#12
  946. eor w9,w9,w13
  947. sli v21.4s,v29.4s,#12
  948. eor w10,w10,w14
  949. add v0.4s,v0.4s,v1.4s
  950. eor w11,w11,w15
  951. add v4.4s,v4.4s,v5.4s
  952. eor w12,w12,w16
  953. add v8.4s,v8.4s,v9.4s
  954. ror w9,w9,#25
  955. add v12.4s,v12.4s,v13.4s
  956. ror w10,w10,#25
  957. add v16.4s,v16.4s,v17.4s
  958. ror w11,w11,#25
  959. add v20.4s,v20.4s,v21.4s
  960. ror w12,w12,#25
  961. eor v24.16b,v3.16b,v0.16b
  962. add w5,w5,w10
  963. eor v25.16b,v7.16b,v4.16b
  964. add w6,w6,w11
  965. eor v26.16b,v11.16b,v8.16b
  966. add w7,w7,w12
  967. eor v27.16b,v15.16b,v12.16b
  968. add w8,w8,w9
  969. eor v28.16b,v19.16b,v16.16b
  970. eor w21,w21,w5
  971. eor v29.16b,v23.16b,v20.16b
  972. eor w17,w17,w6
  973. ushr v3.4s,v24.4s,#24
  974. eor w19,w19,w7
  975. ushr v7.4s,v25.4s,#24
  976. eor w20,w20,w8
  977. ushr v11.4s,v26.4s,#24
  978. ror w21,w21,#16
  979. ushr v15.4s,v27.4s,#24
  980. ror w17,w17,#16
  981. ushr v19.4s,v28.4s,#24
  982. ror w19,w19,#16
  983. ushr v23.4s,v29.4s,#24
  984. ror w20,w20,#16
  985. sli v3.4s,v24.4s,#8
  986. add w15,w15,w21
  987. sli v7.4s,v25.4s,#8
  988. add w16,w16,w17
  989. sli v11.4s,v26.4s,#8
  990. add w13,w13,w19
  991. sli v15.4s,v27.4s,#8
  992. add w14,w14,w20
  993. sli v19.4s,v28.4s,#8
  994. eor w10,w10,w15
  995. sli v23.4s,v29.4s,#8
  996. eor w11,w11,w16
  997. add v2.4s,v2.4s,v3.4s
  998. eor w12,w12,w13
  999. add v6.4s,v6.4s,v7.4s
  1000. eor w9,w9,w14
  1001. add v10.4s,v10.4s,v11.4s
  1002. ror w10,w10,#20
  1003. add v14.4s,v14.4s,v15.4s
  1004. ror w11,w11,#20
  1005. add v18.4s,v18.4s,v19.4s
  1006. ror w12,w12,#20
  1007. add v22.4s,v22.4s,v23.4s
  1008. ror w9,w9,#20
  1009. eor v24.16b,v1.16b,v2.16b
  1010. add w5,w5,w10
  1011. eor v25.16b,v5.16b,v6.16b
  1012. add w6,w6,w11
  1013. eor v26.16b,v9.16b,v10.16b
  1014. add w7,w7,w12
  1015. eor v27.16b,v13.16b,v14.16b
  1016. add w8,w8,w9
  1017. eor v28.16b,v17.16b,v18.16b
  1018. eor w21,w21,w5
  1019. eor v29.16b,v21.16b,v22.16b
  1020. eor w17,w17,w6
  1021. ushr v1.4s,v24.4s,#25
  1022. eor w19,w19,w7
  1023. ushr v5.4s,v25.4s,#25
  1024. eor w20,w20,w8
  1025. ushr v9.4s,v26.4s,#25
  1026. ror w21,w21,#24
  1027. ushr v13.4s,v27.4s,#25
  1028. ror w17,w17,#24
  1029. ushr v17.4s,v28.4s,#25
  1030. ror w19,w19,#24
  1031. ushr v21.4s,v29.4s,#25
  1032. ror w20,w20,#24
  1033. sli v1.4s,v24.4s,#7
  1034. add w15,w15,w21
  1035. sli v5.4s,v25.4s,#7
  1036. add w16,w16,w17
  1037. sli v9.4s,v26.4s,#7
  1038. add w13,w13,w19
  1039. sli v13.4s,v27.4s,#7
  1040. add w14,w14,w20
  1041. sli v17.4s,v28.4s,#7
  1042. eor w10,w10,w15
  1043. sli v21.4s,v29.4s,#7
  1044. eor w11,w11,w16
  1045. ext v2.16b,v2.16b,v2.16b,#8
  1046. eor w12,w12,w13
  1047. ext v6.16b,v6.16b,v6.16b,#8
  1048. eor w9,w9,w14
  1049. ext v10.16b,v10.16b,v10.16b,#8
  1050. ror w10,w10,#25
  1051. ext v14.16b,v14.16b,v14.16b,#8
  1052. ror w11,w11,#25
  1053. ext v18.16b,v18.16b,v18.16b,#8
  1054. ror w12,w12,#25
  1055. ext v22.16b,v22.16b,v22.16b,#8
  1056. ror w9,w9,#25
  1057. ext v3.16b,v3.16b,v3.16b,#12
  1058. ext v7.16b,v7.16b,v7.16b,#12
  1059. ext v11.16b,v11.16b,v11.16b,#12
  1060. ext v15.16b,v15.16b,v15.16b,#12
  1061. ext v19.16b,v19.16b,v19.16b,#12
  1062. ext v23.16b,v23.16b,v23.16b,#12
  1063. ext v1.16b,v1.16b,v1.16b,#4
  1064. ext v5.16b,v5.16b,v5.16b,#4
  1065. ext v9.16b,v9.16b,v9.16b,#4
  1066. ext v13.16b,v13.16b,v13.16b,#4
  1067. ext v17.16b,v17.16b,v17.16b,#4
  1068. ext v21.16b,v21.16b,v21.16b,#4
  1069. add v0.4s,v0.4s,v1.4s
  1070. add w5,w5,w9
  1071. add v4.4s,v4.4s,v5.4s
  1072. add w6,w6,w10
  1073. add v8.4s,v8.4s,v9.4s
  1074. add w7,w7,w11
  1075. add v12.4s,v12.4s,v13.4s
  1076. add w8,w8,w12
  1077. add v16.4s,v16.4s,v17.4s
  1078. eor w17,w17,w5
  1079. add v20.4s,v20.4s,v21.4s
  1080. eor w19,w19,w6
  1081. eor v3.16b,v3.16b,v0.16b
  1082. eor w20,w20,w7
  1083. eor v7.16b,v7.16b,v4.16b
  1084. eor w21,w21,w8
  1085. eor v11.16b,v11.16b,v8.16b
  1086. ror w17,w17,#16
  1087. eor v15.16b,v15.16b,v12.16b
  1088. ror w19,w19,#16
  1089. eor v19.16b,v19.16b,v16.16b
  1090. ror w20,w20,#16
  1091. eor v23.16b,v23.16b,v20.16b
  1092. ror w21,w21,#16
  1093. rev32 v3.8h,v3.8h
  1094. add w13,w13,w17
  1095. rev32 v7.8h,v7.8h
  1096. add w14,w14,w19
  1097. rev32 v11.8h,v11.8h
  1098. add w15,w15,w20
  1099. rev32 v15.8h,v15.8h
  1100. add w16,w16,w21
  1101. rev32 v19.8h,v19.8h
  1102. eor w9,w9,w13
  1103. rev32 v23.8h,v23.8h
  1104. eor w10,w10,w14
  1105. add v2.4s,v2.4s,v3.4s
  1106. eor w11,w11,w15
  1107. add v6.4s,v6.4s,v7.4s
  1108. eor w12,w12,w16
  1109. add v10.4s,v10.4s,v11.4s
  1110. ror w9,w9,#20
  1111. add v14.4s,v14.4s,v15.4s
  1112. ror w10,w10,#20
  1113. add v18.4s,v18.4s,v19.4s
  1114. ror w11,w11,#20
  1115. add v22.4s,v22.4s,v23.4s
  1116. ror w12,w12,#20
  1117. eor v24.16b,v1.16b,v2.16b
  1118. add w5,w5,w9
  1119. eor v25.16b,v5.16b,v6.16b
  1120. add w6,w6,w10
  1121. eor v26.16b,v9.16b,v10.16b
  1122. add w7,w7,w11
  1123. eor v27.16b,v13.16b,v14.16b
  1124. add w8,w8,w12
  1125. eor v28.16b,v17.16b,v18.16b
  1126. eor w17,w17,w5
  1127. eor v29.16b,v21.16b,v22.16b
  1128. eor w19,w19,w6
  1129. ushr v1.4s,v24.4s,#20
  1130. eor w20,w20,w7
  1131. ushr v5.4s,v25.4s,#20
  1132. eor w21,w21,w8
  1133. ushr v9.4s,v26.4s,#20
  1134. ror w17,w17,#24
  1135. ushr v13.4s,v27.4s,#20
  1136. ror w19,w19,#24
  1137. ushr v17.4s,v28.4s,#20
  1138. ror w20,w20,#24
  1139. ushr v21.4s,v29.4s,#20
  1140. ror w21,w21,#24
  1141. sli v1.4s,v24.4s,#12
  1142. add w13,w13,w17
  1143. sli v5.4s,v25.4s,#12
  1144. add w14,w14,w19
  1145. sli v9.4s,v26.4s,#12
  1146. add w15,w15,w20
  1147. sli v13.4s,v27.4s,#12
  1148. add w16,w16,w21
  1149. sli v17.4s,v28.4s,#12
  1150. eor w9,w9,w13
  1151. sli v21.4s,v29.4s,#12
  1152. eor w10,w10,w14
  1153. add v0.4s,v0.4s,v1.4s
  1154. eor w11,w11,w15
  1155. add v4.4s,v4.4s,v5.4s
  1156. eor w12,w12,w16
  1157. add v8.4s,v8.4s,v9.4s
  1158. ror w9,w9,#25
  1159. add v12.4s,v12.4s,v13.4s
  1160. ror w10,w10,#25
  1161. add v16.4s,v16.4s,v17.4s
  1162. ror w11,w11,#25
  1163. add v20.4s,v20.4s,v21.4s
  1164. ror w12,w12,#25
  1165. eor v24.16b,v3.16b,v0.16b
  1166. add w5,w5,w10
  1167. eor v25.16b,v7.16b,v4.16b
  1168. add w6,w6,w11
  1169. eor v26.16b,v11.16b,v8.16b
  1170. add w7,w7,w12
  1171. eor v27.16b,v15.16b,v12.16b
  1172. add w8,w8,w9
  1173. eor v28.16b,v19.16b,v16.16b
  1174. eor w21,w21,w5
  1175. eor v29.16b,v23.16b,v20.16b
  1176. eor w17,w17,w6
  1177. ushr v3.4s,v24.4s,#24
  1178. eor w19,w19,w7
  1179. ushr v7.4s,v25.4s,#24
  1180. eor w20,w20,w8
  1181. ushr v11.4s,v26.4s,#24
  1182. ror w21,w21,#16
  1183. ushr v15.4s,v27.4s,#24
  1184. ror w17,w17,#16
  1185. ushr v19.4s,v28.4s,#24
  1186. ror w19,w19,#16
  1187. ushr v23.4s,v29.4s,#24
  1188. ror w20,w20,#16
  1189. sli v3.4s,v24.4s,#8
  1190. add w15,w15,w21
  1191. sli v7.4s,v25.4s,#8
  1192. add w16,w16,w17
  1193. sli v11.4s,v26.4s,#8
  1194. add w13,w13,w19
  1195. sli v15.4s,v27.4s,#8
  1196. add w14,w14,w20
  1197. sli v19.4s,v28.4s,#8
  1198. eor w10,w10,w15
  1199. sli v23.4s,v29.4s,#8
  1200. eor w11,w11,w16
  1201. add v2.4s,v2.4s,v3.4s
  1202. eor w12,w12,w13
  1203. add v6.4s,v6.4s,v7.4s
  1204. eor w9,w9,w14
  1205. add v10.4s,v10.4s,v11.4s
  1206. ror w10,w10,#20
  1207. add v14.4s,v14.4s,v15.4s
  1208. ror w11,w11,#20
  1209. add v18.4s,v18.4s,v19.4s
  1210. ror w12,w12,#20
  1211. add v22.4s,v22.4s,v23.4s
  1212. ror w9,w9,#20
  1213. eor v24.16b,v1.16b,v2.16b
  1214. add w5,w5,w10
  1215. eor v25.16b,v5.16b,v6.16b
  1216. add w6,w6,w11
  1217. eor v26.16b,v9.16b,v10.16b
  1218. add w7,w7,w12
  1219. eor v27.16b,v13.16b,v14.16b
  1220. add w8,w8,w9
  1221. eor v28.16b,v17.16b,v18.16b
  1222. eor w21,w21,w5
  1223. eor v29.16b,v21.16b,v22.16b
  1224. eor w17,w17,w6
  1225. ushr v1.4s,v24.4s,#25
  1226. eor w19,w19,w7
  1227. ushr v5.4s,v25.4s,#25
  1228. eor w20,w20,w8
  1229. ushr v9.4s,v26.4s,#25
  1230. ror w21,w21,#24
  1231. ushr v13.4s,v27.4s,#25
  1232. ror w17,w17,#24
  1233. ushr v17.4s,v28.4s,#25
  1234. ror w19,w19,#24
  1235. ushr v21.4s,v29.4s,#25
  1236. ror w20,w20,#24
  1237. sli v1.4s,v24.4s,#7
  1238. add w15,w15,w21
  1239. sli v5.4s,v25.4s,#7
  1240. add w16,w16,w17
  1241. sli v9.4s,v26.4s,#7
  1242. add w13,w13,w19
  1243. sli v13.4s,v27.4s,#7
  1244. add w14,w14,w20
  1245. sli v17.4s,v28.4s,#7
  1246. eor w10,w10,w15
  1247. sli v21.4s,v29.4s,#7
  1248. eor w11,w11,w16
  1249. ext v2.16b,v2.16b,v2.16b,#8
  1250. eor w12,w12,w13
  1251. ext v6.16b,v6.16b,v6.16b,#8
  1252. eor w9,w9,w14
  1253. ext v10.16b,v10.16b,v10.16b,#8
  1254. ror w10,w10,#25
  1255. ext v14.16b,v14.16b,v14.16b,#8
  1256. ror w11,w11,#25
  1257. ext v18.16b,v18.16b,v18.16b,#8
  1258. ror w12,w12,#25
  1259. ext v22.16b,v22.16b,v22.16b,#8
  1260. ror w9,w9,#25
  1261. ext v3.16b,v3.16b,v3.16b,#4
  1262. ext v7.16b,v7.16b,v7.16b,#4
  1263. ext v11.16b,v11.16b,v11.16b,#4
  1264. ext v15.16b,v15.16b,v15.16b,#4
  1265. ext v19.16b,v19.16b,v19.16b,#4
  1266. ext v23.16b,v23.16b,v23.16b,#4
  1267. ext v1.16b,v1.16b,v1.16b,#12
  1268. ext v5.16b,v5.16b,v5.16b,#12
  1269. ext v9.16b,v9.16b,v9.16b,#12
  1270. ext v13.16b,v13.16b,v13.16b,#12
  1271. ext v17.16b,v17.16b,v17.16b,#12
  1272. ext v21.16b,v21.16b,v21.16b,#12
  1273. cbnz x4,.Loop_upper_neon
  1274. add w5,w5,w22 // accumulate key block
  1275. add x6,x6,x22,lsr#32
  1276. add w7,w7,w23
  1277. add x8,x8,x23,lsr#32
  1278. add w9,w9,w24
  1279. add x10,x10,x24,lsr#32
  1280. add w11,w11,w25
  1281. add x12,x12,x25,lsr#32
  1282. add w13,w13,w26
  1283. add x14,x14,x26,lsr#32
  1284. add w15,w15,w27
  1285. add x16,x16,x27,lsr#32
  1286. add w17,w17,w28
  1287. add x19,x19,x28,lsr#32
  1288. add w20,w20,w30
  1289. add x21,x21,x30,lsr#32
  1290. add x5,x5,x6,lsl#32 // pack
  1291. add x7,x7,x8,lsl#32
  1292. ldp x6,x8,[x1,#0] // load input
  1293. add x9,x9,x10,lsl#32
  1294. add x11,x11,x12,lsl#32
  1295. ldp x10,x12,[x1,#16]
  1296. add x13,x13,x14,lsl#32
  1297. add x15,x15,x16,lsl#32
  1298. ldp x14,x16,[x1,#32]
  1299. add x17,x17,x19,lsl#32
  1300. add x20,x20,x21,lsl#32
  1301. ldp x19,x21,[x1,#48]
  1302. add x1,x1,#64
  1303. #ifdef __ARMEB__
  1304. rev x5,x5
  1305. rev x7,x7
  1306. rev x9,x9
  1307. rev x11,x11
  1308. rev x13,x13
  1309. rev x15,x15
  1310. rev x17,x17
  1311. rev x20,x20
  1312. #endif
  1313. eor x5,x5,x6
  1314. eor x7,x7,x8
  1315. eor x9,x9,x10
  1316. eor x11,x11,x12
  1317. eor x13,x13,x14
  1318. eor x15,x15,x16
  1319. eor x17,x17,x19
  1320. eor x20,x20,x21
  1321. stp x5,x7,[x0,#0] // store output
  1322. add x28,x28,#1 // increment counter
  1323. mov w5,w22 // unpack key block
  1324. lsr x6,x22,#32
  1325. stp x9,x11,[x0,#16]
  1326. mov w7,w23
  1327. lsr x8,x23,#32
  1328. stp x13,x15,[x0,#32]
  1329. mov w9,w24
  1330. lsr x10,x24,#32
  1331. stp x17,x20,[x0,#48]
  1332. add x0,x0,#64
  1333. mov w11,w25
  1334. lsr x12,x25,#32
  1335. mov w13,w26
  1336. lsr x14,x26,#32
  1337. mov w15,w27
  1338. lsr x16,x27,#32
  1339. mov w17,w28
  1340. lsr x19,x28,#32
  1341. mov w20,w30
  1342. lsr x21,x30,#32
  1343. mov x4,#5
  1344. .Loop_lower_neon:
  1345. sub x4,x4,#1
  1346. add v0.4s,v0.4s,v1.4s
  1347. add w5,w5,w9
  1348. add v4.4s,v4.4s,v5.4s
  1349. add w6,w6,w10
  1350. add v8.4s,v8.4s,v9.4s
  1351. add w7,w7,w11
  1352. add v12.4s,v12.4s,v13.4s
  1353. add w8,w8,w12
  1354. add v16.4s,v16.4s,v17.4s
  1355. eor w17,w17,w5
  1356. add v20.4s,v20.4s,v21.4s
  1357. eor w19,w19,w6
  1358. eor v3.16b,v3.16b,v0.16b
  1359. eor w20,w20,w7
  1360. eor v7.16b,v7.16b,v4.16b
  1361. eor w21,w21,w8
  1362. eor v11.16b,v11.16b,v8.16b
  1363. ror w17,w17,#16
  1364. eor v15.16b,v15.16b,v12.16b
  1365. ror w19,w19,#16
  1366. eor v19.16b,v19.16b,v16.16b
  1367. ror w20,w20,#16
  1368. eor v23.16b,v23.16b,v20.16b
  1369. ror w21,w21,#16
  1370. rev32 v3.8h,v3.8h
  1371. add w13,w13,w17
  1372. rev32 v7.8h,v7.8h
  1373. add w14,w14,w19
  1374. rev32 v11.8h,v11.8h
  1375. add w15,w15,w20
  1376. rev32 v15.8h,v15.8h
  1377. add w16,w16,w21
  1378. rev32 v19.8h,v19.8h
  1379. eor w9,w9,w13
  1380. rev32 v23.8h,v23.8h
  1381. eor w10,w10,w14
  1382. add v2.4s,v2.4s,v3.4s
  1383. eor w11,w11,w15
  1384. add v6.4s,v6.4s,v7.4s
  1385. eor w12,w12,w16
  1386. add v10.4s,v10.4s,v11.4s
  1387. ror w9,w9,#20
  1388. add v14.4s,v14.4s,v15.4s
  1389. ror w10,w10,#20
  1390. add v18.4s,v18.4s,v19.4s
  1391. ror w11,w11,#20
  1392. add v22.4s,v22.4s,v23.4s
  1393. ror w12,w12,#20
  1394. eor v24.16b,v1.16b,v2.16b
  1395. add w5,w5,w9
  1396. eor v25.16b,v5.16b,v6.16b
  1397. add w6,w6,w10
  1398. eor v26.16b,v9.16b,v10.16b
  1399. add w7,w7,w11
  1400. eor v27.16b,v13.16b,v14.16b
  1401. add w8,w8,w12
  1402. eor v28.16b,v17.16b,v18.16b
  1403. eor w17,w17,w5
  1404. eor v29.16b,v21.16b,v22.16b
  1405. eor w19,w19,w6
  1406. ushr v1.4s,v24.4s,#20
  1407. eor w20,w20,w7
  1408. ushr v5.4s,v25.4s,#20
  1409. eor w21,w21,w8
  1410. ushr v9.4s,v26.4s,#20
  1411. ror w17,w17,#24
  1412. ushr v13.4s,v27.4s,#20
  1413. ror w19,w19,#24
  1414. ushr v17.4s,v28.4s,#20
  1415. ror w20,w20,#24
  1416. ushr v21.4s,v29.4s,#20
  1417. ror w21,w21,#24
  1418. sli v1.4s,v24.4s,#12
  1419. add w13,w13,w17
  1420. sli v5.4s,v25.4s,#12
  1421. add w14,w14,w19
  1422. sli v9.4s,v26.4s,#12
  1423. add w15,w15,w20
  1424. sli v13.4s,v27.4s,#12
  1425. add w16,w16,w21
  1426. sli v17.4s,v28.4s,#12
  1427. eor w9,w9,w13
  1428. sli v21.4s,v29.4s,#12
  1429. eor w10,w10,w14
  1430. add v0.4s,v0.4s,v1.4s
  1431. eor w11,w11,w15
  1432. add v4.4s,v4.4s,v5.4s
  1433. eor w12,w12,w16
  1434. add v8.4s,v8.4s,v9.4s
  1435. ror w9,w9,#25
  1436. add v12.4s,v12.4s,v13.4s
  1437. ror w10,w10,#25
  1438. add v16.4s,v16.4s,v17.4s
  1439. ror w11,w11,#25
  1440. add v20.4s,v20.4s,v21.4s
  1441. ror w12,w12,#25
  1442. eor v24.16b,v3.16b,v0.16b
  1443. add w5,w5,w10
  1444. eor v25.16b,v7.16b,v4.16b
  1445. add w6,w6,w11
  1446. eor v26.16b,v11.16b,v8.16b
  1447. add w7,w7,w12
  1448. eor v27.16b,v15.16b,v12.16b
  1449. add w8,w8,w9
  1450. eor v28.16b,v19.16b,v16.16b
  1451. eor w21,w21,w5
  1452. eor v29.16b,v23.16b,v20.16b
  1453. eor w17,w17,w6
  1454. ushr v3.4s,v24.4s,#24
  1455. eor w19,w19,w7
  1456. ushr v7.4s,v25.4s,#24
  1457. eor w20,w20,w8
  1458. ushr v11.4s,v26.4s,#24
  1459. ror w21,w21,#16
  1460. ushr v15.4s,v27.4s,#24
  1461. ror w17,w17,#16
  1462. ushr v19.4s,v28.4s,#24
  1463. ror w19,w19,#16
  1464. ushr v23.4s,v29.4s,#24
  1465. ror w20,w20,#16
  1466. sli v3.4s,v24.4s,#8
  1467. add w15,w15,w21
  1468. sli v7.4s,v25.4s,#8
  1469. add w16,w16,w17
  1470. sli v11.4s,v26.4s,#8
  1471. add w13,w13,w19
  1472. sli v15.4s,v27.4s,#8
  1473. add w14,w14,w20
  1474. sli v19.4s,v28.4s,#8
  1475. eor w10,w10,w15
  1476. sli v23.4s,v29.4s,#8
  1477. eor w11,w11,w16
  1478. add v2.4s,v2.4s,v3.4s
  1479. eor w12,w12,w13
  1480. add v6.4s,v6.4s,v7.4s
  1481. eor w9,w9,w14
  1482. add v10.4s,v10.4s,v11.4s
  1483. ror w10,w10,#20
  1484. add v14.4s,v14.4s,v15.4s
  1485. ror w11,w11,#20
  1486. add v18.4s,v18.4s,v19.4s
  1487. ror w12,w12,#20
  1488. add v22.4s,v22.4s,v23.4s
  1489. ror w9,w9,#20
  1490. eor v24.16b,v1.16b,v2.16b
  1491. add w5,w5,w10
  1492. eor v25.16b,v5.16b,v6.16b
  1493. add w6,w6,w11
  1494. eor v26.16b,v9.16b,v10.16b
  1495. add w7,w7,w12
  1496. eor v27.16b,v13.16b,v14.16b
  1497. add w8,w8,w9
  1498. eor v28.16b,v17.16b,v18.16b
  1499. eor w21,w21,w5
  1500. eor v29.16b,v21.16b,v22.16b
  1501. eor w17,w17,w6
  1502. ushr v1.4s,v24.4s,#25
  1503. eor w19,w19,w7
  1504. ushr v5.4s,v25.4s,#25
  1505. eor w20,w20,w8
  1506. ushr v9.4s,v26.4s,#25
  1507. ror w21,w21,#24
  1508. ushr v13.4s,v27.4s,#25
  1509. ror w17,w17,#24
  1510. ushr v17.4s,v28.4s,#25
  1511. ror w19,w19,#24
  1512. ushr v21.4s,v29.4s,#25
  1513. ror w20,w20,#24
  1514. sli v1.4s,v24.4s,#7
  1515. add w15,w15,w21
  1516. sli v5.4s,v25.4s,#7
  1517. add w16,w16,w17
  1518. sli v9.4s,v26.4s,#7
  1519. add w13,w13,w19
  1520. sli v13.4s,v27.4s,#7
  1521. add w14,w14,w20
  1522. sli v17.4s,v28.4s,#7
  1523. eor w10,w10,w15
  1524. sli v21.4s,v29.4s,#7
  1525. eor w11,w11,w16
  1526. ext v2.16b,v2.16b,v2.16b,#8
  1527. eor w12,w12,w13
  1528. ext v6.16b,v6.16b,v6.16b,#8
  1529. eor w9,w9,w14
  1530. ext v10.16b,v10.16b,v10.16b,#8
  1531. ror w10,w10,#25
  1532. ext v14.16b,v14.16b,v14.16b,#8
  1533. ror w11,w11,#25
  1534. ext v18.16b,v18.16b,v18.16b,#8
  1535. ror w12,w12,#25
  1536. ext v22.16b,v22.16b,v22.16b,#8
  1537. ror w9,w9,#25
  1538. ext v3.16b,v3.16b,v3.16b,#12
  1539. ext v7.16b,v7.16b,v7.16b,#12
  1540. ext v11.16b,v11.16b,v11.16b,#12
  1541. ext v15.16b,v15.16b,v15.16b,#12
  1542. ext v19.16b,v19.16b,v19.16b,#12
  1543. ext v23.16b,v23.16b,v23.16b,#12
  1544. ext v1.16b,v1.16b,v1.16b,#4
  1545. ext v5.16b,v5.16b,v5.16b,#4
  1546. ext v9.16b,v9.16b,v9.16b,#4
  1547. ext v13.16b,v13.16b,v13.16b,#4
  1548. ext v17.16b,v17.16b,v17.16b,#4
  1549. ext v21.16b,v21.16b,v21.16b,#4
  1550. add v0.4s,v0.4s,v1.4s
  1551. add w5,w5,w9
  1552. add v4.4s,v4.4s,v5.4s
  1553. add w6,w6,w10
  1554. add v8.4s,v8.4s,v9.4s
  1555. add w7,w7,w11
  1556. add v12.4s,v12.4s,v13.4s
  1557. add w8,w8,w12
  1558. add v16.4s,v16.4s,v17.4s
  1559. eor w17,w17,w5
  1560. add v20.4s,v20.4s,v21.4s
  1561. eor w19,w19,w6
  1562. eor v3.16b,v3.16b,v0.16b
  1563. eor w20,w20,w7
  1564. eor v7.16b,v7.16b,v4.16b
  1565. eor w21,w21,w8
  1566. eor v11.16b,v11.16b,v8.16b
  1567. ror w17,w17,#16
  1568. eor v15.16b,v15.16b,v12.16b
  1569. ror w19,w19,#16
  1570. eor v19.16b,v19.16b,v16.16b
  1571. ror w20,w20,#16
  1572. eor v23.16b,v23.16b,v20.16b
  1573. ror w21,w21,#16
  1574. rev32 v3.8h,v3.8h
  1575. add w13,w13,w17
  1576. rev32 v7.8h,v7.8h
  1577. add w14,w14,w19
  1578. rev32 v11.8h,v11.8h
  1579. add w15,w15,w20
  1580. rev32 v15.8h,v15.8h
  1581. add w16,w16,w21
  1582. rev32 v19.8h,v19.8h
  1583. eor w9,w9,w13
  1584. rev32 v23.8h,v23.8h
  1585. eor w10,w10,w14
  1586. add v2.4s,v2.4s,v3.4s
  1587. eor w11,w11,w15
  1588. add v6.4s,v6.4s,v7.4s
  1589. eor w12,w12,w16
  1590. add v10.4s,v10.4s,v11.4s
  1591. ror w9,w9,#20
  1592. add v14.4s,v14.4s,v15.4s
  1593. ror w10,w10,#20
  1594. add v18.4s,v18.4s,v19.4s
  1595. ror w11,w11,#20
  1596. add v22.4s,v22.4s,v23.4s
  1597. ror w12,w12,#20
  1598. eor v24.16b,v1.16b,v2.16b
  1599. add w5,w5,w9
  1600. eor v25.16b,v5.16b,v6.16b
  1601. add w6,w6,w10
  1602. eor v26.16b,v9.16b,v10.16b
  1603. add w7,w7,w11
  1604. eor v27.16b,v13.16b,v14.16b
  1605. add w8,w8,w12
  1606. eor v28.16b,v17.16b,v18.16b
  1607. eor w17,w17,w5
  1608. eor v29.16b,v21.16b,v22.16b
  1609. eor w19,w19,w6
  1610. ushr v1.4s,v24.4s,#20
  1611. eor w20,w20,w7
  1612. ushr v5.4s,v25.4s,#20
  1613. eor w21,w21,w8
  1614. ushr v9.4s,v26.4s,#20
  1615. ror w17,w17,#24
  1616. ushr v13.4s,v27.4s,#20
  1617. ror w19,w19,#24
  1618. ushr v17.4s,v28.4s,#20
  1619. ror w20,w20,#24
  1620. ushr v21.4s,v29.4s,#20
  1621. ror w21,w21,#24
  1622. sli v1.4s,v24.4s,#12
  1623. add w13,w13,w17
  1624. sli v5.4s,v25.4s,#12
  1625. add w14,w14,w19
  1626. sli v9.4s,v26.4s,#12
  1627. add w15,w15,w20
  1628. sli v13.4s,v27.4s,#12
  1629. add w16,w16,w21
  1630. sli v17.4s,v28.4s,#12
  1631. eor w9,w9,w13
  1632. sli v21.4s,v29.4s,#12
  1633. eor w10,w10,w14
  1634. add v0.4s,v0.4s,v1.4s
  1635. eor w11,w11,w15
  1636. add v4.4s,v4.4s,v5.4s
  1637. eor w12,w12,w16
  1638. add v8.4s,v8.4s,v9.4s
  1639. ror w9,w9,#25
  1640. add v12.4s,v12.4s,v13.4s
  1641. ror w10,w10,#25
  1642. add v16.4s,v16.4s,v17.4s
  1643. ror w11,w11,#25
  1644. add v20.4s,v20.4s,v21.4s
  1645. ror w12,w12,#25
  1646. eor v24.16b,v3.16b,v0.16b
  1647. add w5,w5,w10
  1648. eor v25.16b,v7.16b,v4.16b
  1649. add w6,w6,w11
  1650. eor v26.16b,v11.16b,v8.16b
  1651. add w7,w7,w12
  1652. eor v27.16b,v15.16b,v12.16b
  1653. add w8,w8,w9
  1654. eor v28.16b,v19.16b,v16.16b
  1655. eor w21,w21,w5
  1656. eor v29.16b,v23.16b,v20.16b
  1657. eor w17,w17,w6
  1658. ushr v3.4s,v24.4s,#24
  1659. eor w19,w19,w7
  1660. ushr v7.4s,v25.4s,#24
  1661. eor w20,w20,w8
  1662. ushr v11.4s,v26.4s,#24
  1663. ror w21,w21,#16
  1664. ushr v15.4s,v27.4s,#24
  1665. ror w17,w17,#16
  1666. ushr v19.4s,v28.4s,#24
  1667. ror w19,w19,#16
  1668. ushr v23.4s,v29.4s,#24
  1669. ror w20,w20,#16
  1670. sli v3.4s,v24.4s,#8
  1671. add w15,w15,w21
  1672. sli v7.4s,v25.4s,#8
  1673. add w16,w16,w17
  1674. sli v11.4s,v26.4s,#8
  1675. add w13,w13,w19
  1676. sli v15.4s,v27.4s,#8
  1677. add w14,w14,w20
  1678. sli v19.4s,v28.4s,#8
  1679. eor w10,w10,w15
  1680. sli v23.4s,v29.4s,#8
  1681. eor w11,w11,w16
  1682. add v2.4s,v2.4s,v3.4s
  1683. eor w12,w12,w13
  1684. add v6.4s,v6.4s,v7.4s
  1685. eor w9,w9,w14
  1686. add v10.4s,v10.4s,v11.4s
  1687. ror w10,w10,#20
  1688. add v14.4s,v14.4s,v15.4s
  1689. ror w11,w11,#20
  1690. add v18.4s,v18.4s,v19.4s
  1691. ror w12,w12,#20
  1692. add v22.4s,v22.4s,v23.4s
  1693. ror w9,w9,#20
  1694. eor v24.16b,v1.16b,v2.16b
  1695. add w5,w5,w10
  1696. eor v25.16b,v5.16b,v6.16b
  1697. add w6,w6,w11
  1698. eor v26.16b,v9.16b,v10.16b
  1699. add w7,w7,w12
  1700. eor v27.16b,v13.16b,v14.16b
  1701. add w8,w8,w9
  1702. eor v28.16b,v17.16b,v18.16b
  1703. eor w21,w21,w5
  1704. eor v29.16b,v21.16b,v22.16b
  1705. eor w17,w17,w6
  1706. ushr v1.4s,v24.4s,#25
  1707. eor w19,w19,w7
  1708. ushr v5.4s,v25.4s,#25
  1709. eor w20,w20,w8
  1710. ushr v9.4s,v26.4s,#25
  1711. ror w21,w21,#24
  1712. ushr v13.4s,v27.4s,#25
  1713. ror w17,w17,#24
  1714. ushr v17.4s,v28.4s,#25
  1715. ror w19,w19,#24
  1716. ushr v21.4s,v29.4s,#25
  1717. ror w20,w20,#24
  1718. sli v1.4s,v24.4s,#7
  1719. add w15,w15,w21
  1720. sli v5.4s,v25.4s,#7
  1721. add w16,w16,w17
  1722. sli v9.4s,v26.4s,#7
  1723. add w13,w13,w19
  1724. sli v13.4s,v27.4s,#7
  1725. add w14,w14,w20
  1726. sli v17.4s,v28.4s,#7
  1727. eor w10,w10,w15
  1728. sli v21.4s,v29.4s,#7
  1729. eor w11,w11,w16
  1730. ext v2.16b,v2.16b,v2.16b,#8
  1731. eor w12,w12,w13
  1732. ext v6.16b,v6.16b,v6.16b,#8
  1733. eor w9,w9,w14
  1734. ext v10.16b,v10.16b,v10.16b,#8
  1735. ror w10,w10,#25
  1736. ext v14.16b,v14.16b,v14.16b,#8
  1737. ror w11,w11,#25
  1738. ext v18.16b,v18.16b,v18.16b,#8
  1739. ror w12,w12,#25
  1740. ext v22.16b,v22.16b,v22.16b,#8
  1741. ror w9,w9,#25
  1742. ext v3.16b,v3.16b,v3.16b,#4
  1743. ext v7.16b,v7.16b,v7.16b,#4
  1744. ext v11.16b,v11.16b,v11.16b,#4
  1745. ext v15.16b,v15.16b,v15.16b,#4
  1746. ext v19.16b,v19.16b,v19.16b,#4
  1747. ext v23.16b,v23.16b,v23.16b,#4
  1748. ext v1.16b,v1.16b,v1.16b,#12
  1749. ext v5.16b,v5.16b,v5.16b,#12
  1750. ext v9.16b,v9.16b,v9.16b,#12
  1751. ext v13.16b,v13.16b,v13.16b,#12
  1752. ext v17.16b,v17.16b,v17.16b,#12
  1753. ext v21.16b,v21.16b,v21.16b,#12
  1754. cbnz x4,.Loop_lower_neon
  1755. add w5,w5,w22 // accumulate key block
  1756. ldp q24,q25,[sp,#0]
  1757. add x6,x6,x22,lsr#32
  1758. ldp q26,q27,[sp,#32]
  1759. add w7,w7,w23
  1760. ldp q28,q29,[sp,#64]
  1761. add x8,x8,x23,lsr#32
  1762. add v0.4s,v0.4s,v24.4s
  1763. add w9,w9,w24
  1764. add v4.4s,v4.4s,v24.4s
  1765. add x10,x10,x24,lsr#32
  1766. add v8.4s,v8.4s,v24.4s
  1767. add w11,w11,w25
  1768. add v12.4s,v12.4s,v24.4s
  1769. add x12,x12,x25,lsr#32
  1770. add v16.4s,v16.4s,v24.4s
  1771. add w13,w13,w26
  1772. add v20.4s,v20.4s,v24.4s
  1773. add x14,x14,x26,lsr#32
  1774. add v2.4s,v2.4s,v26.4s
  1775. add w15,w15,w27
  1776. add v6.4s,v6.4s,v26.4s
  1777. add x16,x16,x27,lsr#32
  1778. add v10.4s,v10.4s,v26.4s
  1779. add w17,w17,w28
  1780. add v14.4s,v14.4s,v26.4s
  1781. add x19,x19,x28,lsr#32
  1782. add v18.4s,v18.4s,v26.4s
  1783. add w20,w20,w30
  1784. add v22.4s,v22.4s,v26.4s
  1785. add x21,x21,x30,lsr#32
  1786. add v19.4s,v19.4s,v31.4s // +4
  1787. add x5,x5,x6,lsl#32 // pack
  1788. add v23.4s,v23.4s,v31.4s // +4
  1789. add x7,x7,x8,lsl#32
  1790. add v3.4s,v3.4s,v27.4s
  1791. ldp x6,x8,[x1,#0] // load input
  1792. add v7.4s,v7.4s,v28.4s
  1793. add x9,x9,x10,lsl#32
  1794. add v11.4s,v11.4s,v29.4s
  1795. add x11,x11,x12,lsl#32
  1796. add v15.4s,v15.4s,v30.4s
  1797. ldp x10,x12,[x1,#16]
  1798. add v19.4s,v19.4s,v27.4s
  1799. add x13,x13,x14,lsl#32
  1800. add v23.4s,v23.4s,v28.4s
  1801. add x15,x15,x16,lsl#32
  1802. add v1.4s,v1.4s,v25.4s
  1803. ldp x14,x16,[x1,#32]
  1804. add v5.4s,v5.4s,v25.4s
  1805. add x17,x17,x19,lsl#32
  1806. add v9.4s,v9.4s,v25.4s
  1807. add x20,x20,x21,lsl#32
  1808. add v13.4s,v13.4s,v25.4s
  1809. ldp x19,x21,[x1,#48]
  1810. add v17.4s,v17.4s,v25.4s
  1811. add x1,x1,#64
  1812. add v21.4s,v21.4s,v25.4s
  1813. #ifdef __ARMEB__
  1814. rev x5,x5
  1815. rev x7,x7
  1816. rev x9,x9
  1817. rev x11,x11
  1818. rev x13,x13
  1819. rev x15,x15
  1820. rev x17,x17
  1821. rev x20,x20
  1822. #endif
  1823. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1824. eor x5,x5,x6
  1825. eor x7,x7,x8
  1826. eor x9,x9,x10
  1827. eor x11,x11,x12
  1828. eor x13,x13,x14
  1829. eor v0.16b,v0.16b,v24.16b
  1830. eor x15,x15,x16
  1831. eor v1.16b,v1.16b,v25.16b
  1832. eor x17,x17,x19
  1833. eor v2.16b,v2.16b,v26.16b
  1834. eor x20,x20,x21
  1835. eor v3.16b,v3.16b,v27.16b
  1836. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
  1837. stp x5,x7,[x0,#0] // store output
  1838. add x28,x28,#7 // increment counter
  1839. stp x9,x11,[x0,#16]
  1840. stp x13,x15,[x0,#32]
  1841. stp x17,x20,[x0,#48]
  1842. add x0,x0,#64
  1843. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  1844. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  1845. eor v4.16b,v4.16b,v24.16b
  1846. eor v5.16b,v5.16b,v25.16b
  1847. eor v6.16b,v6.16b,v26.16b
  1848. eor v7.16b,v7.16b,v27.16b
  1849. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  1850. ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
  1851. eor v8.16b,v8.16b,v0.16b
  1852. ldp q24,q25,[sp,#0]
  1853. eor v9.16b,v9.16b,v1.16b
  1854. ldp q26,q27,[sp,#32]
  1855. eor v10.16b,v10.16b,v2.16b
  1856. eor v11.16b,v11.16b,v3.16b
  1857. st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
  1858. ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
  1859. eor v12.16b,v12.16b,v4.16b
  1860. eor v13.16b,v13.16b,v5.16b
  1861. eor v14.16b,v14.16b,v6.16b
  1862. eor v15.16b,v15.16b,v7.16b
  1863. st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
  1864. ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
  1865. eor v16.16b,v16.16b,v8.16b
  1866. eor v17.16b,v17.16b,v9.16b
  1867. eor v18.16b,v18.16b,v10.16b
  1868. eor v19.16b,v19.16b,v11.16b
  1869. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  1870. shl v0.4s,v31.4s,#1 // 4 -> 8
  1871. eor v20.16b,v20.16b,v12.16b
  1872. eor v21.16b,v21.16b,v13.16b
  1873. eor v22.16b,v22.16b,v14.16b
  1874. eor v23.16b,v23.16b,v15.16b
  1875. st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
  1876. add v27.4s,v27.4s,v0.4s // += 8
  1877. add v28.4s,v28.4s,v0.4s
  1878. add v29.4s,v29.4s,v0.4s
  1879. add v30.4s,v30.4s,v0.4s
  1880. b.hs .Loop_outer_512_neon
  1881. adds x2,x2,#512
  1882. ushr v0.4s,v31.4s,#2 // 4 -> 1
  1883. ldp d8,d9,[sp,#128+0] // meet ABI requirements
  1884. ldp d10,d11,[sp,#128+16]
  1885. ldp d12,d13,[sp,#128+32]
  1886. ldp d14,d15,[sp,#128+48]
  1887. stp q24,q31,[sp,#0] // wipe off-load area
  1888. stp q24,q31,[sp,#32]
  1889. stp q24,q31,[sp,#64]
  1890. b.eq .Ldone_512_neon
  1891. cmp x2,#192
  1892. sub v27.4s,v27.4s,v0.4s // -= 1
  1893. sub v28.4s,v28.4s,v0.4s
  1894. sub v29.4s,v29.4s,v0.4s
  1895. add sp,sp,#128
  1896. b.hs .Loop_outer_neon
  1897. eor v25.16b,v25.16b,v25.16b
  1898. eor v26.16b,v26.16b,v26.16b
  1899. eor v27.16b,v27.16b,v27.16b
  1900. eor v28.16b,v28.16b,v28.16b
  1901. eor v29.16b,v29.16b,v29.16b
  1902. eor v30.16b,v30.16b,v30.16b
  1903. b .Loop_outer
  1904. .Ldone_512_neon:
  1905. ldp x19,x20,[x29,#16]
  1906. add sp,sp,#128+64
  1907. ldp x21,x22,[x29,#32]
  1908. ldp x23,x24,[x29,#48]
  1909. ldp x25,x26,[x29,#64]
  1910. ldp x27,x28,[x29,#80]
  1911. ldp x29,x30,[sp],#96
  1912. AARCH64_VALIDATE_LINK_REGISTER
  1913. ret
  1914. .size ChaCha20_512_neon,.-ChaCha20_512_neon
  1915. #endif
  1916. #endif // !OPENSSL_NO_ASM
  1917. .section .note.GNU-stack,"",%progbits