chacha-armv8.S 40 KB


  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(__aarch64__)
  11. #if defined(BORINGSSL_PREFIX)
  12. #include <boringssl_prefix_symbols_asm.h>
  13. #endif
  14. #include <openssl/arm_arch.h>
  // Read-only constants used by the ChaCha20 routines in this file.
  15. .section .rodata
  16. .align 5
  // Lsigma: the first four 32-bit state words. Read least-significant byte
  // first, the two 64-bit values spell the ASCII text "expand 32-byte k".
  17. Lsigma:
  18. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  // Lone: the four 32-bit lanes {1,0,0,0}. The NEON code loads it into v31
  // from the address 16 bytes past Lsigma and uses it as the counter step.
  19. Lone:
  20. .long 1,0,0,0
  // NUL-terminated ASCII credit string:
  // "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  21. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  22. .align 2
  // ---------------------------------------------------------------------
  // ChaCha20_ctr32 -- exported entry point (declared .globl below).
  //
  // Register use on entry, as read by the code in this routine:
  //   x0 = output pointer  (written with stp / strb)
  //   x1 = input pointer   (read with ldp / ldrb)
  //   x2 = length in bytes (0 => immediate return through Labort)
  //   x3 = pointer to 32 bytes of key            (two ldp loads)
  //   x4 = pointer to 16 bytes of counter block  (one ldp load)
  // NOTE(review): presumably the C prototype is
  //   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t len,
  //                       const uint32_t key[8], const uint32_t counter[4]);
  // confirm against the declaring header.
  //
  // When x2 >= 192 and the ARMV7_NEON bit is set in OPENSSL_armcap_P, control
  // is handed to ChaCha20_neon before any stack frame is built. Otherwise the
  // integer-only code at Lshort produces one 64-byte key-stream block per
  // pass of Loop_outer and XORs it with the input.
  //
  // Stack: 96-byte frame holding x29/x30 and x19..x28, plus a 64-byte scratch
  // area below it that is only written on the partial-block tail path.
  // ---------------------------------------------------------------------
  23. .text
  24. .globl ChaCha20_ctr32
  // COFF symbol record for the entry point (.type 32 is the COFF function type).
  25. .def ChaCha20_ctr32
  26. .type 32
  27. .endef
  28. .align 5
  29. ChaCha20_ctr32:
  // Macro from <openssl/arm_arch.h>; its expansion is not visible in this file.
  30. AARCH64_VALID_CALL_TARGET
  // Zero length: nothing to do, return without touching the stack.
  31. cbz x2,Labort
  // x5 = page address of OPENSSL_armcap_P (relocation form depends on HWASan).
  32. #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
  33. adrp x5,:pg_hi21_nc:OPENSSL_armcap_P
  34. #else
  35. adrp x5,OPENSSL_armcap_P
  36. #endif
  // Inputs shorter than 192 bytes always take the integer-only path.
  37. cmp x2,#192
  38. b.lo Lshort
  // Otherwise use the NEON routine if the capability word advertises NEON.
  // This is a plain branch, so ChaCha20_neon returns directly to our caller.
  39. ldr w17,[x5,:lo12:OPENSSL_armcap_P]
  40. tst w17,#ARMV7_NEON
  41. b.ne ChaCha20_neon
  42. Lshort:
  // Macro from <openssl/arm_arch.h>; paired with AARCH64_VALIDATE_LINK_REGISTER
  // before each ret below (presumably pointer authentication -- confirm).
  43. AARCH64_SIGN_LINK_REGISTER
  // Build the 96-byte frame: x29/x30 at [sp], x19..x28 at [sp,#16]..[sp,#95].
  44. stp x29,x30,[sp,#-96]!
  45. add x29,sp,#0
  // x5 = address of Lsigma (the capability pointer in x5 is no longer needed).
  46. adrp x5,Lsigma
  47. add x5,x5,:lo12:Lsigma
  48. stp x19,x20,[sp,#16]
  49. stp x21,x22,[sp,#32]
  50. stp x23,x24,[sp,#48]
  51. stp x25,x26,[sp,#64]
  52. stp x27,x28,[sp,#80]
  // 64-byte scratch area; written only at Less_than_64 and wiped afterwards.
  53. sub sp,sp,#64
  // Load the 16-word input state, two 32-bit words per 64-bit register:
  //   x22,x23 = sigma   x24..x27 = key   x28,x30 = counter block.
  // x30 (the link register) is used as a data register from here on; its saved
  // value is reloaded from the frame in the epilogue.
  54. ldp x22,x23,[x5] // load sigma
  55. ldp x24,x25,[x3] // load key
  56. ldp x26,x27,[x3,#16]
  57. ldp x28,x30,[x4] // load counter
  // Big-endian builds: swap the two 32-bit halves of every key/counter
  // register (the sigma registers are left as loaded).
  58. #ifdef __ARMEB__
  59. ror x24,x24,#32
  60. ror x25,x25,#32
  61. ror x26,x26,#32
  62. ror x27,x27,#32
  63. ror x28,x28,#32
  64. ror x30,x30,#32
  65. #endif
  // ---- one 64-byte block per pass ----
  // Copy the input state into the 16 working registers w5..w17, w19..w21
  // (x18 is not used): low halves via mov, high halves via lsr #32.
  66. Loop_outer:
  67. mov w5,w22 // unpack key block
  68. lsr x6,x22,#32
  69. mov w7,w23
  70. lsr x8,x23,#32
  71. mov w9,w24
  72. lsr x10,x24,#32
  73. mov w11,w25
  74. lsr x12,x25,#32
  75. mov w13,w26
  76. lsr x14,x26,#32
  77. mov w15,w27
  78. lsr x16,x27,#32
  79. mov w17,w28
  80. lsr x19,x28,#32
  81. mov w20,w30
  82. lsr x21,x30,#32
  // x4 = number of passes through Loop (each pass is one double round).
  83. mov x4,#10
  // len -= 64. The flags set here are consumed later by "b.lo Ltail" and
  // "b.hi Loop_outer"; no instruction in between writes the flags.
  84. subs x2,x2,#64
  // ---- double round ----
  // First half: quarter-rounds on (w5,w9,w13,w17) (w6,w10,w14,w19)
  // (w7,w11,w15,w20) (w8,w12,w16,w21), four-way interleaved.
  // Rotations are right-rotates by 16, 20, 24 and 25 bits.
  85. Loop:
  86. sub x4,x4,#1
  87. add w5,w5,w9
  88. add w6,w6,w10
  89. add w7,w7,w11
  90. add w8,w8,w12
  91. eor w17,w17,w5
  92. eor w19,w19,w6
  93. eor w20,w20,w7
  94. eor w21,w21,w8
  95. ror w17,w17,#16
  96. ror w19,w19,#16
  97. ror w20,w20,#16
  98. ror w21,w21,#16
  99. add w13,w13,w17
  100. add w14,w14,w19
  101. add w15,w15,w20
  102. add w16,w16,w21
  103. eor w9,w9,w13
  104. eor w10,w10,w14
  105. eor w11,w11,w15
  106. eor w12,w12,w16
  107. ror w9,w9,#20
  108. ror w10,w10,#20
  109. ror w11,w11,#20
  110. ror w12,w12,#20
  111. add w5,w5,w9
  112. add w6,w6,w10
  113. add w7,w7,w11
  114. add w8,w8,w12
  115. eor w17,w17,w5
  116. eor w19,w19,w6
  117. eor w20,w20,w7
  118. eor w21,w21,w8
  119. ror w17,w17,#24
  120. ror w19,w19,#24
  121. ror w20,w20,#24
  122. ror w21,w21,#24
  123. add w13,w13,w17
  124. add w14,w14,w19
  125. add w15,w15,w20
  126. add w16,w16,w21
  127. eor w9,w9,w13
  128. eor w10,w10,w14
  129. eor w11,w11,w15
  130. eor w12,w12,w16
  131. ror w9,w9,#25
  132. ror w10,w10,#25
  133. ror w11,w11,#25
  134. ror w12,w12,#25
  // Second half: the same quarter-round on the shifted groups
  // (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
  135. add w5,w5,w10
  136. add w6,w6,w11
  137. add w7,w7,w12
  138. add w8,w8,w9
  139. eor w21,w21,w5
  140. eor w17,w17,w6
  141. eor w19,w19,w7
  142. eor w20,w20,w8
  143. ror w21,w21,#16
  144. ror w17,w17,#16
  145. ror w19,w19,#16
  146. ror w20,w20,#16
  147. add w15,w15,w21
  148. add w16,w16,w17
  149. add w13,w13,w19
  150. add w14,w14,w20
  151. eor w10,w10,w15
  152. eor w11,w11,w16
  153. eor w12,w12,w13
  154. eor w9,w9,w14
  155. ror w10,w10,#20
  156. ror w11,w11,#20
  157. ror w12,w12,#20
  158. ror w9,w9,#20
  159. add w5,w5,w10
  160. add w6,w6,w11
  161. add w7,w7,w12
  162. add w8,w8,w9
  163. eor w21,w21,w5
  164. eor w17,w17,w6
  165. eor w19,w19,w7
  166. eor w20,w20,w8
  167. ror w21,w21,#24
  168. ror w17,w17,#24
  169. ror w19,w19,#24
  170. ror w20,w20,#24
  171. add w15,w15,w21
  172. add w16,w16,w17
  173. add w13,w13,w19
  174. add w14,w14,w20
  175. eor w10,w10,w15
  176. eor w11,w11,w16
  177. eor w12,w12,w13
  178. eor w9,w9,w14
  179. ror w10,w10,#25
  180. ror w11,w11,#25
  181. ror w12,w12,#25
  182. ror w9,w9,#25
  183. cbnz x4,Loop
  // Add the original input state back in. The high-half words use 64-bit adds
  // with lsr #32; any carry into bit 32 is shifted out by the lsl #32 in the
  // pack step that follows.
  184. add w5,w5,w22 // accumulate key block
  185. add x6,x6,x22,lsr#32
  186. add w7,w7,w23
  187. add x8,x8,x23,lsr#32
  188. add w9,w9,w24
  189. add x10,x10,x24,lsr#32
  190. add w11,w11,w25
  191. add x12,x12,x25,lsr#32
  192. add w13,w13,w26
  193. add x14,x14,x26,lsr#32
  194. add w15,w15,w27
  195. add x16,x16,x27,lsr#32
  196. add w17,w17,w28
  197. add x19,x19,x28,lsr#32
  198. add w20,w20,w30
  199. add x21,x21,x30,lsr#32
  // Flags still come from "subs x2,x2,#64": fewer than 64 bytes were left,
  // so finish byte by byte.
  200. b.lo Ltail
  // Full block: re-pack word pairs into eight 64-bit values while loading the
  // 64 input bytes into the registers that have just been freed.
  201. add x5,x5,x6,lsl#32 // pack
  202. add x7,x7,x8,lsl#32
  203. ldp x6,x8,[x1,#0] // load input
  204. add x9,x9,x10,lsl#32
  205. add x11,x11,x12,lsl#32
  206. ldp x10,x12,[x1,#16]
  207. add x13,x13,x14,lsl#32
  208. add x15,x15,x16,lsl#32
  209. ldp x14,x16,[x1,#32]
  210. add x17,x17,x19,lsl#32
  211. add x20,x20,x21,lsl#32
  212. ldp x19,x21,[x1,#48]
  213. add x1,x1,#64
  // Big-endian builds: byte-reverse each packed key-stream value before the XOR.
  214. #ifdef __ARMEB__
  215. rev x5,x5
  216. rev x7,x7
  217. rev x9,x9
  218. rev x11,x11
  219. rev x13,x13
  220. rev x15,x15
  221. rev x17,x17
  222. rev x20,x20
  223. #endif
  // output = key stream XOR input
  224. eor x5,x5,x6
  225. eor x7,x7,x8
  226. eor x9,x9,x10
  227. eor x11,x11,x12
  228. eor x13,x13,x14
  229. eor x15,x15,x16
  230. eor x17,x17,x19
  231. eor x20,x20,x21
  232. stp x5,x7,[x0,#0] // store output
  // x28 holds the first two counter-block words; this is a 64-bit add of 1.
  233. add x28,x28,#1 // increment counter
  234. stp x9,x11,[x0,#16]
  235. stp x13,x15,[x0,#32]
  236. stp x17,x20,[x0,#48]
  237. add x0,x0,#64
  // Same flags again: more than 64 bytes were left before this block, so
  // there is more input to process.
  238. b.hi Loop_outer
  // Epilogue for inputs that are a whole number of blocks: drop the scratch
  // area, restore x19..x28 through the frame pointer, then x29/x30.
  239. ldp x19,x20,[x29,#16]
  240. add sp,sp,#64
  241. ldp x21,x22,[x29,#32]
  242. ldp x23,x24,[x29,#48]
  243. ldp x25,x26,[x29,#64]
  244. ldp x27,x28,[x29,#80]
  245. ldp x29,x30,[sp],#96
  246. AARCH64_VALIDATE_LINK_REGISTER
  // Shared return; also the target of the zero-length check at entry.
  247. Labort:
  248. ret
  249. .align 4
  // ---- partial final block ----
  // Undo the "subs x2,x2,#64" so that x2 is the number of bytes left (1..63).
  250. Ltail:
  251. add x2,x2,#64
  // Also entered from Ltail_neon in ChaCha20_neon with x2 < 64 and the integer
  // key-stream block still unpacked in w5..w17, w19..w21.
  252. Less_than_64:
  // Point x1 (input), x0 (output, biased by -1) and x4 (scratch) at the end of
  // their ranges and negate x2, so Loop_tail can index with a negative offset
  // that counts up to zero. The -1 bias on x0 compensates for the store using
  // x2 after it has been incremented.
  253. sub x0,x0,#1
  254. add x1,x1,x2
  255. add x0,x0,x2
  256. add x4,sp,x2
  257. neg x2,x2
  258. add x5,x5,x6,lsl#32 // pack
  259. add x7,x7,x8,lsl#32
  260. add x9,x9,x10,lsl#32
  261. add x11,x11,x12,lsl#32
  262. add x13,x13,x14,lsl#32
  263. add x15,x15,x16,lsl#32
  264. add x17,x17,x19,lsl#32
  265. add x20,x20,x21,lsl#32
  266. #ifdef __ARMEB__
  267. rev x5,x5
  268. rev x7,x7
  269. rev x9,x9
  270. rev x11,x11
  271. rev x13,x13
  272. rev x15,x15
  273. rev x17,x17
  274. rev x20,x20
  275. #endif
  // Spill the whole 64-byte key-stream block to the scratch area.
  276. stp x5,x7,[sp,#0]
  277. stp x9,x11,[sp,#16]
  278. stp x13,x15,[sp,#32]
  279. stp x17,x20,[sp,#48]
  // One byte per pass: output byte = input byte XOR key-stream byte.
  280. Loop_tail:
  281. ldrb w10,[x1,x2]
  282. ldrb w11,[x4,x2]
  283. add x2,x2,#1
  284. eor w10,w10,w11
  285. strb w10,[x0,x2]
  286. cbnz x2,Loop_tail
  // Overwrite the spilled key stream with zeros before releasing the stack.
  287. stp xzr,xzr,[sp,#0]
  288. stp xzr,xzr,[sp,#16]
  289. stp xzr,xzr,[sp,#32]
  290. stp xzr,xzr,[sp,#48]
  // Epilogue (same sequence as the full-block exit above).
  291. ldp x19,x20,[x29,#16]
  292. add sp,sp,#64
  293. ldp x21,x22,[x29,#32]
  294. ldp x23,x24,[x29,#48]
  295. ldp x25,x26,[x29,#64]
  296. ldp x27,x28,[x29,#80]
  297. ldp x29,x30,[sp],#96
  298. AARCH64_VALIDATE_LINK_REGISTER
  299. ret
  // ---------------------------------------------------------------------
  // ChaCha20_neon -- NEON path, reached from ChaCha20_ctr32 by a plain branch
  // when x2 >= 192 and the ARMV7_NEON capability bit is set. No .globl
  // directive for it appears next to its definition. It receives the same
  // registers as ChaCha20_ctr32 (x0 = out, x1 = in, x2 = length, x3 = key,
  // x4 = counter block) and its ret returns to that routine's caller.
  //
  // Lengths >= 512 are forwarded to L512_or_more_neon (inside
  // ChaCha20_512_neon) once the 96-byte frame has been built. Shorter inputs
  // are processed 256 bytes per pass of Loop_outer_neon: one block in the
  // integer registers (same layout as ChaCha20_ctr32) interleaved with three
  // blocks held in v0-v3, v4-v7 and v16-v19 (one four-word row per vector).
  //
  // Long-lived vectors: v24 = sigma, v25/v26 = key, v27/v28/v29 = counter
  // rows of the three vector blocks, v31 = counter step. v20-v23 are scratch.
  // ---------------------------------------------------------------------
  300. .def ChaCha20_neon
  301. .type 32
  302. .endef
  303. .align 5
  304. ChaCha20_neon:
  // Macro from <openssl/arm_arch.h>; its expansion is not visible in this file.
  305. AARCH64_SIGN_LINK_REGISTER
  // Same 96-byte frame as ChaCha20_ctr32: x29/x30 plus x19..x28.
  306. stp x29,x30,[sp,#-96]!
  307. add x29,sp,#0
  308. adrp x5,Lsigma
  309. add x5,x5,:lo12:Lsigma
  310. stp x19,x20,[sp,#16]
  311. stp x21,x22,[sp,#32]
  312. stp x23,x24,[sp,#48]
  313. stp x25,x26,[sp,#64]
  314. stp x27,x28,[sp,#80]
  // 512 bytes or more: continue in ChaCha20_512_neon, which shares this frame
  // layout and expects x5 = address of Lsigma.
  315. cmp x2,#512
  316. b.hs L512_or_more_neon
  // 64-byte scratch area, written only on the partial-block tail paths.
  317. sub sp,sp,#64
  // Load the input state twice: packed into x22..x28,x30 for the integer
  // block and as rows v24..v27 for the vector blocks. The post-increment on
  // the sigma load leaves x5 pointing at Lone for the v31 load.
  318. ldp x22,x23,[x5] // load sigma
  319. ld1 {v24.4s},[x5],#16
  320. ldp x24,x25,[x3] // load key
  321. ldp x26,x27,[x3,#16]
  322. ld1 {v25.4s,v26.4s},[x3]
  323. ldp x28,x30,[x4] // load counter
  324. ld1 {v27.4s},[x4]
  325. ld1 {v31.4s},[x5]
  // Big-endian builds: swap the 32-bit halves of v24's doublewords and of
  // the integer key/counter registers.
  326. #ifdef __ARMEB__
  327. rev64 v24.4s,v24.4s
  328. ror x24,x24,#32
  329. ror x25,x25,#32
  330. ror x26,x26,#32
  331. ror x27,x27,#32
  332. ror x28,x28,#32
  333. ror x30,x30,#32
  334. #endif
  // The integer block uses the counter as loaded; the three vector blocks use
  // counter+1, +2 and +3 (v27, v28, v29). v31 then becomes {4,0,0,0}.
  335. add v27.4s,v27.4s,v31.4s // += 1
  336. add v28.4s,v27.4s,v31.4s
  337. add v29.4s,v28.4s,v31.4s
  338. shl v31.4s,v31.4s,#2 // 1 -> 4
  // ---- four 64-byte blocks per pass ----
  // Initialise the working copies: integer block in w5..w17, w19..w21 and the
  // vector blocks as rows (v0,v1,v2,v3), (v4,v5,v6,v7), (v16,v17,v18,v19).
  339. Loop_outer_neon:
  340. mov w5,w22 // unpack key block
  341. lsr x6,x22,#32
  342. mov v0.16b,v24.16b
  343. mov w7,w23
  344. lsr x8,x23,#32
  345. mov v4.16b,v24.16b
  346. mov w9,w24
  347. lsr x10,x24,#32
  348. mov v16.16b,v24.16b
  349. mov w11,w25
  350. mov v1.16b,v25.16b
  351. lsr x12,x25,#32
  352. mov v5.16b,v25.16b
  353. mov w13,w26
  354. mov v17.16b,v25.16b
  355. lsr x14,x26,#32
  356. mov v3.16b,v27.16b
  357. mov w15,w27
  358. mov v7.16b,v28.16b
  359. lsr x16,x27,#32
  360. mov v19.16b,v29.16b
  361. mov w17,w28
  362. mov v2.16b,v26.16b
  363. lsr x19,x28,#32
  364. mov v6.16b,v26.16b
  365. mov w20,w30
  366. mov v18.16b,v26.16b
  367. lsr x21,x30,#32
  // x4 = number of passes through Loop_neon (one double round each).
  368. mov x4,#10
  // len -= 256. These flags are consumed by "b.lo Ltail_neon" and
  // "b.hi Loop_outer_neon"; nothing in between writes the flags.
  369. subs x2,x2,#256
  // ---- double round, integer and vector instructions interleaved ----
  // Integer side: identical to Loop in ChaCha20_ctr32.
  // Vector side: each instruction handles one row of one block. rev32 on .8h
  // swaps the 16-bit halves of every 32-bit lane (rotate by 16); each
  // ushr #n / sli #(32-n) pair, going through scratch v20-v22, is a right
  // rotate by n (20, 24, 25).
  370. Loop_neon:
  371. sub x4,x4,#1
  372. add v0.4s,v0.4s,v1.4s
  373. add w5,w5,w9
  374. add v4.4s,v4.4s,v5.4s
  375. add w6,w6,w10
  376. add v16.4s,v16.4s,v17.4s
  377. add w7,w7,w11
  378. eor v3.16b,v3.16b,v0.16b
  379. add w8,w8,w12
  380. eor v7.16b,v7.16b,v4.16b
  381. eor w17,w17,w5
  382. eor v19.16b,v19.16b,v16.16b
  383. eor w19,w19,w6
  384. rev32 v3.8h,v3.8h
  385. eor w20,w20,w7
  386. rev32 v7.8h,v7.8h
  387. eor w21,w21,w8
  388. rev32 v19.8h,v19.8h
  389. ror w17,w17,#16
  390. add v2.4s,v2.4s,v3.4s
  391. ror w19,w19,#16
  392. add v6.4s,v6.4s,v7.4s
  393. ror w20,w20,#16
  394. add v18.4s,v18.4s,v19.4s
  395. ror w21,w21,#16
  396. eor v20.16b,v1.16b,v2.16b
  397. add w13,w13,w17
  398. eor v21.16b,v5.16b,v6.16b
  399. add w14,w14,w19
  400. eor v22.16b,v17.16b,v18.16b
  401. add w15,w15,w20
  402. ushr v1.4s,v20.4s,#20
  403. add w16,w16,w21
  404. ushr v5.4s,v21.4s,#20
  405. eor w9,w9,w13
  406. ushr v17.4s,v22.4s,#20
  407. eor w10,w10,w14
  408. sli v1.4s,v20.4s,#12
  409. eor w11,w11,w15
  410. sli v5.4s,v21.4s,#12
  411. eor w12,w12,w16
  412. sli v17.4s,v22.4s,#12
  413. ror w9,w9,#20
  414. add v0.4s,v0.4s,v1.4s
  415. ror w10,w10,#20
  416. add v4.4s,v4.4s,v5.4s
  417. ror w11,w11,#20
  418. add v16.4s,v16.4s,v17.4s
  419. ror w12,w12,#20
  420. eor v20.16b,v3.16b,v0.16b
  421. add w5,w5,w9
  422. eor v21.16b,v7.16b,v4.16b
  423. add w6,w6,w10
  424. eor v22.16b,v19.16b,v16.16b
  425. add w7,w7,w11
  426. ushr v3.4s,v20.4s,#24
  427. add w8,w8,w12
  428. ushr v7.4s,v21.4s,#24
  429. eor w17,w17,w5
  430. ushr v19.4s,v22.4s,#24
  431. eor w19,w19,w6
  432. sli v3.4s,v20.4s,#8
  433. eor w20,w20,w7
  434. sli v7.4s,v21.4s,#8
  435. eor w21,w21,w8
  436. sli v19.4s,v22.4s,#8
  437. ror w17,w17,#24
  438. add v2.4s,v2.4s,v3.4s
  439. ror w19,w19,#24
  440. add v6.4s,v6.4s,v7.4s
  441. ror w20,w20,#24
  442. add v18.4s,v18.4s,v19.4s
  443. ror w21,w21,#24
  444. eor v20.16b,v1.16b,v2.16b
  445. add w13,w13,w17
  446. eor v21.16b,v5.16b,v6.16b
  447. add w14,w14,w19
  448. eor v22.16b,v17.16b,v18.16b
  449. add w15,w15,w20
  450. ushr v1.4s,v20.4s,#25
  451. add w16,w16,w21
  452. ushr v5.4s,v21.4s,#25
  453. eor w9,w9,w13
  454. ushr v17.4s,v22.4s,#25
  455. eor w10,w10,w14
  456. sli v1.4s,v20.4s,#7
  457. eor w11,w11,w15
  458. sli v5.4s,v21.4s,#7
  459. eor w12,w12,w16
  460. sli v17.4s,v22.4s,#7
  461. ror w9,w9,#25
  // Rotate the lanes of rows 2, 3 and 1 (by 8, 12 and 4 bytes) so that the
  // second half-round works on the shifted word groups.
  462. ext v2.16b,v2.16b,v2.16b,#8
  463. ror w10,w10,#25
  464. ext v6.16b,v6.16b,v6.16b,#8
  465. ror w11,w11,#25
  466. ext v18.16b,v18.16b,v18.16b,#8
  467. ror w12,w12,#25
  468. ext v3.16b,v3.16b,v3.16b,#12
  469. ext v7.16b,v7.16b,v7.16b,#12
  470. ext v19.16b,v19.16b,v19.16b,#12
  471. ext v1.16b,v1.16b,v1.16b,#4
  472. ext v5.16b,v5.16b,v5.16b,#4
  473. ext v17.16b,v17.16b,v17.16b,#4
  // Second half of the double round.
  474. add v0.4s,v0.4s,v1.4s
  475. add w5,w5,w10
  476. add v4.4s,v4.4s,v5.4s
  477. add w6,w6,w11
  478. add v16.4s,v16.4s,v17.4s
  479. add w7,w7,w12
  480. eor v3.16b,v3.16b,v0.16b
  481. add w8,w8,w9
  482. eor v7.16b,v7.16b,v4.16b
  483. eor w21,w21,w5
  484. eor v19.16b,v19.16b,v16.16b
  485. eor w17,w17,w6
  486. rev32 v3.8h,v3.8h
  487. eor w19,w19,w7
  488. rev32 v7.8h,v7.8h
  489. eor w20,w20,w8
  490. rev32 v19.8h,v19.8h
  491. ror w21,w21,#16
  492. add v2.4s,v2.4s,v3.4s
  493. ror w17,w17,#16
  494. add v6.4s,v6.4s,v7.4s
  495. ror w19,w19,#16
  496. add v18.4s,v18.4s,v19.4s
  497. ror w20,w20,#16
  498. eor v20.16b,v1.16b,v2.16b
  499. add w15,w15,w21
  500. eor v21.16b,v5.16b,v6.16b
  501. add w16,w16,w17
  502. eor v22.16b,v17.16b,v18.16b
  503. add w13,w13,w19
  504. ushr v1.4s,v20.4s,#20
  505. add w14,w14,w20
  506. ushr v5.4s,v21.4s,#20
  507. eor w10,w10,w15
  508. ushr v17.4s,v22.4s,#20
  509. eor w11,w11,w16
  510. sli v1.4s,v20.4s,#12
  511. eor w12,w12,w13
  512. sli v5.4s,v21.4s,#12
  513. eor w9,w9,w14
  514. sli v17.4s,v22.4s,#12
  515. ror w10,w10,#20
  516. add v0.4s,v0.4s,v1.4s
  517. ror w11,w11,#20
  518. add v4.4s,v4.4s,v5.4s
  519. ror w12,w12,#20
  520. add v16.4s,v16.4s,v17.4s
  521. ror w9,w9,#20
  522. eor v20.16b,v3.16b,v0.16b
  523. add w5,w5,w10
  524. eor v21.16b,v7.16b,v4.16b
  525. add w6,w6,w11
  526. eor v22.16b,v19.16b,v16.16b
  527. add w7,w7,w12
  528. ushr v3.4s,v20.4s,#24
  529. add w8,w8,w9
  530. ushr v7.4s,v21.4s,#24
  531. eor w21,w21,w5
  532. ushr v19.4s,v22.4s,#24
  533. eor w17,w17,w6
  534. sli v3.4s,v20.4s,#8
  535. eor w19,w19,w7
  536. sli v7.4s,v21.4s,#8
  537. eor w20,w20,w8
  538. sli v19.4s,v22.4s,#8
  539. ror w21,w21,#24
  540. add v2.4s,v2.4s,v3.4s
  541. ror w17,w17,#24
  542. add v6.4s,v6.4s,v7.4s
  543. ror w19,w19,#24
  544. add v18.4s,v18.4s,v19.4s
  545. ror w20,w20,#24
  546. eor v20.16b,v1.16b,v2.16b
  547. add w15,w15,w21
  548. eor v21.16b,v5.16b,v6.16b
  549. add w16,w16,w17
  550. eor v22.16b,v17.16b,v18.16b
  551. add w13,w13,w19
  552. ushr v1.4s,v20.4s,#25
  553. add w14,w14,w20
  554. ushr v5.4s,v21.4s,#25
  555. eor w10,w10,w15
  556. ushr v17.4s,v22.4s,#25
  557. eor w11,w11,w16
  558. sli v1.4s,v20.4s,#7
  559. eor w12,w12,w13
  560. sli v5.4s,v21.4s,#7
  561. eor w9,w9,w14
  562. sli v17.4s,v22.4s,#7
  563. ror w10,w10,#25
  // Rotate the rows back (8, 4 and 12 bytes) to the layout used at loop entry.
  564. ext v2.16b,v2.16b,v2.16b,#8
  565. ror w11,w11,#25
  566. ext v6.16b,v6.16b,v6.16b,#8
  567. ror w12,w12,#25
  568. ext v18.16b,v18.16b,v18.16b,#8
  569. ror w9,w9,#25
  570. ext v3.16b,v3.16b,v3.16b,#4
  571. ext v7.16b,v7.16b,v7.16b,#4
  572. ext v19.16b,v19.16b,v19.16b,#4
  573. ext v1.16b,v1.16b,v1.16b,#12
  574. ext v5.16b,v5.16b,v5.16b,#12
  575. ext v17.16b,v17.16b,v17.16b,#12
  576. cbnz x4,Loop_neon
  // Add the input state back into all four blocks. Each vector block gets its
  // own counter row (v27, v28 or v29); sigma and key rows are shared.
  577. add w5,w5,w22 // accumulate key block
  578. add v0.4s,v0.4s,v24.4s
  579. add x6,x6,x22,lsr#32
  580. add v4.4s,v4.4s,v24.4s
  581. add w7,w7,w23
  582. add v16.4s,v16.4s,v24.4s
  583. add x8,x8,x23,lsr#32
  584. add v2.4s,v2.4s,v26.4s
  585. add w9,w9,w24
  586. add v6.4s,v6.4s,v26.4s
  587. add x10,x10,x24,lsr#32
  588. add v18.4s,v18.4s,v26.4s
  589. add w11,w11,w25
  590. add v3.4s,v3.4s,v27.4s
  591. add x12,x12,x25,lsr#32
  592. add w13,w13,w26
  593. add v7.4s,v7.4s,v28.4s
  594. add x14,x14,x26,lsr#32
  595. add w15,w15,w27
  596. add v19.4s,v19.4s,v29.4s
  597. add x16,x16,x27,lsr#32
  598. add w17,w17,w28
  599. add v1.4s,v1.4s,v25.4s
  600. add x19,x19,x28,lsr#32
  601. add w20,w20,w30
  602. add v5.4s,v5.4s,v25.4s
  603. add x21,x21,x30,lsr#32
  604. add v17.4s,v17.4s,v25.4s
  // Flags still come from "subs x2,x2,#256": fewer than 256 bytes were left.
  605. b.lo Ltail_neon
  // Full 256 bytes. First 64 bytes: pack the integer block and load its input.
  606. add x5,x5,x6,lsl#32 // pack
  607. add x7,x7,x8,lsl#32
  608. ldp x6,x8,[x1,#0] // load input
  609. add x9,x9,x10,lsl#32
  610. add x11,x11,x12,lsl#32
  611. ldp x10,x12,[x1,#16]
  612. add x13,x13,x14,lsl#32
  613. add x15,x15,x16,lsl#32
  614. ldp x14,x16,[x1,#32]
  615. add x17,x17,x19,lsl#32
  616. add x20,x20,x21,lsl#32
  617. ldp x19,x21,[x1,#48]
  618. add x1,x1,#64
  619. #ifdef __ARMEB__
  620. rev x5,x5
  621. rev x7,x7
  622. rev x9,x9
  623. rev x11,x11
  624. rev x13,x13
  625. rev x15,x15
  626. rev x17,x17
  627. rev x20,x20
  628. #endif
  // Input for the first vector block goes into scratch v20-v23.
  629. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  630. eor x5,x5,x6
  631. eor x7,x7,x8
  632. eor x9,x9,x10
  633. eor x11,x11,x12
  634. eor x13,x13,x14
  635. eor v0.16b,v0.16b,v20.16b
  636. eor x15,x15,x16
  637. eor v1.16b,v1.16b,v21.16b
  638. eor x17,x17,x19
  639. eor v2.16b,v2.16b,v22.16b
  640. eor x20,x20,x21
  641. eor v3.16b,v3.16b,v23.16b
  // Input for the second vector block.
  642. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  643. stp x5,x7,[x0,#0] // store output
  // Advance every counter copy by the four blocks consumed in this pass.
  644. add x28,x28,#4 // increment counter
  645. stp x9,x11,[x0,#16]
  646. add v27.4s,v27.4s,v31.4s // += 4
  647. stp x13,x15,[x0,#32]
  648. add v28.4s,v28.4s,v31.4s
  649. stp x17,x20,[x0,#48]
  650. add v29.4s,v29.4s,v31.4s
  651. add x0,x0,#64
  // Store the first vector block, then reuse v0-v3 for the third block's input.
  652. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  653. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  654. eor v4.16b,v4.16b,v20.16b
  655. eor v5.16b,v5.16b,v21.16b
  656. eor v6.16b,v6.16b,v22.16b
  657. eor v7.16b,v7.16b,v23.16b
  658. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  659. eor v16.16b,v16.16b,v0.16b
  660. eor v17.16b,v17.16b,v1.16b
  661. eor v18.16b,v18.16b,v2.16b
  662. eor v19.16b,v19.16b,v3.16b
  663. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  // Same flags again: more than 256 bytes were left before this pass.
  664. b.hi Loop_outer_neon
  // Epilogue for inputs that are a whole multiple of 256 bytes.
  665. ldp x19,x20,[x29,#16]
  666. add sp,sp,#64
  667. ldp x21,x22,[x29,#32]
  668. ldp x23,x24,[x29,#48]
  669. ldp x25,x26,[x29,#64]
  670. ldp x27,x28,[x29,#80]
  671. ldp x29,x30,[sp],#96
  672. AARCH64_VALIDATE_LINK_REGISTER
  673. ret
  // ---- fewer than 256 bytes left ----
  // Undo the "subs x2,x2,#256" so that x2 is the number of bytes left (1..255).
  674. Ltail_neon:
  675. add x2,x2,#256
  // Under 64 bytes: the integer block alone covers it; reuse the byte-wise
  // tail inside ChaCha20_ctr32, which also performs the epilogue.
  676. cmp x2,#64
  677. b.lo Less_than_64
  // At least one whole block: emit the integer block (64 bytes).
  678. add x5,x5,x6,lsl#32 // pack
  679. add x7,x7,x8,lsl#32
  680. ldp x6,x8,[x1,#0] // load input
  681. add x9,x9,x10,lsl#32
  682. add x11,x11,x12,lsl#32
  683. ldp x10,x12,[x1,#16]
  684. add x13,x13,x14,lsl#32
  685. add x15,x15,x16,lsl#32
  686. ldp x14,x16,[x1,#32]
  687. add x17,x17,x19,lsl#32
  688. add x20,x20,x21,lsl#32
  689. ldp x19,x21,[x1,#48]
  690. add x1,x1,#64
  691. #ifdef __ARMEB__
  692. rev x5,x5
  693. rev x7,x7
  694. rev x9,x9
  695. rev x11,x11
  696. rev x13,x13
  697. rev x15,x15
  698. rev x17,x17
  699. rev x20,x20
  700. #endif
  701. eor x5,x5,x6
  702. eor x7,x7,x8
  703. eor x9,x9,x10
  704. eor x11,x11,x12
  705. eor x13,x13,x14
  706. eor x15,x15,x16
  707. eor x17,x17,x19
  708. eor x20,x20,x21
  709. stp x5,x7,[x0,#0] // store output
  710. add x28,x28,#4 // increment counter
  711. stp x9,x11,[x0,#16]
  712. stp x13,x15,[x0,#32]
  713. stp x17,x20,[x0,#48]
  714. add x0,x0,#64
  // Flags are still those of "cmp x2,#64" above: exactly 64 bytes => done.
  715. b.eq Ldone_neon
  // Second block (v0-v3) if at least 64 more bytes remain.
  716. sub x2,x2,#64
  717. cmp x2,#64
  718. b.lo Less_than_128
  719. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  720. eor v0.16b,v0.16b,v20.16b
  721. eor v1.16b,v1.16b,v21.16b
  722. eor v2.16b,v2.16b,v22.16b
  723. eor v3.16b,v3.16b,v23.16b
  724. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  725. b.eq Ldone_neon
  // Third block (v4-v7) if at least 64 more bytes remain.
  726. sub x2,x2,#64
  727. cmp x2,#64
  728. b.lo Less_than_192
  729. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  730. eor v4.16b,v4.16b,v20.16b
  731. eor v5.16b,v5.16b,v21.16b
  732. eor v6.16b,v6.16b,v22.16b
  733. eor v7.16b,v7.16b,v23.16b
  734. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  735. b.eq Ldone_neon
  // Between 193 and 255 bytes in total: the remainder is covered by the fourth
  // block (v16-v19), spilled to the scratch area for the byte loop.
  736. sub x2,x2,#64
  737. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
  738. b Last_neon
  // Remainder falls inside the second block: spill v0-v3.
  739. Less_than_128:
  740. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  741. b Last_neon
  // Remainder falls inside the third block: spill v4-v7.
  742. Less_than_192:
  743. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  744. b Last_neon
  745. .align 4
  // Byte-wise XOR of the last x2 (< 64) bytes against the spilled key stream;
  // same negative-index scheme as Loop_tail in ChaCha20_ctr32 (x0 is biased by
  // -1 because the store uses x2 after it has been incremented).
  746. Last_neon:
  747. sub x0,x0,#1
  748. add x1,x1,x2
  749. add x0,x0,x2
  750. add x4,sp,x2
  751. neg x2,x2
  752. Loop_tail_neon:
  753. ldrb w10,[x1,x2]
  754. ldrb w11,[x4,x2]
  755. add x2,x2,#1
  756. eor w10,w10,w11
  757. strb w10,[x0,x2]
  758. cbnz x2,Loop_tail_neon
  // Overwrite the spilled key stream with zeros before releasing the stack.
  759. stp xzr,xzr,[sp,#0]
  760. stp xzr,xzr,[sp,#16]
  761. stp xzr,xzr,[sp,#32]
  762. stp xzr,xzr,[sp,#48]
  // Common epilogue for all tail paths of this routine.
  763. Ldone_neon:
  764. ldp x19,x20,[x29,#16]
  765. add sp,sp,#64
  766. ldp x21,x22,[x29,#32]
  767. ldp x23,x24,[x29,#48]
  768. ldp x25,x26,[x29,#64]
  769. ldp x27,x28,[x29,#80]
  770. ldp x29,x30,[sp],#96
  771. AARCH64_VALIDATE_LINK_REGISTER
  772. ret
  773. .def ChaCha20_512_neon
  774. .type 32
  775. .endef
  776. .align 5
  777. ChaCha20_512_neon:
  778. AARCH64_SIGN_LINK_REGISTER
  779. stp x29,x30,[sp,#-96]!
  780. add x29,sp,#0
  781. adrp x5,Lsigma
  782. add x5,x5,:lo12:Lsigma
  783. stp x19,x20,[sp,#16]
  784. stp x21,x22,[sp,#32]
  785. stp x23,x24,[sp,#48]
  786. stp x25,x26,[sp,#64]
  787. stp x27,x28,[sp,#80]
  788. L512_or_more_neon:
  789. sub sp,sp,#128+64
  790. ldp x22,x23,[x5] // load sigma
  791. ld1 {v24.4s},[x5],#16
  792. ldp x24,x25,[x3] // load key
  793. ldp x26,x27,[x3,#16]
  794. ld1 {v25.4s,v26.4s},[x3]
  795. ldp x28,x30,[x4] // load counter
  796. ld1 {v27.4s},[x4]
  797. ld1 {v31.4s},[x5]
  798. #ifdef __ARMEB__
  799. rev64 v24.4s,v24.4s
  800. ror x24,x24,#32
  801. ror x25,x25,#32
  802. ror x26,x26,#32
  803. ror x27,x27,#32
  804. ror x28,x28,#32
  805. ror x30,x30,#32
  806. #endif
  807. add v27.4s,v27.4s,v31.4s // += 1
  808. stp q24,q25,[sp,#0] // off-load key block, invariant part
  809. add v27.4s,v27.4s,v31.4s // not typo
  810. str q26,[sp,#32]
  811. add v28.4s,v27.4s,v31.4s
  812. add v29.4s,v28.4s,v31.4s
  813. add v30.4s,v29.4s,v31.4s
  814. shl v31.4s,v31.4s,#2 // 1 -> 4
  815. stp d8,d9,[sp,#128+0] // meet ABI requirements
  816. stp d10,d11,[sp,#128+16]
  817. stp d12,d13,[sp,#128+32]
  818. stp d14,d15,[sp,#128+48]
  819. sub x2,x2,#512 // not typo
  820. Loop_outer_512_neon:
  821. mov v0.16b,v24.16b
  822. mov v4.16b,v24.16b
  823. mov v8.16b,v24.16b
  824. mov v12.16b,v24.16b
  825. mov v16.16b,v24.16b
  826. mov v20.16b,v24.16b
  827. mov v1.16b,v25.16b
  828. mov w5,w22 // unpack key block
  829. mov v5.16b,v25.16b
  830. lsr x6,x22,#32
  831. mov v9.16b,v25.16b
  832. mov w7,w23
  833. mov v13.16b,v25.16b
  834. lsr x8,x23,#32
  835. mov v17.16b,v25.16b
  836. mov w9,w24
  837. mov v21.16b,v25.16b
  838. lsr x10,x24,#32
  839. mov v3.16b,v27.16b
  840. mov w11,w25
  841. mov v7.16b,v28.16b
  842. lsr x12,x25,#32
  843. mov v11.16b,v29.16b
  844. mov w13,w26
  845. mov v15.16b,v30.16b
  846. lsr x14,x26,#32
  847. mov v2.16b,v26.16b
  848. mov w15,w27
  849. mov v6.16b,v26.16b
  850. lsr x16,x27,#32
  851. add v19.4s,v3.4s,v31.4s // +4
  852. mov w17,w28
  853. add v23.4s,v7.4s,v31.4s // +4
  854. lsr x19,x28,#32
  855. mov v10.16b,v26.16b
  856. mov w20,w30
  857. mov v14.16b,v26.16b
  858. lsr x21,x30,#32
  859. mov v18.16b,v26.16b
  860. stp q27,q28,[sp,#48] // off-load key block, variable part
  861. mov v22.16b,v26.16b
  862. str q29,[sp,#80]
  863. mov x4,#5
  864. subs x2,x2,#512
  865. Loop_upper_neon:
  866. sub x4,x4,#1
  867. add v0.4s,v0.4s,v1.4s
  868. add w5,w5,w9
  869. add v4.4s,v4.4s,v5.4s
  870. add w6,w6,w10
  871. add v8.4s,v8.4s,v9.4s
  872. add w7,w7,w11
  873. add v12.4s,v12.4s,v13.4s
  874. add w8,w8,w12
  875. add v16.4s,v16.4s,v17.4s
  876. eor w17,w17,w5
  877. add v20.4s,v20.4s,v21.4s
  878. eor w19,w19,w6
  879. eor v3.16b,v3.16b,v0.16b
  880. eor w20,w20,w7
  881. eor v7.16b,v7.16b,v4.16b
  882. eor w21,w21,w8
  883. eor v11.16b,v11.16b,v8.16b
  884. ror w17,w17,#16
  885. eor v15.16b,v15.16b,v12.16b
  886. ror w19,w19,#16
  887. eor v19.16b,v19.16b,v16.16b
  888. ror w20,w20,#16
  889. eor v23.16b,v23.16b,v20.16b
  890. ror w21,w21,#16
  891. rev32 v3.8h,v3.8h
  892. add w13,w13,w17
  893. rev32 v7.8h,v7.8h
  894. add w14,w14,w19
  895. rev32 v11.8h,v11.8h
  896. add w15,w15,w20
  897. rev32 v15.8h,v15.8h
  898. add w16,w16,w21
  899. rev32 v19.8h,v19.8h
  900. eor w9,w9,w13
  901. rev32 v23.8h,v23.8h
  902. eor w10,w10,w14
  903. add v2.4s,v2.4s,v3.4s
  904. eor w11,w11,w15
  905. add v6.4s,v6.4s,v7.4s
  906. eor w12,w12,w16
  907. add v10.4s,v10.4s,v11.4s
  908. ror w9,w9,#20
  909. add v14.4s,v14.4s,v15.4s
  910. ror w10,w10,#20
  911. add v18.4s,v18.4s,v19.4s
  912. ror w11,w11,#20
  913. add v22.4s,v22.4s,v23.4s
  914. ror w12,w12,#20
  915. eor v24.16b,v1.16b,v2.16b
  916. add w5,w5,w9
  917. eor v25.16b,v5.16b,v6.16b
  918. add w6,w6,w10
  919. eor v26.16b,v9.16b,v10.16b
  920. add w7,w7,w11
  921. eor v27.16b,v13.16b,v14.16b
  922. add w8,w8,w12
  923. eor v28.16b,v17.16b,v18.16b
  924. eor w17,w17,w5
  925. eor v29.16b,v21.16b,v22.16b
  926. eor w19,w19,w6
  927. ushr v1.4s,v24.4s,#20
  928. eor w20,w20,w7
  929. ushr v5.4s,v25.4s,#20
  930. eor w21,w21,w8
  931. ushr v9.4s,v26.4s,#20
  932. ror w17,w17,#24
  933. ushr v13.4s,v27.4s,#20
  934. ror w19,w19,#24
  935. ushr v17.4s,v28.4s,#20
  936. ror w20,w20,#24
  937. ushr v21.4s,v29.4s,#20
  938. ror w21,w21,#24
  939. sli v1.4s,v24.4s,#12
  940. add w13,w13,w17
  941. sli v5.4s,v25.4s,#12
  942. add w14,w14,w19
  943. sli v9.4s,v26.4s,#12
  944. add w15,w15,w20
  945. sli v13.4s,v27.4s,#12
  946. add w16,w16,w21
  947. sli v17.4s,v28.4s,#12
  948. eor w9,w9,w13
  949. sli v21.4s,v29.4s,#12
  950. eor w10,w10,w14
  951. add v0.4s,v0.4s,v1.4s
  952. eor w11,w11,w15
  953. add v4.4s,v4.4s,v5.4s
  954. eor w12,w12,w16
  955. add v8.4s,v8.4s,v9.4s
  956. ror w9,w9,#25
  957. add v12.4s,v12.4s,v13.4s
  958. ror w10,w10,#25
  959. add v16.4s,v16.4s,v17.4s
  960. ror w11,w11,#25
  961. add v20.4s,v20.4s,v21.4s
  962. ror w12,w12,#25
  963. eor v24.16b,v3.16b,v0.16b
  964. add w5,w5,w10
  965. eor v25.16b,v7.16b,v4.16b
  966. add w6,w6,w11
  967. eor v26.16b,v11.16b,v8.16b
  968. add w7,w7,w12
  969. eor v27.16b,v15.16b,v12.16b
  970. add w8,w8,w9
  971. eor v28.16b,v19.16b,v16.16b
  972. eor w21,w21,w5
  973. eor v29.16b,v23.16b,v20.16b
  974. eor w17,w17,w6
  975. ushr v3.4s,v24.4s,#24
  976. eor w19,w19,w7
  977. ushr v7.4s,v25.4s,#24
  978. eor w20,w20,w8
  979. ushr v11.4s,v26.4s,#24
  980. ror w21,w21,#16
  981. ushr v15.4s,v27.4s,#24
  982. ror w17,w17,#16
  983. ushr v19.4s,v28.4s,#24
  984. ror w19,w19,#16
  985. ushr v23.4s,v29.4s,#24
  986. ror w20,w20,#16
  987. sli v3.4s,v24.4s,#8
  988. add w15,w15,w21
  989. sli v7.4s,v25.4s,#8
  990. add w16,w16,w17
  991. sli v11.4s,v26.4s,#8
  992. add w13,w13,w19
  993. sli v15.4s,v27.4s,#8
  994. add w14,w14,w20
  995. sli v19.4s,v28.4s,#8
  996. eor w10,w10,w15
  997. sli v23.4s,v29.4s,#8
  998. eor w11,w11,w16
  999. add v2.4s,v2.4s,v3.4s
  1000. eor w12,w12,w13
  1001. add v6.4s,v6.4s,v7.4s
  1002. eor w9,w9,w14
  1003. add v10.4s,v10.4s,v11.4s
  1004. ror w10,w10,#20
  1005. add v14.4s,v14.4s,v15.4s
  1006. ror w11,w11,#20
  1007. add v18.4s,v18.4s,v19.4s
  1008. ror w12,w12,#20
  1009. add v22.4s,v22.4s,v23.4s
  1010. ror w9,w9,#20
  1011. eor v24.16b,v1.16b,v2.16b
  1012. add w5,w5,w10
  1013. eor v25.16b,v5.16b,v6.16b
  1014. add w6,w6,w11
  1015. eor v26.16b,v9.16b,v10.16b
  1016. add w7,w7,w12
  1017. eor v27.16b,v13.16b,v14.16b
  1018. add w8,w8,w9
  1019. eor v28.16b,v17.16b,v18.16b
  1020. eor w21,w21,w5
  1021. eor v29.16b,v21.16b,v22.16b
  1022. eor w17,w17,w6
  1023. ushr v1.4s,v24.4s,#25
  1024. eor w19,w19,w7
  1025. ushr v5.4s,v25.4s,#25
  1026. eor w20,w20,w8
  1027. ushr v9.4s,v26.4s,#25
  1028. ror w21,w21,#24
  1029. ushr v13.4s,v27.4s,#25
  1030. ror w17,w17,#24
  1031. ushr v17.4s,v28.4s,#25
  1032. ror w19,w19,#24
  1033. ushr v21.4s,v29.4s,#25
  1034. ror w20,w20,#24
  1035. sli v1.4s,v24.4s,#7
  1036. add w15,w15,w21
  1037. sli v5.4s,v25.4s,#7
  1038. add w16,w16,w17
  1039. sli v9.4s,v26.4s,#7
  1040. add w13,w13,w19
  1041. sli v13.4s,v27.4s,#7
  1042. add w14,w14,w20
  1043. sli v17.4s,v28.4s,#7
  1044. eor w10,w10,w15
  1045. sli v21.4s,v29.4s,#7
  1046. eor w11,w11,w16
  1047. ext v2.16b,v2.16b,v2.16b,#8
  1048. eor w12,w12,w13
  1049. ext v6.16b,v6.16b,v6.16b,#8
  1050. eor w9,w9,w14
  1051. ext v10.16b,v10.16b,v10.16b,#8
  1052. ror w10,w10,#25
  1053. ext v14.16b,v14.16b,v14.16b,#8
  1054. ror w11,w11,#25
  1055. ext v18.16b,v18.16b,v18.16b,#8
  1056. ror w12,w12,#25
  1057. ext v22.16b,v22.16b,v22.16b,#8
  1058. ror w9,w9,#25
  1059. ext v3.16b,v3.16b,v3.16b,#12
  1060. ext v7.16b,v7.16b,v7.16b,#12
  1061. ext v11.16b,v11.16b,v11.16b,#12
  1062. ext v15.16b,v15.16b,v15.16b,#12
  1063. ext v19.16b,v19.16b,v19.16b,#12
  1064. ext v23.16b,v23.16b,v23.16b,#12
  1065. ext v1.16b,v1.16b,v1.16b,#4
  1066. ext v5.16b,v5.16b,v5.16b,#4
  1067. ext v9.16b,v9.16b,v9.16b,#4
  1068. ext v13.16b,v13.16b,v13.16b,#4
  1069. ext v17.16b,v17.16b,v17.16b,#4
  1070. ext v21.16b,v21.16b,v21.16b,#4
  1071. add v0.4s,v0.4s,v1.4s
  1072. add w5,w5,w9
  1073. add v4.4s,v4.4s,v5.4s
  1074. add w6,w6,w10
  1075. add v8.4s,v8.4s,v9.4s
  1076. add w7,w7,w11
  1077. add v12.4s,v12.4s,v13.4s
  1078. add w8,w8,w12
  1079. add v16.4s,v16.4s,v17.4s
  1080. eor w17,w17,w5
  1081. add v20.4s,v20.4s,v21.4s
  1082. eor w19,w19,w6
  1083. eor v3.16b,v3.16b,v0.16b
  1084. eor w20,w20,w7
  1085. eor v7.16b,v7.16b,v4.16b
  1086. eor w21,w21,w8
  1087. eor v11.16b,v11.16b,v8.16b
  1088. ror w17,w17,#16
  1089. eor v15.16b,v15.16b,v12.16b
  1090. ror w19,w19,#16
  1091. eor v19.16b,v19.16b,v16.16b
  1092. ror w20,w20,#16
  1093. eor v23.16b,v23.16b,v20.16b
  1094. ror w21,w21,#16
  1095. rev32 v3.8h,v3.8h
  1096. add w13,w13,w17
  1097. rev32 v7.8h,v7.8h
  1098. add w14,w14,w19
  1099. rev32 v11.8h,v11.8h
  1100. add w15,w15,w20
  1101. rev32 v15.8h,v15.8h
  1102. add w16,w16,w21
  1103. rev32 v19.8h,v19.8h
  1104. eor w9,w9,w13
  1105. rev32 v23.8h,v23.8h
  1106. eor w10,w10,w14
  1107. add v2.4s,v2.4s,v3.4s
  1108. eor w11,w11,w15
  1109. add v6.4s,v6.4s,v7.4s
  1110. eor w12,w12,w16
  1111. add v10.4s,v10.4s,v11.4s
  1112. ror w9,w9,#20
  1113. add v14.4s,v14.4s,v15.4s
  1114. ror w10,w10,#20
  1115. add v18.4s,v18.4s,v19.4s
  1116. ror w11,w11,#20
  1117. add v22.4s,v22.4s,v23.4s
  1118. ror w12,w12,#20
  1119. eor v24.16b,v1.16b,v2.16b
  1120. add w5,w5,w9
  1121. eor v25.16b,v5.16b,v6.16b
  1122. add w6,w6,w10
  1123. eor v26.16b,v9.16b,v10.16b
  1124. add w7,w7,w11
  1125. eor v27.16b,v13.16b,v14.16b
  1126. add w8,w8,w12
  1127. eor v28.16b,v17.16b,v18.16b
  1128. eor w17,w17,w5
  1129. eor v29.16b,v21.16b,v22.16b
  1130. eor w19,w19,w6
  1131. ushr v1.4s,v24.4s,#20
  1132. eor w20,w20,w7
  1133. ushr v5.4s,v25.4s,#20
  1134. eor w21,w21,w8
  1135. ushr v9.4s,v26.4s,#20
  1136. ror w17,w17,#24
  1137. ushr v13.4s,v27.4s,#20
  1138. ror w19,w19,#24
  1139. ushr v17.4s,v28.4s,#20
  1140. ror w20,w20,#24
  1141. ushr v21.4s,v29.4s,#20
  1142. ror w21,w21,#24
  1143. sli v1.4s,v24.4s,#12
  1144. add w13,w13,w17
  1145. sli v5.4s,v25.4s,#12
  1146. add w14,w14,w19
  1147. sli v9.4s,v26.4s,#12
  1148. add w15,w15,w20
  1149. sli v13.4s,v27.4s,#12
  1150. add w16,w16,w21
  1151. sli v17.4s,v28.4s,#12
  1152. eor w9,w9,w13
  1153. sli v21.4s,v29.4s,#12
  1154. eor w10,w10,w14
  1155. add v0.4s,v0.4s,v1.4s
  1156. eor w11,w11,w15
  1157. add v4.4s,v4.4s,v5.4s
  1158. eor w12,w12,w16
  1159. add v8.4s,v8.4s,v9.4s
  1160. ror w9,w9,#25
  1161. add v12.4s,v12.4s,v13.4s
  1162. ror w10,w10,#25
  1163. add v16.4s,v16.4s,v17.4s
  1164. ror w11,w11,#25
  1165. add v20.4s,v20.4s,v21.4s
  1166. ror w12,w12,#25
  1167. eor v24.16b,v3.16b,v0.16b
  1168. add w5,w5,w10
  1169. eor v25.16b,v7.16b,v4.16b
  1170. add w6,w6,w11
  1171. eor v26.16b,v11.16b,v8.16b
  1172. add w7,w7,w12
  1173. eor v27.16b,v15.16b,v12.16b
  1174. add w8,w8,w9
  1175. eor v28.16b,v19.16b,v16.16b
  1176. eor w21,w21,w5
  1177. eor v29.16b,v23.16b,v20.16b
  1178. eor w17,w17,w6
  1179. ushr v3.4s,v24.4s,#24
  1180. eor w19,w19,w7
  1181. ushr v7.4s,v25.4s,#24
  1182. eor w20,w20,w8
  1183. ushr v11.4s,v26.4s,#24
  1184. ror w21,w21,#16
  1185. ushr v15.4s,v27.4s,#24
  1186. ror w17,w17,#16
  1187. ushr v19.4s,v28.4s,#24
  1188. ror w19,w19,#16
  1189. ushr v23.4s,v29.4s,#24
  1190. ror w20,w20,#16
  1191. sli v3.4s,v24.4s,#8
  1192. add w15,w15,w21
  1193. sli v7.4s,v25.4s,#8
  1194. add w16,w16,w17
  1195. sli v11.4s,v26.4s,#8
  1196. add w13,w13,w19
  1197. sli v15.4s,v27.4s,#8
  1198. add w14,w14,w20
  1199. sli v19.4s,v28.4s,#8
  1200. eor w10,w10,w15
  1201. sli v23.4s,v29.4s,#8
  1202. eor w11,w11,w16
  1203. add v2.4s,v2.4s,v3.4s
  1204. eor w12,w12,w13
  1205. add v6.4s,v6.4s,v7.4s
  1206. eor w9,w9,w14
  1207. add v10.4s,v10.4s,v11.4s
  1208. ror w10,w10,#20
  1209. add v14.4s,v14.4s,v15.4s
  1210. ror w11,w11,#20
  1211. add v18.4s,v18.4s,v19.4s
  1212. ror w12,w12,#20
  1213. add v22.4s,v22.4s,v23.4s
  1214. ror w9,w9,#20
  1215. eor v24.16b,v1.16b,v2.16b
  1216. add w5,w5,w10
  1217. eor v25.16b,v5.16b,v6.16b
  1218. add w6,w6,w11
  1219. eor v26.16b,v9.16b,v10.16b
  1220. add w7,w7,w12
  1221. eor v27.16b,v13.16b,v14.16b
  1222. add w8,w8,w9
  1223. eor v28.16b,v17.16b,v18.16b
  1224. eor w21,w21,w5
  1225. eor v29.16b,v21.16b,v22.16b
  1226. eor w17,w17,w6
  1227. ushr v1.4s,v24.4s,#25
  1228. eor w19,w19,w7
  1229. ushr v5.4s,v25.4s,#25
  1230. eor w20,w20,w8
  1231. ushr v9.4s,v26.4s,#25
  1232. ror w21,w21,#24
  1233. ushr v13.4s,v27.4s,#25
  1234. ror w17,w17,#24
  1235. ushr v17.4s,v28.4s,#25
  1236. ror w19,w19,#24
  1237. ushr v21.4s,v29.4s,#25
  1238. ror w20,w20,#24
  1239. sli v1.4s,v24.4s,#7
  1240. add w15,w15,w21
  1241. sli v5.4s,v25.4s,#7
  1242. add w16,w16,w17
  1243. sli v9.4s,v26.4s,#7
  1244. add w13,w13,w19
  1245. sli v13.4s,v27.4s,#7
  1246. add w14,w14,w20
  1247. sli v17.4s,v28.4s,#7
  1248. eor w10,w10,w15
  1249. sli v21.4s,v29.4s,#7
  1250. eor w11,w11,w16
  1251. ext v2.16b,v2.16b,v2.16b,#8
  1252. eor w12,w12,w13
  1253. ext v6.16b,v6.16b,v6.16b,#8
  1254. eor w9,w9,w14
  1255. ext v10.16b,v10.16b,v10.16b,#8
  1256. ror w10,w10,#25
  1257. ext v14.16b,v14.16b,v14.16b,#8
  1258. ror w11,w11,#25
  1259. ext v18.16b,v18.16b,v18.16b,#8
  1260. ror w12,w12,#25
  1261. ext v22.16b,v22.16b,v22.16b,#8
  1262. ror w9,w9,#25
  1263. ext v3.16b,v3.16b,v3.16b,#4
  1264. ext v7.16b,v7.16b,v7.16b,#4
  1265. ext v11.16b,v11.16b,v11.16b,#4
  1266. ext v15.16b,v15.16b,v15.16b,#4
  1267. ext v19.16b,v19.16b,v19.16b,#4
  1268. ext v23.16b,v23.16b,v23.16b,#4
  1269. ext v1.16b,v1.16b,v1.16b,#12
  1270. ext v5.16b,v5.16b,v5.16b,#12
  1271. ext v9.16b,v9.16b,v9.16b,#12
  1272. ext v13.16b,v13.16b,v13.16b,#12
  1273. ext v17.16b,v17.16b,v17.16b,#12
  1274. ext v21.16b,v21.16b,v21.16b,#12
  1275. cbnz x4,Loop_upper_neon
  // Straight-line code reached once x4 hits zero in Loop_upper_neon.
  // It finalizes the scalar block held in w5-w17,w19-w21: the packed input state
  // in x22-x28,x30 is added back in, the 16 words are packed into eight 64-bit
  // registers, XORed with 64 bytes read from [x1] and written to [x0].
  // The scalar working registers are then re-seeded from x22-x28,x30 (with the
  // counter in x28 already bumped) and x4 is re-armed for Loop_lower_neon.
  // NOTE(review): whether the scalar block has seen all of its rounds at this
  // point depends on the Loop_upper_neon trip count, which is set outside this excerpt.
  1276. add w5,w5,w22 // accumulate key block: word += low half of x22
  1277. add x6,x6,x22,lsr#32 // word += high half of x22; bits above 31 vanish in the lsl#32 pack below
  1278. add w7,w7,w23
  1279. add x8,x8,x23,lsr#32
  1280. add w9,w9,w24
  1281. add x10,x10,x24,lsr#32
  1282. add w11,w11,w25
  1283. add x12,x12,x25,lsr#32
  1284. add w13,w13,w26
  1285. add x14,x14,x26,lsr#32
  1286. add w15,w15,w27
  1287. add x16,x16,x27,lsr#32
  1288. add w17,w17,w28
  1289. add x19,x19,x28,lsr#32
  1290. add w20,w20,w30
  1291. add x21,x21,x30,lsr#32
  1292. add x5,x5,x6,lsl#32 // pack: x5 = word0 | (word1 << 32)
  1293. add x7,x7,x8,lsl#32
  1294. ldp x6,x8,[x1,#0] // load input (x6/x8 are free again once packed into x5/x7)
  1295. add x9,x9,x10,lsl#32
  1296. add x11,x11,x12,lsl#32
  1297. ldp x10,x12,[x1,#16]
  1298. add x13,x13,x14,lsl#32
  1299. add x15,x15,x16,lsl#32
  1300. ldp x14,x16,[x1,#32]
  1301. add x17,x17,x19,lsl#32
  1302. add x20,x20,x21,lsl#32
  1303. ldp x19,x21,[x1,#48]
  1304. add x1,x1,#64 // advance input pointer past the 64 bytes just loaded
  1305. #ifdef __ARMEB__
  1306. rev x5,x5 // big-endian builds only: byte-reverse each packed 64-bit word before the XOR
  1307. rev x7,x7
  1308. rev x9,x9
  1309. rev x11,x11
  1310. rev x13,x13
  1311. rev x15,x15
  1312. rev x17,x17
  1313. rev x20,x20
  1314. #endif
  1315. eor x5,x5,x6 // output = block words ^ input
  1316. eor x7,x7,x8
  1317. eor x9,x9,x10
  1318. eor x11,x11,x12
  1319. eor x13,x13,x14
  1320. eor x15,x15,x16
  1321. eor x17,x17,x19
  1322. eor x20,x20,x21
  1323. stp x5,x7,[x0,#0] // store output (stores are interleaved with the re-seed below)
  1324. add x28,x28,#1 // increment counter (before x28 is unpacked into w17/x19 below)
  1325. mov w5,w22 // unpack key block: low half of each packed register -> wN
  1326. lsr x6,x22,#32 // high half -> next working register
  1327. stp x9,x11,[x0,#16]
  1328. mov w7,w23
  1329. lsr x8,x23,#32
  1330. stp x13,x15,[x0,#32]
  1331. mov w9,w24
  1332. lsr x10,x24,#32
  1333. stp x17,x20,[x0,#48]
  1334. add x0,x0,#64 // advance output pointer past the 64 bytes just stored
  1335. mov w11,w25
  1336. lsr x12,x25,#32
  1337. mov w13,w26
  1338. lsr x14,x26,#32
  1339. mov w15,w27
  1340. lsr x16,x27,#32
  1341. mov w17,w28
  1342. lsr x19,x28,#32
  1343. mov w20,w30
  1344. lsr x21,x30,#32
  1345. mov x4,#5 // iteration count for Loop_lower_neon
  // Loop_lower_neon: round loop of the 512-byte NEON path, counted down in x4
  // (x4 is set to 5 immediately before the label). Each iteration interleaves,
  // one NEON instruction then one scalar instruction at a time:
  //   * ONE ChaCha double round on six NEON block states. Rows are
  //     a = v0/v4/v8/v12/v16/v20, b = v1/v5/..., c = v2/v6/..., d = v3/v7/...;
  //     v24-v29 are temporaries for the shift-based rotates.
  //   * TWO ChaCha double rounds on the scalar block, rows
  //     a = w5-w8, b = w9-w12, c = w13-w16, d = w17,w19,w20,w21.
  // Rotate-left amounts 16/12/8/7 are realised as:
  //   NEON:   rev32 on .8h lanes (16), ushr #20 + sli #12, ushr #24 + sli #8,
  //           ushr #25 + sli #7;
  //   scalar: ror #16, #20, #24, #25.
  // The ext instructions rotate NEON rows b/c/d by 1/2/3 words so the second
  // half of the double round works on diagonals, and rotate them back at the end.
  1346. Loop_lower_neon:
  1347. sub x4,x4,#1 // one fewer iteration remaining (tested by cbnz at the bottom)
  1348. add v0.4s,v0.4s,v1.4s // NEON column half: a += b
  1349. add w5,w5,w9 // scalar column round: a += b
  1350. add v4.4s,v4.4s,v5.4s
  1351. add w6,w6,w10
  1352. add v8.4s,v8.4s,v9.4s
  1353. add w7,w7,w11
  1354. add v12.4s,v12.4s,v13.4s
  1355. add w8,w8,w12
  1356. add v16.4s,v16.4s,v17.4s
  1357. eor w17,w17,w5 // scalar: d ^= a
  1358. add v20.4s,v20.4s,v21.4s
  1359. eor w19,w19,w6
  1360. eor v3.16b,v3.16b,v0.16b // NEON: d ^= a
  1361. eor w20,w20,w7
  1362. eor v7.16b,v7.16b,v4.16b
  1363. eor w21,w21,w8
  1364. eor v11.16b,v11.16b,v8.16b
  1365. ror w17,w17,#16 // scalar: d <<<= 16
  1366. eor v15.16b,v15.16b,v12.16b
  1367. ror w19,w19,#16
  1368. eor v19.16b,v19.16b,v16.16b
  1369. ror w20,w20,#16
  1370. eor v23.16b,v23.16b,v20.16b
  1371. ror w21,w21,#16
  1372. rev32 v3.8h,v3.8h // NEON: d <<<= 16 (swap the halfwords of each word)
  1373. add w13,w13,w17 // scalar: c += d
  1374. rev32 v7.8h,v7.8h
  1375. add w14,w14,w19
  1376. rev32 v11.8h,v11.8h
  1377. add w15,w15,w20
  1378. rev32 v15.8h,v15.8h
  1379. add w16,w16,w21
  1380. rev32 v19.8h,v19.8h
  1381. eor w9,w9,w13 // scalar: b ^= c
  1382. rev32 v23.8h,v23.8h
  1383. eor w10,w10,w14
  1384. add v2.4s,v2.4s,v3.4s // NEON: c += d
  1385. eor w11,w11,w15
  1386. add v6.4s,v6.4s,v7.4s
  1387. eor w12,w12,w16
  1388. add v10.4s,v10.4s,v11.4s
  1389. ror w9,w9,#20 // scalar: b <<<= 12
  1390. add v14.4s,v14.4s,v15.4s
  1391. ror w10,w10,#20
  1392. add v18.4s,v18.4s,v19.4s
  1393. ror w11,w11,#20
  1394. add v22.4s,v22.4s,v23.4s
  1395. ror w12,w12,#20
  1396. eor v24.16b,v1.16b,v2.16b // NEON: t = b ^ c (kept in a temporary for the two-step rotate)
  1397. add w5,w5,w9
  1398. eor v25.16b,v5.16b,v6.16b
  1399. add w6,w6,w10
  1400. eor v26.16b,v9.16b,v10.16b
  1401. add w7,w7,w11
  1402. eor v27.16b,v13.16b,v14.16b
  1403. add w8,w8,w12
  1404. eor v28.16b,v17.16b,v18.16b
  1405. eor w17,w17,w5
  1406. eor v29.16b,v21.16b,v22.16b
  1407. eor w19,w19,w6
  1408. ushr v1.4s,v24.4s,#20 // NEON: b = t >> 20 ...
  1409. eor w20,w20,w7
  1410. ushr v5.4s,v25.4s,#20
  1411. eor w21,w21,w8
  1412. ushr v9.4s,v26.4s,#20
  1413. ror w17,w17,#24 // scalar: d <<<= 8
  1414. ushr v13.4s,v27.4s,#20
  1415. ror w19,w19,#24
  1416. ushr v17.4s,v28.4s,#20
  1417. ror w20,w20,#24
  1418. ushr v21.4s,v29.4s,#20
  1419. ror w21,w21,#24
  1420. sli v1.4s,v24.4s,#12 // ... | t << 12, i.e. b = t <<< 12
  1421. add w13,w13,w17
  1422. sli v5.4s,v25.4s,#12
  1423. add w14,w14,w19
  1424. sli v9.4s,v26.4s,#12
  1425. add w15,w15,w20
  1426. sli v13.4s,v27.4s,#12
  1427. add w16,w16,w21
  1428. sli v17.4s,v28.4s,#12
  1429. eor w9,w9,w13
  1430. sli v21.4s,v29.4s,#12
  1431. eor w10,w10,w14
  1432. add v0.4s,v0.4s,v1.4s // NEON: a += b
  1433. eor w11,w11,w15
  1434. add v4.4s,v4.4s,v5.4s
  1435. eor w12,w12,w16
  1436. add v8.4s,v8.4s,v9.4s
  1437. ror w9,w9,#25 // scalar: b <<<= 7 (column round complete)
  1438. add v12.4s,v12.4s,v13.4s
  1439. ror w10,w10,#25
  1440. add v16.4s,v16.4s,v17.4s
  1441. ror w11,w11,#25
  1442. add v20.4s,v20.4s,v21.4s
  1443. ror w12,w12,#25
  1444. eor v24.16b,v3.16b,v0.16b // NEON: t = d ^ a
  1445. add w5,w5,w10 // scalar diagonal round: row b index shifted by one (w5 pairs with w10)
  1446. eor v25.16b,v7.16b,v4.16b
  1447. add w6,w6,w11
  1448. eor v26.16b,v11.16b,v8.16b
  1449. add w7,w7,w12
  1450. eor v27.16b,v15.16b,v12.16b
  1451. add w8,w8,w9
  1452. eor v28.16b,v19.16b,v16.16b
  1453. eor w21,w21,w5
  1454. eor v29.16b,v23.16b,v20.16b
  1455. eor w17,w17,w6
  1456. ushr v3.4s,v24.4s,#24 // NEON: d = t <<< 8 (completed by sli #8 below)
  1457. eor w19,w19,w7
  1458. ushr v7.4s,v25.4s,#24
  1459. eor w20,w20,w8
  1460. ushr v11.4s,v26.4s,#24
  1461. ror w21,w21,#16
  1462. ushr v15.4s,v27.4s,#24
  1463. ror w17,w17,#16
  1464. ushr v19.4s,v28.4s,#24
  1465. ror w19,w19,#16
  1466. ushr v23.4s,v29.4s,#24
  1467. ror w20,w20,#16
  1468. sli v3.4s,v24.4s,#8
  1469. add w15,w15,w21
  1470. sli v7.4s,v25.4s,#8
  1471. add w16,w16,w17
  1472. sli v11.4s,v26.4s,#8
  1473. add w13,w13,w19
  1474. sli v15.4s,v27.4s,#8
  1475. add w14,w14,w20
  1476. sli v19.4s,v28.4s,#8
  1477. eor w10,w10,w15
  1478. sli v23.4s,v29.4s,#8
  1479. eor w11,w11,w16
  1480. add v2.4s,v2.4s,v3.4s // NEON: c += d
  1481. eor w12,w12,w13
  1482. add v6.4s,v6.4s,v7.4s
  1483. eor w9,w9,w14
  1484. add v10.4s,v10.4s,v11.4s
  1485. ror w10,w10,#20
  1486. add v14.4s,v14.4s,v15.4s
  1487. ror w11,w11,#20
  1488. add v18.4s,v18.4s,v19.4s
  1489. ror w12,w12,#20
  1490. add v22.4s,v22.4s,v23.4s
  1491. ror w9,w9,#20
  1492. eor v24.16b,v1.16b,v2.16b // NEON: t = b ^ c
  1493. add w5,w5,w10
  1494. eor v25.16b,v5.16b,v6.16b
  1495. add w6,w6,w11
  1496. eor v26.16b,v9.16b,v10.16b
  1497. add w7,w7,w12
  1498. eor v27.16b,v13.16b,v14.16b
  1499. add w8,w8,w9
  1500. eor v28.16b,v17.16b,v18.16b
  1501. eor w21,w21,w5
  1502. eor v29.16b,v21.16b,v22.16b
  1503. eor w17,w17,w6
  1504. ushr v1.4s,v24.4s,#25 // NEON: b = t <<< 7 (completed by sli #7 below)
  1505. eor w19,w19,w7
  1506. ushr v5.4s,v25.4s,#25
  1507. eor w20,w20,w8
  1508. ushr v9.4s,v26.4s,#25
  1509. ror w21,w21,#24
  1510. ushr v13.4s,v27.4s,#25
  1511. ror w17,w17,#24
  1512. ushr v17.4s,v28.4s,#25
  1513. ror w19,w19,#24
  1514. ushr v21.4s,v29.4s,#25
  1515. ror w20,w20,#24
  1516. sli v1.4s,v24.4s,#7
  1517. add w15,w15,w21
  1518. sli v5.4s,v25.4s,#7
  1519. add w16,w16,w17
  1520. sli v9.4s,v26.4s,#7
  1521. add w13,w13,w19
  1522. sli v13.4s,v27.4s,#7
  1523. add w14,w14,w20
  1524. sli v17.4s,v28.4s,#7
  1525. eor w10,w10,w15
  1526. sli v21.4s,v29.4s,#7
  1527. eor w11,w11,w16
  1528. ext v2.16b,v2.16b,v2.16b,#8 // NEON: rotate row c by 8 bytes (two words)
  1529. eor w12,w12,w13
  1530. ext v6.16b,v6.16b,v6.16b,#8
  1531. eor w9,w9,w14
  1532. ext v10.16b,v10.16b,v10.16b,#8
  1533. ror w10,w10,#25 // scalar: b <<<= 7 (first scalar double round complete)
  1534. ext v14.16b,v14.16b,v14.16b,#8
  1535. ror w11,w11,#25
  1536. ext v18.16b,v18.16b,v18.16b,#8
  1537. ror w12,w12,#25
  1538. ext v22.16b,v22.16b,v22.16b,#8
  1539. ror w9,w9,#25
  1540. ext v3.16b,v3.16b,v3.16b,#12 // NEON: rotate row d by 12 bytes (three words)
  1541. ext v7.16b,v7.16b,v7.16b,#12
  1542. ext v11.16b,v11.16b,v11.16b,#12
  1543. ext v15.16b,v15.16b,v15.16b,#12
  1544. ext v19.16b,v19.16b,v19.16b,#12
  1545. ext v23.16b,v23.16b,v23.16b,#12
  1546. ext v1.16b,v1.16b,v1.16b,#4 // NEON: rotate row b by 4 bytes (one word); state is now in diagonal form
  1547. ext v5.16b,v5.16b,v5.16b,#4
  1548. ext v9.16b,v9.16b,v9.16b,#4
  1549. ext v13.16b,v13.16b,v13.16b,#4
  1550. ext v17.16b,v17.16b,v17.16b,#4
  1551. ext v21.16b,v21.16b,v21.16b,#4
  1552. add v0.4s,v0.4s,v1.4s // NEON diagonal half: same quarter-round sequence on the rotated rows
  1553. add w5,w5,w9 // scalar: second double round of this iteration, column round
  1554. add v4.4s,v4.4s,v5.4s
  1555. add w6,w6,w10
  1556. add v8.4s,v8.4s,v9.4s
  1557. add w7,w7,w11
  1558. add v12.4s,v12.4s,v13.4s
  1559. add w8,w8,w12
  1560. add v16.4s,v16.4s,v17.4s
  1561. eor w17,w17,w5
  1562. add v20.4s,v20.4s,v21.4s
  1563. eor w19,w19,w6
  1564. eor v3.16b,v3.16b,v0.16b
  1565. eor w20,w20,w7
  1566. eor v7.16b,v7.16b,v4.16b
  1567. eor w21,w21,w8
  1568. eor v11.16b,v11.16b,v8.16b
  1569. ror w17,w17,#16
  1570. eor v15.16b,v15.16b,v12.16b
  1571. ror w19,w19,#16
  1572. eor v19.16b,v19.16b,v16.16b
  1573. ror w20,w20,#16
  1574. eor v23.16b,v23.16b,v20.16b
  1575. ror w21,w21,#16
  1576. rev32 v3.8h,v3.8h
  1577. add w13,w13,w17
  1578. rev32 v7.8h,v7.8h
  1579. add w14,w14,w19
  1580. rev32 v11.8h,v11.8h
  1581. add w15,w15,w20
  1582. rev32 v15.8h,v15.8h
  1583. add w16,w16,w21
  1584. rev32 v19.8h,v19.8h
  1585. eor w9,w9,w13
  1586. rev32 v23.8h,v23.8h
  1587. eor w10,w10,w14
  1588. add v2.4s,v2.4s,v3.4s
  1589. eor w11,w11,w15
  1590. add v6.4s,v6.4s,v7.4s
  1591. eor w12,w12,w16
  1592. add v10.4s,v10.4s,v11.4s
  1593. ror w9,w9,#20
  1594. add v14.4s,v14.4s,v15.4s
  1595. ror w10,w10,#20
  1596. add v18.4s,v18.4s,v19.4s
  1597. ror w11,w11,#20
  1598. add v22.4s,v22.4s,v23.4s
  1599. ror w12,w12,#20
  1600. eor v24.16b,v1.16b,v2.16b
  1601. add w5,w5,w9
  1602. eor v25.16b,v5.16b,v6.16b
  1603. add w6,w6,w10
  1604. eor v26.16b,v9.16b,v10.16b
  1605. add w7,w7,w11
  1606. eor v27.16b,v13.16b,v14.16b
  1607. add w8,w8,w12
  1608. eor v28.16b,v17.16b,v18.16b
  1609. eor w17,w17,w5
  1610. eor v29.16b,v21.16b,v22.16b
  1611. eor w19,w19,w6
  1612. ushr v1.4s,v24.4s,#20
  1613. eor w20,w20,w7
  1614. ushr v5.4s,v25.4s,#20
  1615. eor w21,w21,w8
  1616. ushr v9.4s,v26.4s,#20
  1617. ror w17,w17,#24
  1618. ushr v13.4s,v27.4s,#20
  1619. ror w19,w19,#24
  1620. ushr v17.4s,v28.4s,#20
  1621. ror w20,w20,#24
  1622. ushr v21.4s,v29.4s,#20
  1623. ror w21,w21,#24
  1624. sli v1.4s,v24.4s,#12
  1625. add w13,w13,w17
  1626. sli v5.4s,v25.4s,#12
  1627. add w14,w14,w19
  1628. sli v9.4s,v26.4s,#12
  1629. add w15,w15,w20
  1630. sli v13.4s,v27.4s,#12
  1631. add w16,w16,w21
  1632. sli v17.4s,v28.4s,#12
  1633. eor w9,w9,w13
  1634. sli v21.4s,v29.4s,#12
  1635. eor w10,w10,w14
  1636. add v0.4s,v0.4s,v1.4s
  1637. eor w11,w11,w15
  1638. add v4.4s,v4.4s,v5.4s
  1639. eor w12,w12,w16
  1640. add v8.4s,v8.4s,v9.4s
  1641. ror w9,w9,#25
  1642. add v12.4s,v12.4s,v13.4s
  1643. ror w10,w10,#25
  1644. add v16.4s,v16.4s,v17.4s
  1645. ror w11,w11,#25
  1646. add v20.4s,v20.4s,v21.4s
  1647. ror w12,w12,#25
  1648. eor v24.16b,v3.16b,v0.16b
  1649. add w5,w5,w10 // scalar: second double round of this iteration, diagonal round
  1650. eor v25.16b,v7.16b,v4.16b
  1651. add w6,w6,w11
  1652. eor v26.16b,v11.16b,v8.16b
  1653. add w7,w7,w12
  1654. eor v27.16b,v15.16b,v12.16b
  1655. add w8,w8,w9
  1656. eor v28.16b,v19.16b,v16.16b
  1657. eor w21,w21,w5
  1658. eor v29.16b,v23.16b,v20.16b
  1659. eor w17,w17,w6
  1660. ushr v3.4s,v24.4s,#24
  1661. eor w19,w19,w7
  1662. ushr v7.4s,v25.4s,#24
  1663. eor w20,w20,w8
  1664. ushr v11.4s,v26.4s,#24
  1665. ror w21,w21,#16
  1666. ushr v15.4s,v27.4s,#24
  1667. ror w17,w17,#16
  1668. ushr v19.4s,v28.4s,#24
  1669. ror w19,w19,#16
  1670. ushr v23.4s,v29.4s,#24
  1671. ror w20,w20,#16
  1672. sli v3.4s,v24.4s,#8
  1673. add w15,w15,w21
  1674. sli v7.4s,v25.4s,#8
  1675. add w16,w16,w17
  1676. sli v11.4s,v26.4s,#8
  1677. add w13,w13,w19
  1678. sli v15.4s,v27.4s,#8
  1679. add w14,w14,w20
  1680. sli v19.4s,v28.4s,#8
  1681. eor w10,w10,w15
  1682. sli v23.4s,v29.4s,#8
  1683. eor w11,w11,w16
  1684. add v2.4s,v2.4s,v3.4s
  1685. eor w12,w12,w13
  1686. add v6.4s,v6.4s,v7.4s
  1687. eor w9,w9,w14
  1688. add v10.4s,v10.4s,v11.4s
  1689. ror w10,w10,#20
  1690. add v14.4s,v14.4s,v15.4s
  1691. ror w11,w11,#20
  1692. add v18.4s,v18.4s,v19.4s
  1693. ror w12,w12,#20
  1694. add v22.4s,v22.4s,v23.4s
  1695. ror w9,w9,#20
  1696. eor v24.16b,v1.16b,v2.16b
  1697. add w5,w5,w10
  1698. eor v25.16b,v5.16b,v6.16b
  1699. add w6,w6,w11
  1700. eor v26.16b,v9.16b,v10.16b
  1701. add w7,w7,w12
  1702. eor v27.16b,v13.16b,v14.16b
  1703. add w8,w8,w9
  1704. eor v28.16b,v17.16b,v18.16b
  1705. eor w21,w21,w5
  1706. eor v29.16b,v21.16b,v22.16b
  1707. eor w17,w17,w6
  1708. ushr v1.4s,v24.4s,#25
  1709. eor w19,w19,w7
  1710. ushr v5.4s,v25.4s,#25
  1711. eor w20,w20,w8
  1712. ushr v9.4s,v26.4s,#25
  1713. ror w21,w21,#24
  1714. ushr v13.4s,v27.4s,#25
  1715. ror w17,w17,#24
  1716. ushr v17.4s,v28.4s,#25
  1717. ror w19,w19,#24
  1718. ushr v21.4s,v29.4s,#25
  1719. ror w20,w20,#24
  1720. sli v1.4s,v24.4s,#7
  1721. add w15,w15,w21
  1722. sli v5.4s,v25.4s,#7
  1723. add w16,w16,w17
  1724. sli v9.4s,v26.4s,#7
  1725. add w13,w13,w19
  1726. sli v13.4s,v27.4s,#7
  1727. add w14,w14,w20
  1728. sli v17.4s,v28.4s,#7
  1729. eor w10,w10,w15
  1730. sli v21.4s,v29.4s,#7
  1731. eor w11,w11,w16
  1732. ext v2.16b,v2.16b,v2.16b,#8 // NEON: rotate row c by 8 bytes again (back to its original lane order)
  1733. eor w12,w12,w13
  1734. ext v6.16b,v6.16b,v6.16b,#8
  1735. eor w9,w9,w14
  1736. ext v10.16b,v10.16b,v10.16b,#8
  1737. ror w10,w10,#25
  1738. ext v14.16b,v14.16b,v14.16b,#8
  1739. ror w11,w11,#25
  1740. ext v18.16b,v18.16b,v18.16b,#8
  1741. ror w12,w12,#25
  1742. ext v22.16b,v22.16b,v22.16b,#8
  1743. ror w9,w9,#25
  1744. ext v3.16b,v3.16b,v3.16b,#4 // NEON: undo the earlier 12-byte rotation of row d
  1745. ext v7.16b,v7.16b,v7.16b,#4
  1746. ext v11.16b,v11.16b,v11.16b,#4
  1747. ext v15.16b,v15.16b,v15.16b,#4
  1748. ext v19.16b,v19.16b,v19.16b,#4
  1749. ext v23.16b,v23.16b,v23.16b,#4
  1750. ext v1.16b,v1.16b,v1.16b,#12 // NEON: undo the earlier 4-byte rotation of row b; state is back in column form
  1751. ext v5.16b,v5.16b,v5.16b,#12
  1752. ext v9.16b,v9.16b,v9.16b,#12
  1753. ext v13.16b,v13.16b,v13.16b,#12
  1754. ext v17.16b,v17.16b,v17.16b,#12
  1755. ext v21.16b,v21.16b,v21.16b,#12
  1756. cbnz x4,Loop_lower_neon // repeat until the x4 counter reaches zero
  // Straight-line code reached once x4 hits zero in Loop_lower_neon.
  // It finalizes the scalar block (w5-w17,w19-w21) and the six NEON blocks
  // (v0-v23): the saved input rows are added back in, everything is XORed with
  // 64 + 6*64 bytes read from [x1] and written to [x0]. The counters are then
  // advanced and control goes to one of: another 512-byte pass, the exit path,
  // Loop_outer_neon, or the scalar Loop_outer.
  // v24-v29 were used as rotate temporaries inside the loop, so the rows they
  // normally hold are reloaded from the off-load area at [sp,#0..#95].
  1757. add w5,w5,w22 // accumulate key block: scalar word += low half of x22
  1758. ldp q24,q25,[sp,#0] // reload v24 (added to every row a below) and v25 (row b)
  1759. add x6,x6,x22,lsr#32
  1760. ldp q26,q27,[sp,#32] // reload v26 (row c) and v27 (a row-d addend)
  1761. add w7,w7,w23
  1762. ldp q28,q29,[sp,#64] // reload v28, v29 (row-d addends); v30/v31 were never clobbered in the loop
  1763. add x8,x8,x23,lsr#32
  1764. add v0.4s,v0.4s,v24.4s // row a of all six NEON blocks += v24
  1765. add w9,w9,w24
  1766. add v4.4s,v4.4s,v24.4s
  1767. add x10,x10,x24,lsr#32
  1768. add v8.4s,v8.4s,v24.4s
  1769. add w11,w11,w25
  1770. add v12.4s,v12.4s,v24.4s
  1771. add x12,x12,x25,lsr#32
  1772. add v16.4s,v16.4s,v24.4s
  1773. add w13,w13,w26
  1774. add v20.4s,v20.4s,v24.4s
  1775. add x14,x14,x26,lsr#32
  1776. add v2.4s,v2.4s,v26.4s // row c of all six NEON blocks += v26
  1777. add w15,w15,w27
  1778. add v6.4s,v6.4s,v26.4s
  1779. add x16,x16,x27,lsr#32
  1780. add v10.4s,v10.4s,v26.4s
  1781. add w17,w17,w28
  1782. add v14.4s,v14.4s,v26.4s
  1783. add x19,x19,x28,lsr#32
  1784. add v18.4s,v18.4s,v26.4s
  1785. add w20,w20,w30
  1786. add v22.4s,v22.4s,v26.4s
  1787. add x21,x21,x30,lsr#32
  1788. add v19.4s,v19.4s,v31.4s // +4: fifth block's row d gets v27 (added below) plus v31
  1789. add x5,x5,x6,lsl#32 // pack: x5 = word0 | (word1 << 32)
  1790. add v23.4s,v23.4s,v31.4s // +4: sixth block's row d gets v28 (added below) plus v31
  1791. add x7,x7,x8,lsl#32
  1792. add v3.4s,v3.4s,v27.4s // row d of blocks one to four += v27, v28, v29, v30 respectively
  1793. ldp x6,x8,[x1,#0] // load input for the scalar block
  1794. add v7.4s,v7.4s,v28.4s
  1795. add x9,x9,x10,lsl#32
  1796. add v11.4s,v11.4s,v29.4s
  1797. add x11,x11,x12,lsl#32
  1798. add v15.4s,v15.4s,v30.4s
  1799. ldp x10,x12,[x1,#16]
  1800. add v19.4s,v19.4s,v27.4s
  1801. add x13,x13,x14,lsl#32
  1802. add v23.4s,v23.4s,v28.4s
  1803. add x15,x15,x16,lsl#32
  1804. add v1.4s,v1.4s,v25.4s // row b of all six NEON blocks += v25
  1805. ldp x14,x16,[x1,#32]
  1806. add v5.4s,v5.4s,v25.4s
  1807. add x17,x17,x19,lsl#32
  1808. add v9.4s,v9.4s,v25.4s
  1809. add x20,x20,x21,lsl#32
  1810. add v13.4s,v13.4s,v25.4s
  1811. ldp x19,x21,[x1,#48]
  1812. add v17.4s,v17.4s,v25.4s
  1813. add x1,x1,#64 // advance input pointer past the scalar block's 64 bytes
  1814. add v21.4s,v21.4s,v25.4s
  1815. #ifdef __ARMEB__
  1816. rev x5,x5 // big-endian builds only: byte-reverse each packed 64-bit word before the XOR
  1817. rev x7,x7
  1818. rev x9,x9
  1819. rev x11,x11
  1820. rev x13,x13
  1821. rev x15,x15
  1822. rev x17,x17
  1823. rev x20,x20
  1824. #endif
  1825. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 // input for NEON block one; overwrites v24-v27 (reloaded from the stack further down)
  1826. eor x5,x5,x6 // scalar block: output = block words ^ input
  1827. eor x7,x7,x8
  1828. eor x9,x9,x10
  1829. eor x11,x11,x12
  1830. eor x13,x13,x14
  1831. eor v0.16b,v0.16b,v24.16b // NEON block one ^= input
  1832. eor x15,x15,x16
  1833. eor v1.16b,v1.16b,v25.16b
  1834. eor x17,x17,x19
  1835. eor v2.16b,v2.16b,v26.16b
  1836. eor x20,x20,x21
  1837. eor v3.16b,v3.16b,v27.16b
  1838. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 // input for NEON block two
  1839. stp x5,x7,[x0,#0] // store output (scalar block)
  1840. add x28,x28,#7 // increment counter by 7 (scalar counter word lives in the low half of x28)
  1841. stp x9,x11,[x0,#16]
  1842. stp x13,x15,[x0,#32]
  1843. stp x17,x20,[x0,#48]
  1844. add x0,x0,#64 // advance output pointer past the scalar block
  1845. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 // store NEON block one
  1846. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 are free now: reuse them for block three's input
  1847. eor v4.16b,v4.16b,v24.16b
  1848. eor v5.16b,v5.16b,v25.16b
  1849. eor v6.16b,v6.16b,v26.16b
  1850. eor v7.16b,v7.16b,v27.16b
  1851. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 // store NEON block two
  1852. ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 // input for block four
  1853. eor v8.16b,v8.16b,v0.16b
  1854. ldp q24,q25,[sp,#0] // v24-v27 no longer hold input: restore them from the off-load area
  1855. eor v9.16b,v9.16b,v1.16b
  1856. ldp q26,q27,[sp,#32]
  1857. eor v10.16b,v10.16b,v2.16b
  1858. eor v11.16b,v11.16b,v3.16b
  1859. st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 // store NEON block three
  1860. ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 // input for block five
  1861. eor v12.16b,v12.16b,v4.16b
  1862. eor v13.16b,v13.16b,v5.16b
  1863. eor v14.16b,v14.16b,v6.16b
  1864. eor v15.16b,v15.16b,v7.16b
  1865. st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 // store NEON block four
  1866. ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 // input for block six
  1867. eor v16.16b,v16.16b,v8.16b
  1868. eor v17.16b,v17.16b,v9.16b
  1869. eor v18.16b,v18.16b,v10.16b
  1870. eor v19.16b,v19.16b,v11.16b
  1871. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 // store NEON block five
  1872. shl v0.4s,v31.4s,#1 // 4 -> 8: v0 = v31 doubled, used as the per-pass counter step
  1873. eor v20.16b,v20.16b,v12.16b
  1874. eor v21.16b,v21.16b,v13.16b
  1875. eor v22.16b,v22.16b,v14.16b
  1876. eor v23.16b,v23.16b,v15.16b
  1877. st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 // store NEON block six
  1878. add v27.4s,v27.4s,v0.4s // += 8: advance the four row-d vectors for the next pass
  1879. add v28.4s,v28.4s,v0.4s
  1880. add v29.4s,v29.4s,v0.4s
  1881. add v30.4s,v30.4s,v0.4s
  1882. b.hs Loop_outer_512_neon // nothing in this excerpt before here writes NZCV; NOTE(review): flags presumably come from a subs on x2 at the loop top - confirm
  1883. adds x2,x2,#512 // NOTE(review): presumably undoes a 512 bias on the remaining length in x2; sets Z when the result is zero
  1884. ushr v0.4s,v31.4s,#2 // 4 -> 1
  1885. ldp d8,d9,[sp,#128+0] // meet ABI requirements: reload d8-d15 from the slots at sp+128
  1886. ldp d10,d11,[sp,#128+16]
  1887. ldp d12,d13,[sp,#128+32]
  1888. ldp d14,d15,[sp,#128+48]
  1889. stp q24,q31,[sp,#0] // wipe off-load area: overwrite [sp,#0..#95] with v24/v31
  1890. stp q24,q31,[sp,#32]
  1891. stp q24,q31,[sp,#64]
  1892. b.eq Ldone_512_neon // Z still reflects the adds above (loads, stores and NEON ops leave NZCV alone)
  1893. cmp x2,#192 // choose the path for what remains in x2
  1894. sub v27.4s,v27.4s,v0.4s // -= 1
  1895. sub v28.4s,v28.4s,v0.4s
  1896. sub v29.4s,v29.4s,v0.4s
  1897. add sp,sp,#128 // release the 128 bytes below the d8-d15 slots
  1898. b.hs Loop_outer_neon // x2 >= 192 (unsigned): continue at Loop_outer_neon (defined outside this excerpt)
  1899. eor v25.16b,v25.16b,v25.16b // x2 < 192: zero v25-v30 before switching to the scalar loop
  1900. eor v26.16b,v26.16b,v26.16b
  1901. eor v27.16b,v27.16b,v27.16b
  1902. eor v28.16b,v28.16b,v28.16b
  1903. eor v29.16b,v29.16b,v29.16b
  1904. eor v30.16b,v30.16b,v30.16b
  1905. b Loop_outer // scalar block loop near the top of the file
  // Ldone_512_neon: exit path, reached via b.eq when the adds on x2 after a
  // 512-byte pass produced zero. Restores x19-x28 from the frame addressed by
  // x29, releases the stack and returns.
  1906. Ldone_512_neon:
  1907. ldp x19,x20,[x29,#16] // x19-x28 are reloaded relative to x29, so the sp adjustment can be interleaved
  1908. add sp,sp,#128+64 // release 192 bytes of stack; NOTE(review): sp presumably equals x29 afterwards - the prologue is outside this excerpt
  1909. ldp x21,x22,[x29,#32]
  1910. ldp x23,x24,[x29,#48]
  1911. ldp x25,x26,[x29,#64]
  1912. ldp x27,x28,[x29,#80]
  1913. ldp x29,x30,[sp],#96 // reload x29/x30, then post-increment sp by 96 to pop the register save area
  1914. AARCH64_VALIDATE_LINK_REGISTER // macro defined outside this file; presumably the counterpart of AARCH64_SIGN_LINK_REGISTER
  1915. ret
  1916. #endif
  1917. #endif // !OPENSSL_NO_ASM