chacha-armv8.S 40 KB


  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(BORINGSSL_PREFIX)
  11. #include <boringssl_prefix_symbols_asm.h>
  12. #endif
  13. #include <openssl/arm_arch.h>
  // Read-only data used by both the scalar and the NEON code in this file.
  14. .private_extern _OPENSSL_armcap_P
  15. .section __TEXT,__const
  16. .align 5
  // Lsigma: split into 32-bit halves (low half first) the two quads give the
  // words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, i.e. the ASCII text
  // "expand 32-byte k". The code loads them as the first four state words.
  17. Lsigma:
  18. .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
  // Lone: the 32-bit lanes {1,0,0,0}. The NEON code reads it from the 16 bytes
  // that follow Lsigma and uses it to step lane 0 of the counter vector.
  19. Lone:
  20. .long 1,0,0,0
  // NUL-terminated ASCII identification string:
  // "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
  21. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  22. .align 2
  23. .text
  // _ChaCha20_ctr32 -- exported entry point.
  // Register roles, as used by the code below:
  //   x0 = output pointer      x1 = input pointer      x2 = length in bytes
  //   x3 = pointer to 32 bytes of key
  //   x4 = pointer to the 16-byte counter block
  // A zero length returns at once via Labort. Lengths of 192 bytes or more are
  // handed to ChaCha20_neon when OPENSSL_armcap_P has the ARMV7_NEON bit set;
  // every other call falls into the scalar implementation at Lshort.
  24. .globl _ChaCha20_ctr32
  25. .private_extern _ChaCha20_ctr32
  26. .align 5
  27. _ChaCha20_ctr32:
  28. AARCH64_VALID_CALL_TARGET
  29. cbz x2,Labort
  // x5 = page address of OPENSSL_armcap_P; the page offset is applied by the
  // ldr further down, which is only executed for lengths >= 192.
  30. #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
  31. adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P
  32. #else
  33. adrp x5,_OPENSSL_armcap_P@PAGE
  34. #endif
  35. cmp x2,#192
  36. b.lo Lshort
  37. ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
  38. tst w17,#ARMV7_NEON
  39. b.ne ChaCha20_neon
  // Scalar path. Frame layout: 96 bytes holding x29/x30 and x19-x28, with a
  // further 64 bytes of scratch below it (sp points at the scratch area).
  40. Lshort:
  41. AARCH64_SIGN_LINK_REGISTER
  42. stp x29,x30,[sp,#-96]!
  43. add x29,sp,#0
  44. adrp x5,Lsigma@PAGE
  45. add x5,x5,Lsigma@PAGEOFF
  46. stp x19,x20,[sp,#16]
  47. stp x21,x22,[sp,#32]
  48. stp x23,x24,[sp,#48]
  49. stp x25,x26,[sp,#64]
  50. stp x27,x28,[sp,#80]
  51. sub sp,sp,#64
  // The 64-byte input state is kept in eight 64-bit registers, two 32-bit
  // words per register: x22-x23 sigma, x24-x27 key, x28 and x30 counter block.
  // x30 is reused as a data register; the return address is already saved in
  // the frame and is reloaded by the epilogue.
  52. ldp x22,x23,[x5] // load sigma
  53. ldp x24,x25,[x3] // load key
  54. ldp x26,x27,[x3,#16]
  55. ldp x28,x30,[x4] // load counter
  // Big-endian builds only: swap the two 32-bit halves of each key/counter
  // register (sigma is left as loaded).
  56. #ifdef __ARMEB__
  57. ror x24,x24,#32
  58. ror x25,x25,#32
  59. ror x26,x26,#32
  60. ror x27,x27,#32
  61. ror x28,x28,#32
  62. ror x30,x30,#32
  63. #endif
  // Loop_outer: start of one 64-byte block. Split the eight packed state
  // registers (x22-x28, x30) into sixteen 32-bit working words held in
  // w5-w17 and w19-w21 (x18 is not used). "mov wD,wS" copies the low word,
  // "lsr xD,xS,#32" extracts the high word.
  64. Loop_outer:
  65. mov w5,w22 // unpack key block
  66. lsr x6,x22,#32
  67. mov w7,w23
  68. lsr x8,x23,#32
  69. mov w9,w24
  70. lsr x10,x24,#32
  71. mov w11,w25
  72. lsr x12,x25,#32
  73. mov w13,w26
  74. lsr x14,x26,#32
  75. mov w15,w27
  76. lsr x16,x27,#32
  77. mov w17,w28
  78. lsr x19,x28,#32
  79. mov w20,w30
  80. lsr x21,x30,#32
  // x4 = number of passes through Loop (each pass is two half-rounds).
  81. mov x4,#10
  // x2 = bytes left after this block. The flags set here are consumed after
  // the rounds (b.lo: fewer than 64 bytes were left; b.hi: more data follows),
  // so nothing executed in between may alter the condition flags.
  82. subs x2,x2,#64
  // Loop: one pass = two half-rounds of four quarter-rounds each; x4 counts
  // the passes down to zero. Working words are treated as four rows:
  //   a = w5,w6,w7,w8   b = w9,w10,w11,w12   c = w13,w14,w15,w16
  //   d = w17,w19,w20,w21
  // Each quarter-round is: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
  //                        a+=b; d^=a; d<<<=8;  c+=d; b^=c; b<<<=7
  // "ror wN,wN,#16/#20/#24/#25" is a 32-bit left-rotate by 16/12/8/7.
  // Only non-flag-setting instructions (sub/add/eor/ror/cbnz) are used, so
  // the flags from the subs that precedes this loop remain intact.
  83. Loop:
  84. sub x4,x4,#1
  // First half-round: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
  // (w7,w11,w15,w20), (w8,w12,w16,w21), processed four abreast.
  85. add w5,w5,w9
  86. add w6,w6,w10
  87. add w7,w7,w11
  88. add w8,w8,w12
  89. eor w17,w17,w5
  90. eor w19,w19,w6
  91. eor w20,w20,w7
  92. eor w21,w21,w8
  93. ror w17,w17,#16
  94. ror w19,w19,#16
  95. ror w20,w20,#16
  96. ror w21,w21,#16
  97. add w13,w13,w17
  98. add w14,w14,w19
  99. add w15,w15,w20
  100. add w16,w16,w21
  101. eor w9,w9,w13
  102. eor w10,w10,w14
  103. eor w11,w11,w15
  104. eor w12,w12,w16
  105. ror w9,w9,#20
  106. ror w10,w10,#20
  107. ror w11,w11,#20
  108. ror w12,w12,#20
  109. add w5,w5,w9
  110. add w6,w6,w10
  111. add w7,w7,w11
  112. add w8,w8,w12
  113. eor w17,w17,w5
  114. eor w19,w19,w6
  115. eor w20,w20,w7
  116. eor w21,w21,w8
  117. ror w17,w17,#24
  118. ror w19,w19,#24
  119. ror w20,w20,#24
  120. ror w21,w21,#24
  121. add w13,w13,w17
  122. add w14,w14,w19
  123. add w15,w15,w20
  124. add w16,w16,w21
  125. eor w9,w9,w13
  126. eor w10,w10,w14
  127. eor w11,w11,w15
  128. eor w12,w12,w16
  129. ror w9,w9,#25
  130. ror w10,w10,#25
  131. ror w11,w11,#25
  132. ror w12,w12,#25
  // Second half-round: quarter-rounds on (w5,w10,w15,w21), (w6,w11,w16,w17),
  // (w7,w12,w13,w19), (w8,w9,w14,w20).
  133. add w5,w5,w10
  134. add w6,w6,w11
  135. add w7,w7,w12
  136. add w8,w8,w9
  137. eor w21,w21,w5
  138. eor w17,w17,w6
  139. eor w19,w19,w7
  140. eor w20,w20,w8
  141. ror w21,w21,#16
  142. ror w17,w17,#16
  143. ror w19,w19,#16
  144. ror w20,w20,#16
  145. add w15,w15,w21
  146. add w16,w16,w17
  147. add w13,w13,w19
  148. add w14,w14,w20
  149. eor w10,w10,w15
  150. eor w11,w11,w16
  151. eor w12,w12,w13
  152. eor w9,w9,w14
  153. ror w10,w10,#20
  154. ror w11,w11,#20
  155. ror w12,w12,#20
  156. ror w9,w9,#20
  157. add w5,w5,w10
  158. add w6,w6,w11
  159. add w7,w7,w12
  160. add w8,w8,w9
  161. eor w21,w21,w5
  162. eor w17,w17,w6
  163. eor w19,w19,w7
  164. eor w20,w20,w8
  165. ror w21,w21,#24
  166. ror w17,w17,#24
  167. ror w19,w19,#24
  168. ror w20,w20,#24
  169. add w15,w15,w21
  170. add w16,w16,w17
  171. add w13,w13,w19
  172. add w14,w14,w20
  173. eor w10,w10,w15
  174. eor w11,w11,w16
  175. eor w12,w12,w13
  176. eor w9,w9,w14
  177. ror w10,w10,#25
  178. ror w11,w11,#25
  179. ror w12,w12,#25
  180. ror w9,w9,#25
  181. cbnz x4,Loop
  // Rounds finished: add the original input state back into the working
  // words. The odd-numbered words use a 64-bit add of the source register's
  // high half; a carry out of bit 31 lands in bit 32 and is shifted out by
  // the "lsl#32" in the pack step.
  182. add w5,w5,w22 // accumulate key block
  183. add x6,x6,x22,lsr#32
  184. add w7,w7,w23
  185. add x8,x8,x23,lsr#32
  186. add w9,w9,w24
  187. add x10,x10,x24,lsr#32
  188. add w11,w11,w25
  189. add x12,x12,x25,lsr#32
  190. add w13,w13,w26
  191. add x14,x14,x26,lsr#32
  192. add w15,w15,w27
  193. add x16,x16,x27,lsr#32
  194. add w17,w17,w28
  195. add x19,x19,x28,lsr#32
  196. add w20,w20,w30
  197. add x21,x21,x30,lsr#32
  // Flags still come from "subs x2,x2,#64" at the top of Loop_outer:
  // fewer than 64 bytes were left, so finish with the byte-wise tail.
  198. b.lo Ltail
  // Full block: merge word pairs into eight 64-bit keystream values
  // (x5,x7,x9,x11,x13,x15,x17,x20). Each high-word register is reused for
  // the 64 bytes of input as soon as it has been merged.
  199. add x5,x5,x6,lsl#32 // pack
  200. add x7,x7,x8,lsl#32
  201. ldp x6,x8,[x1,#0] // load input
  202. add x9,x9,x10,lsl#32
  203. add x11,x11,x12,lsl#32
  204. ldp x10,x12,[x1,#16]
  205. add x13,x13,x14,lsl#32
  206. add x15,x15,x16,lsl#32
  207. ldp x14,x16,[x1,#32]
  208. add x17,x17,x19,lsl#32
  209. add x20,x20,x21,lsl#32
  210. ldp x19,x21,[x1,#48]
  211. add x1,x1,#64
  // Big-endian builds only: byte-reverse each packed keystream register
  // before it is combined with the input.
  212. #ifdef __ARMEB__
  213. rev x5,x5
  214. rev x7,x7
  215. rev x9,x9
  216. rev x11,x11
  217. rev x13,x13
  218. rev x15,x15
  219. rev x17,x17
  220. rev x20,x20
  221. #endif
  // output = keystream XOR input
  222. eor x5,x5,x6
  223. eor x7,x7,x8
  224. eor x9,x9,x10
  225. eor x11,x11,x12
  226. eor x13,x13,x14
  227. eor x15,x15,x16
  228. eor x17,x17,x19
  229. eor x20,x20,x21
  230. stp x5,x7,[x0,#0] // store output
  231. add x28,x28,#1 // increment counter
  232. stp x9,x11,[x0,#16]
  233. stp x13,x15,[x0,#32]
  234. stp x17,x20,[x0,#48]
  235. add x0,x0,#64
  // Same flags again: loop while bytes remain; fall through when the length
  // was consumed exactly.
  236. b.hi Loop_outer
  // Epilogue: reload callee-saved registers through x29, release the 64-byte
  // scratch area, then pop the 96-byte frame (restoring x29/x30).
  237. ldp x19,x20,[x29,#16]
  238. add sp,sp,#64
  239. ldp x21,x22,[x29,#32]
  240. ldp x23,x24,[x29,#48]
  241. ldp x25,x26,[x29,#64]
  242. ldp x27,x28,[x29,#80]
  243. ldp x29,x30,[sp],#96
  244. AARCH64_VALIDATE_LINK_REGISTER
  // Labort is also the target of the zero-length check at function entry,
  // which branches here before any frame is created or x30 is signed.
  245. Labort:
  246. ret
  // Ltail: final partial block of the scalar path. x2 went negative in the
  // block loop; adding 64 restores the number of bytes still to process.
  247. .align 4
  248. Ltail:
  249. add x2,x2,#64
  // Less_than_64 is also entered from Ltail_neon with x2 already holding the
  // remaining byte count (below 64) and the scalar block's words unpacked.
  // Pointer setup for a loop that indexes with a negative offset counting
  // up to zero:
  //   x1 = input + len, x4 = sp + len (keystream copy), x0 = output + len - 1,
  //   x2 = -len
  250. Less_than_64:
  251. sub x0,x0,#1
  252. add x1,x1,x2
  253. add x0,x0,x2
  254. add x4,sp,x2
  255. neg x2,x2
  256. add x5,x5,x6,lsl#32 // pack
  257. add x7,x7,x8,lsl#32
  258. add x9,x9,x10,lsl#32
  259. add x11,x11,x12,lsl#32
  260. add x13,x13,x14,lsl#32
  261. add x15,x15,x16,lsl#32
  262. add x17,x17,x19,lsl#32
  263. add x20,x20,x21,lsl#32
  264. #ifdef __ARMEB__
  265. rev x5,x5
  266. rev x7,x7
  267. rev x9,x9
  268. rev x11,x11
  269. rev x13,x13
  270. rev x15,x15
  271. rev x17,x17
  272. rev x20,x20
  273. #endif
  // Spill the whole 64-byte keystream block to the scratch area at sp so it
  // can be read back one byte at a time.
  274. stp x5,x7,[sp,#0]
  275. stp x9,x11,[sp,#16]
  276. stp x13,x15,[sp,#32]
  277. stp x17,x20,[sp,#48]
  // Byte loop: out[i] = in[i] ^ keystream[i]. x2 is incremented before the
  // store, which is why x0 was biased by -1 above.
  278. Loop_tail:
  279. ldrb w10,[x1,x2]
  280. ldrb w11,[x4,x2]
  281. add x2,x2,#1
  282. eor w10,w10,w11
  283. strb w10,[x0,x2]
  284. cbnz x2,Loop_tail
  // Overwrite the spilled keystream with zeros before the stack is released.
  285. stp xzr,xzr,[sp,#0]
  286. stp xzr,xzr,[sp,#16]
  287. stp xzr,xzr,[sp,#32]
  288. stp xzr,xzr,[sp,#48]
  // Epilogue: restore callee-saved registers, drop scratch and frame.
  289. ldp x19,x20,[x29,#16]
  290. add sp,sp,#64
  291. ldp x21,x22,[x29,#32]
  292. ldp x23,x24,[x29,#48]
  293. ldp x25,x26,[x29,#64]
  294. ldp x27,x28,[x29,#80]
  295. ldp x29,x30,[sp],#96
  296. AARCH64_VALIDATE_LINK_REGISTER
  297. ret
  // ChaCha20_neon: reached by a branch from _ChaCha20_ctr32 (length >= 192 and
  // NEON available), with x0-x4 still holding the entry values:
  //   x0 = output, x1 = input, x2 = length in bytes, x3 = key, x4 = counter.
  // Each outer iteration produces 256 bytes: one block in the scalar
  // registers plus three blocks in NEON registers.
  // Lengths of 512 bytes or more continue at L512_or_more_neon once the
  // 96-byte register frame has been saved.
  298. .align 5
  299. ChaCha20_neon:
  300. AARCH64_SIGN_LINK_REGISTER
  301. stp x29,x30,[sp,#-96]!
  302. add x29,sp,#0
  303. adrp x5,Lsigma@PAGE
  304. add x5,x5,Lsigma@PAGEOFF
  305. stp x19,x20,[sp,#16]
  306. stp x21,x22,[sp,#32]
  307. stp x23,x24,[sp,#48]
  308. stp x25,x26,[sp,#64]
  309. stp x27,x28,[sp,#80]
  310. cmp x2,#512
  311. b.hs L512_or_more_neon
  // 64 bytes of scratch below the frame, used only for a final partial block.
  312. sub sp,sp,#64
  // The input state is loaded twice: packed into x22-x28/x30 for the scalar
  // block, and as vectors v24 (sigma), v25-v26 (key), v27 (counter block).
  // The post-incremented ld1 leaves x5 pointing 16 bytes past Lsigma, from
  // where v31 is then loaded (Lone = {1,0,0,0}).
  // x30 is used as a data register; the saved copy is restored on exit.
  313. ldp x22,x23,[x5] // load sigma
  314. ld1 {v24.4s},[x5],#16
  315. ldp x24,x25,[x3] // load key
  316. ldp x26,x27,[x3,#16]
  317. ld1 {v25.4s,v26.4s},[x3]
  318. ldp x28,x30,[x4] // load counter
  319. ld1 {v27.4s},[x4]
  320. ld1 {v31.4s},[x5]
  // Big-endian builds only: swap the 32-bit elements within each 64-bit half
  // of v24 and the 32-bit halves of the scalar key/counter registers.
  321. #ifdef __ARMEB__
  322. rev64 v24.4s,v24.4s
  323. ror x24,x24,#32
  324. ror x25,x25,#32
  325. ror x26,x26,#32
  326. ror x27,x27,#32
  327. ror x28,x28,#32
  328. ror x30,x30,#32
  329. #endif
  // The scalar block uses the counter as loaded; the three NEON blocks use
  // counter+1 (v27), counter+2 (v28) and counter+3 (v29) in lane 0.
  // v31 then becomes {4,0,0,0}, the per-iteration counter step.
  330. add v27.4s,v27.4s,v31.4s // += 1
  331. add v28.4s,v27.4s,v31.4s
  332. add v29.4s,v28.4s,v31.4s
  333. shl v31.4s,v31.4s,#2 // 1 -> 4
  // Loop_outer_neon: set up the working state for four blocks.
  //   scalar block : 16 words in w5-w17, w19-w21 (unpacked from x22-x28, x30)
  //   NEON block A : v0  (sigma), v1  (key lo), v2  (key hi), v3  = v27
  //   NEON block B : v4,          v5,           v6,           v7  = v28
  //   NEON block C : v16,         v17,          v18,          v19 = v29
  // Scalar and vector instructions are interleaved throughout.
  334. Loop_outer_neon:
  335. mov w5,w22 // unpack key block
  336. lsr x6,x22,#32
  337. mov v0.16b,v24.16b
  338. mov w7,w23
  339. lsr x8,x23,#32
  340. mov v4.16b,v24.16b
  341. mov w9,w24
  342. lsr x10,x24,#32
  343. mov v16.16b,v24.16b
  344. mov w11,w25
  345. mov v1.16b,v25.16b
  346. lsr x12,x25,#32
  347. mov v5.16b,v25.16b
  348. mov w13,w26
  349. mov v17.16b,v25.16b
  350. lsr x14,x26,#32
  351. mov v3.16b,v27.16b
  352. mov w15,w27
  353. mov v7.16b,v28.16b
  354. lsr x16,x27,#32
  355. mov v19.16b,v29.16b
  356. mov w17,w28
  357. mov v2.16b,v26.16b
  358. lsr x19,x28,#32
  359. mov v6.16b,v26.16b
  360. mov w20,w30
  361. mov v18.16b,v26.16b
  362. lsr x21,x30,#32
  // x4 = number of passes through Loop_neon (two half-rounds per pass).
  363. mov x4,#10
  // x2 = bytes left after these four blocks. The flags set here are consumed
  // after the rounds (b.lo Ltail_neon / b.hi Loop_outer_neon), so nothing in
  // between may alter the condition flags.
  364. subs x2,x2,#256
  // Loop_neon: one pass = two half-rounds applied to all four blocks; x4
  // counts passes down to zero. Scalar and NEON instructions alternate.
  //
  // Scalar block: rows a = w5-w8, b = w9-w12, c = w13-w16,
  // d = w17,w19,w20,w21; "ror #16/#20/#24/#25" is a left-rotate by 16/12/8/7.
  //
  // NEON blocks: each vector is one row of a block (a = v0/v4/v16,
  // b = v1/v5/v17, c = v2/v6/v18, d = v3/v7/v19); v20-v22 are temporaries.
  //   rev32 vN.8h            : left-rotate each 32-bit lane by 16
  //   ushr #20 then sli #12  : left-rotate each lane by 12
  //   ushr #24 then sli #8   : left-rotate each lane by 8
  //   ushr #25 then sli #7   : left-rotate each lane by 7
  //   ext vN,vN,vN,#4/#8/#12 : rotate the four lanes of a row by 1/2/3
  // No instruction in the loop sets the condition flags.
  365. Loop_neon:
  366. sub x4,x4,#1
  // ---- first half-round: NEON rows as loaded; scalar quarter-rounds on
  // (w5,w9,w13,w17), (w6,w10,w14,w19), (w7,w11,w15,w20), (w8,w12,w16,w21).
  367. add v0.4s,v0.4s,v1.4s
  368. add w5,w5,w9
  369. add v4.4s,v4.4s,v5.4s
  370. add w6,w6,w10
  371. add v16.4s,v16.4s,v17.4s
  372. add w7,w7,w11
  373. eor v3.16b,v3.16b,v0.16b
  374. add w8,w8,w12
  375. eor v7.16b,v7.16b,v4.16b
  376. eor w17,w17,w5
  377. eor v19.16b,v19.16b,v16.16b
  378. eor w19,w19,w6
  379. rev32 v3.8h,v3.8h
  380. eor w20,w20,w7
  381. rev32 v7.8h,v7.8h
  382. eor w21,w21,w8
  383. rev32 v19.8h,v19.8h
  384. ror w17,w17,#16
  385. add v2.4s,v2.4s,v3.4s
  386. ror w19,w19,#16
  387. add v6.4s,v6.4s,v7.4s
  388. ror w20,w20,#16
  389. add v18.4s,v18.4s,v19.4s
  390. ror w21,w21,#16
  391. eor v20.16b,v1.16b,v2.16b
  392. add w13,w13,w17
  393. eor v21.16b,v5.16b,v6.16b
  394. add w14,w14,w19
  395. eor v22.16b,v17.16b,v18.16b
  396. add w15,w15,w20
  397. ushr v1.4s,v20.4s,#20
  398. add w16,w16,w21
  399. ushr v5.4s,v21.4s,#20
  400. eor w9,w9,w13
  401. ushr v17.4s,v22.4s,#20
  402. eor w10,w10,w14
  403. sli v1.4s,v20.4s,#12
  404. eor w11,w11,w15
  405. sli v5.4s,v21.4s,#12
  406. eor w12,w12,w16
  407. sli v17.4s,v22.4s,#12
  408. ror w9,w9,#20
  409. add v0.4s,v0.4s,v1.4s
  410. ror w10,w10,#20
  411. add v4.4s,v4.4s,v5.4s
  412. ror w11,w11,#20
  413. add v16.4s,v16.4s,v17.4s
  414. ror w12,w12,#20
  415. eor v20.16b,v3.16b,v0.16b
  416. add w5,w5,w9
  417. eor v21.16b,v7.16b,v4.16b
  418. add w6,w6,w10
  419. eor v22.16b,v19.16b,v16.16b
  420. add w7,w7,w11
  421. ushr v3.4s,v20.4s,#24
  422. add w8,w8,w12
  423. ushr v7.4s,v21.4s,#24
  424. eor w17,w17,w5
  425. ushr v19.4s,v22.4s,#24
  426. eor w19,w19,w6
  427. sli v3.4s,v20.4s,#8
  428. eor w20,w20,w7
  429. sli v7.4s,v21.4s,#8
  430. eor w21,w21,w8
  431. sli v19.4s,v22.4s,#8
  432. ror w17,w17,#24
  433. add v2.4s,v2.4s,v3.4s
  434. ror w19,w19,#24
  435. add v6.4s,v6.4s,v7.4s
  436. ror w20,w20,#24
  437. add v18.4s,v18.4s,v19.4s
  438. ror w21,w21,#24
  439. eor v20.16b,v1.16b,v2.16b
  440. add w13,w13,w17
  441. eor v21.16b,v5.16b,v6.16b
  442. add w14,w14,w19
  443. eor v22.16b,v17.16b,v18.16b
  444. add w15,w15,w20
  445. ushr v1.4s,v20.4s,#25
  446. add w16,w16,w21
  447. ushr v5.4s,v21.4s,#25
  448. eor w9,w9,w13
  449. ushr v17.4s,v22.4s,#25
  450. eor w10,w10,w14
  451. sli v1.4s,v20.4s,#7
  452. eor w11,w11,w15
  453. sli v5.4s,v21.4s,#7
  454. eor w12,w12,w16
  455. sli v17.4s,v22.4s,#7
  456. ror w9,w9,#25
  // Rotate NEON rows c, d and b by 8, 12 and 4 bytes so that the second
  // half-round can reuse the same lane-wise instruction sequence.
  457. ext v2.16b,v2.16b,v2.16b,#8
  458. ror w10,w10,#25
  459. ext v6.16b,v6.16b,v6.16b,#8
  460. ror w11,w11,#25
  461. ext v18.16b,v18.16b,v18.16b,#8
  462. ror w12,w12,#25
  463. ext v3.16b,v3.16b,v3.16b,#12
  464. ext v7.16b,v7.16b,v7.16b,#12
  465. ext v19.16b,v19.16b,v19.16b,#12
  466. ext v1.16b,v1.16b,v1.16b,#4
  467. ext v5.16b,v5.16b,v5.16b,#4
  468. ext v17.16b,v17.16b,v17.16b,#4
  // ---- second half-round: scalar quarter-rounds on (w5,w10,w15,w21),
  // (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20); NEON on the
  // lane-rotated rows.
  469. add v0.4s,v0.4s,v1.4s
  470. add w5,w5,w10
  471. add v4.4s,v4.4s,v5.4s
  472. add w6,w6,w11
  473. add v16.4s,v16.4s,v17.4s
  474. add w7,w7,w12
  475. eor v3.16b,v3.16b,v0.16b
  476. add w8,w8,w9
  477. eor v7.16b,v7.16b,v4.16b
  478. eor w21,w21,w5
  479. eor v19.16b,v19.16b,v16.16b
  480. eor w17,w17,w6
  481. rev32 v3.8h,v3.8h
  482. eor w19,w19,w7
  483. rev32 v7.8h,v7.8h
  484. eor w20,w20,w8
  485. rev32 v19.8h,v19.8h
  486. ror w21,w21,#16
  487. add v2.4s,v2.4s,v3.4s
  488. ror w17,w17,#16
  489. add v6.4s,v6.4s,v7.4s
  490. ror w19,w19,#16
  491. add v18.4s,v18.4s,v19.4s
  492. ror w20,w20,#16
  493. eor v20.16b,v1.16b,v2.16b
  494. add w15,w15,w21
  495. eor v21.16b,v5.16b,v6.16b
  496. add w16,w16,w17
  497. eor v22.16b,v17.16b,v18.16b
  498. add w13,w13,w19
  499. ushr v1.4s,v20.4s,#20
  500. add w14,w14,w20
  501. ushr v5.4s,v21.4s,#20
  502. eor w10,w10,w15
  503. ushr v17.4s,v22.4s,#20
  504. eor w11,w11,w16
  505. sli v1.4s,v20.4s,#12
  506. eor w12,w12,w13
  507. sli v5.4s,v21.4s,#12
  508. eor w9,w9,w14
  509. sli v17.4s,v22.4s,#12
  510. ror w10,w10,#20
  511. add v0.4s,v0.4s,v1.4s
  512. ror w11,w11,#20
  513. add v4.4s,v4.4s,v5.4s
  514. ror w12,w12,#20
  515. add v16.4s,v16.4s,v17.4s
  516. ror w9,w9,#20
  517. eor v20.16b,v3.16b,v0.16b
  518. add w5,w5,w10
  519. eor v21.16b,v7.16b,v4.16b
  520. add w6,w6,w11
  521. eor v22.16b,v19.16b,v16.16b
  522. add w7,w7,w12
  523. ushr v3.4s,v20.4s,#24
  524. add w8,w8,w9
  525. ushr v7.4s,v21.4s,#24
  526. eor w21,w21,w5
  527. ushr v19.4s,v22.4s,#24
  528. eor w17,w17,w6
  529. sli v3.4s,v20.4s,#8
  530. eor w19,w19,w7
  531. sli v7.4s,v21.4s,#8
  532. eor w20,w20,w8
  533. sli v19.4s,v22.4s,#8
  534. ror w21,w21,#24
  535. add v2.4s,v2.4s,v3.4s
  536. ror w17,w17,#24
  537. add v6.4s,v6.4s,v7.4s
  538. ror w19,w19,#24
  539. add v18.4s,v18.4s,v19.4s
  540. ror w20,w20,#24
  541. eor v20.16b,v1.16b,v2.16b
  542. add w15,w15,w21
  543. eor v21.16b,v5.16b,v6.16b
  544. add w16,w16,w17
  545. eor v22.16b,v17.16b,v18.16b
  546. add w13,w13,w19
  547. ushr v1.4s,v20.4s,#25
  548. add w14,w14,w20
  549. ushr v5.4s,v21.4s,#25
  550. eor w10,w10,w15
  551. ushr v17.4s,v22.4s,#25
  552. eor w11,w11,w16
  553. sli v1.4s,v20.4s,#7
  554. eor w12,w12,w13
  555. sli v5.4s,v21.4s,#7
  556. eor w9,w9,w14
  557. sli v17.4s,v22.4s,#7
  558. ror w10,w10,#25
  // Undo the lane rotation (c by 8, d by 4, b by 12 bytes) so the rows are
  // back in their original lane order for the next pass.
  559. ext v2.16b,v2.16b,v2.16b,#8
  560. ror w11,w11,#25
  561. ext v6.16b,v6.16b,v6.16b,#8
  562. ror w12,w12,#25
  563. ext v18.16b,v18.16b,v18.16b,#8
  564. ror w9,w9,#25
  565. ext v3.16b,v3.16b,v3.16b,#4
  566. ext v7.16b,v7.16b,v7.16b,#4
  567. ext v19.16b,v19.16b,v19.16b,#4
  568. ext v1.16b,v1.16b,v1.16b,#12
  569. ext v5.16b,v5.16b,v5.16b,#12
  570. ext v17.16b,v17.16b,v17.16b,#12
  571. cbnz x4,Loop_neon
  // Rounds finished: add the input state back into all four blocks.
  // Scalar block: from x22-x28/x30 (64-bit adds for the high words; the
  // stray carry is shifted out when the words are packed).
  // NEON blocks: rows a += v24, b += v25, c += v26, and row d += the
  // per-block counter vector (v27, v28, v29).
  572. add w5,w5,w22 // accumulate key block
  573. add v0.4s,v0.4s,v24.4s
  574. add x6,x6,x22,lsr#32
  575. add v4.4s,v4.4s,v24.4s
  576. add w7,w7,w23
  577. add v16.4s,v16.4s,v24.4s
  578. add x8,x8,x23,lsr#32
  579. add v2.4s,v2.4s,v26.4s
  580. add w9,w9,w24
  581. add v6.4s,v6.4s,v26.4s
  582. add x10,x10,x24,lsr#32
  583. add v18.4s,v18.4s,v26.4s
  584. add w11,w11,w25
  585. add v3.4s,v3.4s,v27.4s
  586. add x12,x12,x25,lsr#32
  587. add w13,w13,w26
  588. add v7.4s,v7.4s,v28.4s
  589. add x14,x14,x26,lsr#32
  590. add w15,w15,w27
  591. add v19.4s,v19.4s,v29.4s
  592. add x16,x16,x27,lsr#32
  593. add w17,w17,w28
  594. add v1.4s,v1.4s,v25.4s
  595. add x19,x19,x28,lsr#32
  596. add w20,w20,w30
  597. add v5.4s,v5.4s,v25.4s
  598. add x21,x21,x30,lsr#32
  599. add v17.4s,v17.4s,v25.4s
  // Flags still come from "subs x2,x2,#256" at the top of Loop_outer_neon:
  // fewer than 256 bytes were left, so handle the remainder piecewise.
  600. b.lo Ltail_neon
  // Full 256 bytes. First 64 bytes: scalar block, packed into
  // x5,x7,x9,x11,x13,x15,x17,x20 and XORed with input loaded into the
  // freed high-word registers.
  601. add x5,x5,x6,lsl#32 // pack
  602. add x7,x7,x8,lsl#32
  603. ldp x6,x8,[x1,#0] // load input
  604. add x9,x9,x10,lsl#32
  605. add x11,x11,x12,lsl#32
  606. ldp x10,x12,[x1,#16]
  607. add x13,x13,x14,lsl#32
  608. add x15,x15,x16,lsl#32
  609. ldp x14,x16,[x1,#32]
  610. add x17,x17,x19,lsl#32
  611. add x20,x20,x21,lsl#32
  612. ldp x19,x21,[x1,#48]
  613. add x1,x1,#64
  614. #ifdef __ARMEB__
  615. rev x5,x5
  616. rev x7,x7
  617. rev x9,x9
  618. rev x11,x11
  619. rev x13,x13
  620. rev x15,x15
  621. rev x17,x17
  622. rev x20,x20
  623. #endif
  // Bytes 64..127 of input go to v20-v23 and are XORed into NEON block A.
  624. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  625. eor x5,x5,x6
  626. eor x7,x7,x8
  627. eor x9,x9,x10
  628. eor x11,x11,x12
  629. eor x13,x13,x14
  630. eor v0.16b,v0.16b,v20.16b
  631. eor x15,x15,x16
  632. eor v1.16b,v1.16b,v21.16b
  633. eor x17,x17,x19
  634. eor v2.16b,v2.16b,v22.16b
  635. eor x20,x20,x21
  636. eor v3.16b,v3.16b,v23.16b
  // Bytes 128..191 of input, for NEON block B.
  637. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  // Store the scalar block and advance every counter copy by 4
  // (x28 for the scalar block; v27-v29 via v31 = {4,0,0,0}).
  638. stp x5,x7,[x0,#0] // store output
  639. add x28,x28,#4 // increment counter
  640. stp x9,x11,[x0,#16]
  641. add v27.4s,v27.4s,v31.4s // += 4
  642. stp x13,x15,[x0,#32]
  643. add v28.4s,v28.4s,v31.4s
  644. stp x17,x20,[x0,#48]
  645. add v29.4s,v29.4s,v31.4s
  646. add x0,x0,#64
  // Store block A, then reuse v0-v3 for bytes 192..255 of input (block C).
  647. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  648. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
  649. eor v4.16b,v4.16b,v20.16b
  650. eor v5.16b,v5.16b,v21.16b
  651. eor v6.16b,v6.16b,v22.16b
  652. eor v7.16b,v7.16b,v23.16b
  653. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  654. eor v16.16b,v16.16b,v0.16b
  655. eor v17.16b,v17.16b,v1.16b
  656. eor v18.16b,v18.16b,v2.16b
  657. eor v19.16b,v19.16b,v3.16b
  658. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  // Same flags again: loop while bytes remain; fall through when the length
  // was consumed exactly.
  659. b.hi Loop_outer_neon
  // Epilogue: restore callee-saved registers, drop scratch and frame.
  660. ldp x19,x20,[x29,#16]
  661. add sp,sp,#64
  662. ldp x21,x22,[x29,#32]
  663. ldp x23,x24,[x29,#48]
  664. ldp x25,x26,[x29,#64]
  665. ldp x27,x28,[x29,#80]
  666. ldp x29,x30,[sp],#96
  667. AARCH64_VALIDATE_LINK_REGISTER
  668. ret
  // Ltail_neon: fewer than 256 bytes remain. Four keystream blocks are
  // available: the scalar block (still unpacked in w5-w21), then v0-v3,
  // v4-v7 and v16-v19, consumed in that order. Adding 256 back turns x2
  // into the remaining byte count.
  669. Ltail_neon:
  670. add x2,x2,#256
  671. cmp x2,#64
  // Under 64 bytes: the shared scalar tail packs and spills the scalar block.
  672. b.lo Less_than_64
  // At least 64 bytes: emit the scalar block as a full 64-byte block.
  673. add x5,x5,x6,lsl#32 // pack
  674. add x7,x7,x8,lsl#32
  675. ldp x6,x8,[x1,#0] // load input
  676. add x9,x9,x10,lsl#32
  677. add x11,x11,x12,lsl#32
  678. ldp x10,x12,[x1,#16]
  679. add x13,x13,x14,lsl#32
  680. add x15,x15,x16,lsl#32
  681. ldp x14,x16,[x1,#32]
  682. add x17,x17,x19,lsl#32
  683. add x20,x20,x21,lsl#32
  684. ldp x19,x21,[x1,#48]
  685. add x1,x1,#64
  686. #ifdef __ARMEB__
  687. rev x5,x5
  688. rev x7,x7
  689. rev x9,x9
  690. rev x11,x11
  691. rev x13,x13
  692. rev x15,x15
  693. rev x17,x17
  694. rev x20,x20
  695. #endif
  696. eor x5,x5,x6
  697. eor x7,x7,x8
  698. eor x9,x9,x10
  699. eor x11,x11,x12
  700. eor x13,x13,x14
  701. eor x15,x15,x16
  702. eor x17,x17,x19
  703. eor x20,x20,x21
  704. stp x5,x7,[x0,#0] // store output
  705. add x28,x28,#4 // increment counter
  706. stp x9,x11,[x0,#16]
  707. stp x13,x15,[x0,#32]
  708. stp x17,x20,[x0,#48]
  709. add x0,x0,#64
  // Flags are still those of "cmp x2,#64" above (nothing since then sets
  // flags): exactly 64 bytes were left, so the job is complete.
  710. b.eq Ldone_neon
  711. sub x2,x2,#64
  712. cmp x2,#64
  // Under 64 bytes left: block A (v0-v3) supplies the partial keystream.
  713. b.lo Less_than_128
  // Otherwise use block A for a full 64 bytes.
  714. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  715. eor v0.16b,v0.16b,v20.16b
  716. eor v1.16b,v1.16b,v21.16b
  717. eor v2.16b,v2.16b,v22.16b
  718. eor v3.16b,v3.16b,v23.16b
  719. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
  720. b.eq Ldone_neon
  721. sub x2,x2,#64
  722. cmp x2,#64
  // Under 64 bytes left: block B (v4-v7) supplies the partial keystream.
  723. b.lo Less_than_192
  // Otherwise use block B for a full 64 bytes.
  724. ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
  725. eor v4.16b,v4.16b,v20.16b
  726. eor v5.16b,v5.16b,v21.16b
  727. eor v6.16b,v6.16b,v22.16b
  728. eor v7.16b,v7.16b,v23.16b
  729. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  730. b.eq Ldone_neon
  // Whatever is left (under 64 bytes) uses block C (v16-v19). In each case
  // the chosen keystream block is spilled to the 64-byte scratch area at sp
  // for the byte loop at Last_neon.
  731. sub x2,x2,#64
  732. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
  733. b Last_neon
  734. Less_than_128:
  735. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
  736. b Last_neon
  737. Less_than_192:
  738. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
  739. b Last_neon
  // Last_neon: byte-wise XOR of the final partial block. On entry x2 is the
  // number of bytes left and the matching keystream block sits at [sp].
  // Pointers are moved to the end of their ranges and x2 is negated, so the
  // loop indexes with a negative offset that counts up to zero:
  //   x1 = input + len, x4 = sp + len, x0 = output + len - 1, x2 = -len
  740. .align 4
  741. Last_neon:
  742. sub x0,x0,#1
  743. add x1,x1,x2
  744. add x0,x0,x2
  745. add x4,sp,x2
  746. neg x2,x2
  // out[i] = in[i] ^ keystream[i]; x2 is incremented before the store, which
  // is why x0 was biased by -1 above.
  747. Loop_tail_neon:
  748. ldrb w10,[x1,x2]
  749. ldrb w11,[x4,x2]
  750. add x2,x2,#1
  751. eor w10,w10,w11
  752. strb w10,[x0,x2]
  753. cbnz x2,Loop_tail_neon
  // Overwrite the spilled keystream with zeros before the stack is released.
  754. stp xzr,xzr,[sp,#0]
  755. stp xzr,xzr,[sp,#16]
  756. stp xzr,xzr,[sp,#32]
  757. stp xzr,xzr,[sp,#48]
  // Ldone_neon: common exit, also used when the remaining length was an
  // exact multiple of 64 (the scratch area is not wiped on that path).
  // Restores callee-saved registers, drops the scratch area and the frame.
  758. Ldone_neon:
  759. ldp x19,x20,[x29,#16]
  760. add sp,sp,#64
  761. ldp x21,x22,[x29,#32]
  762. ldp x23,x24,[x29,#48]
  763. ldp x25,x26,[x29,#64]
  764. ldp x27,x28,[x29,#80]
  765. ldp x29,x30,[sp],#96
  766. AARCH64_VALIDATE_LINK_REGISTER
  767. ret
  768. .align 5
  769. ChaCha20_512_neon:
  770. AARCH64_SIGN_LINK_REGISTER
  771. stp x29,x30,[sp,#-96]!
  772. add x29,sp,#0
  773. adrp x5,Lsigma@PAGE
  774. add x5,x5,Lsigma@PAGEOFF
  775. stp x19,x20,[sp,#16]
  776. stp x21,x22,[sp,#32]
  777. stp x23,x24,[sp,#48]
  778. stp x25,x26,[sp,#64]
  779. stp x27,x28,[sp,#80]
  780. L512_or_more_neon:
  781. sub sp,sp,#128+64
  782. ldp x22,x23,[x5] // load sigma
  783. ld1 {v24.4s},[x5],#16
  784. ldp x24,x25,[x3] // load key
  785. ldp x26,x27,[x3,#16]
  786. ld1 {v25.4s,v26.4s},[x3]
  787. ldp x28,x30,[x4] // load counter
  788. ld1 {v27.4s},[x4]
  789. ld1 {v31.4s},[x5]
  790. #ifdef __ARMEB__
  791. rev64 v24.4s,v24.4s
  792. ror x24,x24,#32
  793. ror x25,x25,#32
  794. ror x26,x26,#32
  795. ror x27,x27,#32
  796. ror x28,x28,#32
  797. ror x30,x30,#32
  798. #endif
  799. add v27.4s,v27.4s,v31.4s // += 1
  800. stp q24,q25,[sp,#0] // off-load key block, invariant part
  801. add v27.4s,v27.4s,v31.4s // not typo
  802. str q26,[sp,#32]
  803. add v28.4s,v27.4s,v31.4s
  804. add v29.4s,v28.4s,v31.4s
  805. add v30.4s,v29.4s,v31.4s
  806. shl v31.4s,v31.4s,#2 // 1 -> 4
  807. stp d8,d9,[sp,#128+0] // meet ABI requirements
  808. stp d10,d11,[sp,#128+16]
  809. stp d12,d13,[sp,#128+32]
  810. stp d14,d15,[sp,#128+48]
  811. sub x2,x2,#512 // not typo
  812. Loop_outer_512_neon:
  813. mov v0.16b,v24.16b
  814. mov v4.16b,v24.16b
  815. mov v8.16b,v24.16b
  816. mov v12.16b,v24.16b
  817. mov v16.16b,v24.16b
  818. mov v20.16b,v24.16b
  819. mov v1.16b,v25.16b
  820. mov w5,w22 // unpack key block
  821. mov v5.16b,v25.16b
  822. lsr x6,x22,#32
  823. mov v9.16b,v25.16b
  824. mov w7,w23
  825. mov v13.16b,v25.16b
  826. lsr x8,x23,#32
  827. mov v17.16b,v25.16b
  828. mov w9,w24
  829. mov v21.16b,v25.16b
  830. lsr x10,x24,#32
  831. mov v3.16b,v27.16b
  832. mov w11,w25
  833. mov v7.16b,v28.16b
  834. lsr x12,x25,#32
  835. mov v11.16b,v29.16b
  836. mov w13,w26
  837. mov v15.16b,v30.16b
  838. lsr x14,x26,#32
  839. mov v2.16b,v26.16b
  840. mov w15,w27
  841. mov v6.16b,v26.16b
  842. lsr x16,x27,#32
  843. add v19.4s,v3.4s,v31.4s // +4
  844. mov w17,w28
  845. add v23.4s,v7.4s,v31.4s // +4
  846. lsr x19,x28,#32
  847. mov v10.16b,v26.16b
  848. mov w20,w30
  849. mov v14.16b,v26.16b
  850. lsr x21,x30,#32
  851. mov v18.16b,v26.16b
  852. stp q27,q28,[sp,#48] // off-load key block, variable part
  853. mov v22.16b,v26.16b
  854. str q29,[sp,#80]
  855. mov x4,#5
  856. subs x2,x2,#512
  857. Loop_upper_neon:
  858. sub x4,x4,#1
  859. add v0.4s,v0.4s,v1.4s
  860. add w5,w5,w9
  861. add v4.4s,v4.4s,v5.4s
  862. add w6,w6,w10
  863. add v8.4s,v8.4s,v9.4s
  864. add w7,w7,w11
  865. add v12.4s,v12.4s,v13.4s
  866. add w8,w8,w12
  867. add v16.4s,v16.4s,v17.4s
  868. eor w17,w17,w5
  869. add v20.4s,v20.4s,v21.4s
  870. eor w19,w19,w6
  871. eor v3.16b,v3.16b,v0.16b
  872. eor w20,w20,w7
  873. eor v7.16b,v7.16b,v4.16b
  874. eor w21,w21,w8
  875. eor v11.16b,v11.16b,v8.16b
  876. ror w17,w17,#16
  877. eor v15.16b,v15.16b,v12.16b
  878. ror w19,w19,#16
  879. eor v19.16b,v19.16b,v16.16b
  880. ror w20,w20,#16
  881. eor v23.16b,v23.16b,v20.16b
  882. ror w21,w21,#16
  883. rev32 v3.8h,v3.8h
  884. add w13,w13,w17
  885. rev32 v7.8h,v7.8h
  886. add w14,w14,w19
  887. rev32 v11.8h,v11.8h
  888. add w15,w15,w20
  889. rev32 v15.8h,v15.8h
  890. add w16,w16,w21
  891. rev32 v19.8h,v19.8h
  892. eor w9,w9,w13
  893. rev32 v23.8h,v23.8h
  894. eor w10,w10,w14
  895. add v2.4s,v2.4s,v3.4s
  896. eor w11,w11,w15
  897. add v6.4s,v6.4s,v7.4s
  898. eor w12,w12,w16
  899. add v10.4s,v10.4s,v11.4s
  900. ror w9,w9,#20
  901. add v14.4s,v14.4s,v15.4s
  902. ror w10,w10,#20
  903. add v18.4s,v18.4s,v19.4s
  904. ror w11,w11,#20
  905. add v22.4s,v22.4s,v23.4s
  906. ror w12,w12,#20
  907. eor v24.16b,v1.16b,v2.16b
  908. add w5,w5,w9
  909. eor v25.16b,v5.16b,v6.16b
  910. add w6,w6,w10
  911. eor v26.16b,v9.16b,v10.16b
  912. add w7,w7,w11
  913. eor v27.16b,v13.16b,v14.16b
  914. add w8,w8,w12
  915. eor v28.16b,v17.16b,v18.16b
  916. eor w17,w17,w5
  917. eor v29.16b,v21.16b,v22.16b
  918. eor w19,w19,w6
  919. ushr v1.4s,v24.4s,#20
  920. eor w20,w20,w7
  921. ushr v5.4s,v25.4s,#20
  922. eor w21,w21,w8
  923. ushr v9.4s,v26.4s,#20
  924. ror w17,w17,#24
  925. ushr v13.4s,v27.4s,#20
  926. ror w19,w19,#24
  927. ushr v17.4s,v28.4s,#20
  928. ror w20,w20,#24
  929. ushr v21.4s,v29.4s,#20
  930. ror w21,w21,#24
  931. sli v1.4s,v24.4s,#12
  932. add w13,w13,w17
  933. sli v5.4s,v25.4s,#12
  934. add w14,w14,w19
  935. sli v9.4s,v26.4s,#12
  936. add w15,w15,w20
  937. sli v13.4s,v27.4s,#12
  938. add w16,w16,w21
  939. sli v17.4s,v28.4s,#12
  940. eor w9,w9,w13
  941. sli v21.4s,v29.4s,#12
  942. eor w10,w10,w14
  943. add v0.4s,v0.4s,v1.4s
  944. eor w11,w11,w15
  945. add v4.4s,v4.4s,v5.4s
  946. eor w12,w12,w16
  947. add v8.4s,v8.4s,v9.4s
  948. ror w9,w9,#25
  949. add v12.4s,v12.4s,v13.4s
  950. ror w10,w10,#25
  951. add v16.4s,v16.4s,v17.4s
  952. ror w11,w11,#25
  953. add v20.4s,v20.4s,v21.4s
  954. ror w12,w12,#25
  955. eor v24.16b,v3.16b,v0.16b
  956. add w5,w5,w10
  957. eor v25.16b,v7.16b,v4.16b
  958. add w6,w6,w11
  959. eor v26.16b,v11.16b,v8.16b
  960. add w7,w7,w12
  961. eor v27.16b,v15.16b,v12.16b
  962. add w8,w8,w9
  963. eor v28.16b,v19.16b,v16.16b
  964. eor w21,w21,w5
  965. eor v29.16b,v23.16b,v20.16b
  966. eor w17,w17,w6
  967. ushr v3.4s,v24.4s,#24
  968. eor w19,w19,w7
  969. ushr v7.4s,v25.4s,#24
  970. eor w20,w20,w8
  971. ushr v11.4s,v26.4s,#24
  972. ror w21,w21,#16
  973. ushr v15.4s,v27.4s,#24
  974. ror w17,w17,#16
  975. ushr v19.4s,v28.4s,#24
  976. ror w19,w19,#16
  977. ushr v23.4s,v29.4s,#24
  978. ror w20,w20,#16
  979. sli v3.4s,v24.4s,#8
  980. add w15,w15,w21
  981. sli v7.4s,v25.4s,#8
  982. add w16,w16,w17
  983. sli v11.4s,v26.4s,#8
  984. add w13,w13,w19
  985. sli v15.4s,v27.4s,#8
  986. add w14,w14,w20
  987. sli v19.4s,v28.4s,#8
  988. eor w10,w10,w15
  989. sli v23.4s,v29.4s,#8
  990. eor w11,w11,w16
  991. add v2.4s,v2.4s,v3.4s
  992. eor w12,w12,w13
  993. add v6.4s,v6.4s,v7.4s
  994. eor w9,w9,w14
  995. add v10.4s,v10.4s,v11.4s
  996. ror w10,w10,#20
  997. add v14.4s,v14.4s,v15.4s
  998. ror w11,w11,#20
  999. add v18.4s,v18.4s,v19.4s
  1000. ror w12,w12,#20
  1001. add v22.4s,v22.4s,v23.4s
  1002. ror w9,w9,#20
  1003. eor v24.16b,v1.16b,v2.16b
  1004. add w5,w5,w10
  1005. eor v25.16b,v5.16b,v6.16b
  1006. add w6,w6,w11
  1007. eor v26.16b,v9.16b,v10.16b
  1008. add w7,w7,w12
  1009. eor v27.16b,v13.16b,v14.16b
  1010. add w8,w8,w9
  1011. eor v28.16b,v17.16b,v18.16b
  1012. eor w21,w21,w5
  1013. eor v29.16b,v21.16b,v22.16b
  1014. eor w17,w17,w6
  1015. ushr v1.4s,v24.4s,#25
  1016. eor w19,w19,w7
  1017. ushr v5.4s,v25.4s,#25
  1018. eor w20,w20,w8
  1019. ushr v9.4s,v26.4s,#25
  1020. ror w21,w21,#24
  1021. ushr v13.4s,v27.4s,#25
  1022. ror w17,w17,#24
  1023. ushr v17.4s,v28.4s,#25
  1024. ror w19,w19,#24
  1025. ushr v21.4s,v29.4s,#25
  1026. ror w20,w20,#24
  1027. sli v1.4s,v24.4s,#7
  1028. add w15,w15,w21
  1029. sli v5.4s,v25.4s,#7
  1030. add w16,w16,w17
  1031. sli v9.4s,v26.4s,#7
  1032. add w13,w13,w19
  1033. sli v13.4s,v27.4s,#7
  1034. add w14,w14,w20
  1035. sli v17.4s,v28.4s,#7
  1036. eor w10,w10,w15
  1037. sli v21.4s,v29.4s,#7
  1038. eor w11,w11,w16
  1039. ext v2.16b,v2.16b,v2.16b,#8
  1040. eor w12,w12,w13
  1041. ext v6.16b,v6.16b,v6.16b,#8
  1042. eor w9,w9,w14
  1043. ext v10.16b,v10.16b,v10.16b,#8
  1044. ror w10,w10,#25
  1045. ext v14.16b,v14.16b,v14.16b,#8
  1046. ror w11,w11,#25
  1047. ext v18.16b,v18.16b,v18.16b,#8
  1048. ror w12,w12,#25
  1049. ext v22.16b,v22.16b,v22.16b,#8
  1050. ror w9,w9,#25
  1051. ext v3.16b,v3.16b,v3.16b,#12
  1052. ext v7.16b,v7.16b,v7.16b,#12
  1053. ext v11.16b,v11.16b,v11.16b,#12
  1054. ext v15.16b,v15.16b,v15.16b,#12
  1055. ext v19.16b,v19.16b,v19.16b,#12
  1056. ext v23.16b,v23.16b,v23.16b,#12
  1057. ext v1.16b,v1.16b,v1.16b,#4
  1058. ext v5.16b,v5.16b,v5.16b,#4
  1059. ext v9.16b,v9.16b,v9.16b,#4
  1060. ext v13.16b,v13.16b,v13.16b,#4
  1061. ext v17.16b,v17.16b,v17.16b,#4
  1062. ext v21.16b,v21.16b,v21.16b,#4
  1063. add v0.4s,v0.4s,v1.4s
  1064. add w5,w5,w9
  1065. add v4.4s,v4.4s,v5.4s
  1066. add w6,w6,w10
  1067. add v8.4s,v8.4s,v9.4s
  1068. add w7,w7,w11
  1069. add v12.4s,v12.4s,v13.4s
  1070. add w8,w8,w12
  1071. add v16.4s,v16.4s,v17.4s
  1072. eor w17,w17,w5
  1073. add v20.4s,v20.4s,v21.4s
  1074. eor w19,w19,w6
  1075. eor v3.16b,v3.16b,v0.16b
  1076. eor w20,w20,w7
  1077. eor v7.16b,v7.16b,v4.16b
  1078. eor w21,w21,w8
  1079. eor v11.16b,v11.16b,v8.16b
  1080. ror w17,w17,#16
  1081. eor v15.16b,v15.16b,v12.16b
  1082. ror w19,w19,#16
  1083. eor v19.16b,v19.16b,v16.16b
  1084. ror w20,w20,#16
  1085. eor v23.16b,v23.16b,v20.16b
  1086. ror w21,w21,#16
  1087. rev32 v3.8h,v3.8h
  1088. add w13,w13,w17
  1089. rev32 v7.8h,v7.8h
  1090. add w14,w14,w19
  1091. rev32 v11.8h,v11.8h
  1092. add w15,w15,w20
  1093. rev32 v15.8h,v15.8h
  1094. add w16,w16,w21
  1095. rev32 v19.8h,v19.8h
  1096. eor w9,w9,w13
  1097. rev32 v23.8h,v23.8h
  1098. eor w10,w10,w14
  1099. add v2.4s,v2.4s,v3.4s
  1100. eor w11,w11,w15
  1101. add v6.4s,v6.4s,v7.4s
  1102. eor w12,w12,w16
  1103. add v10.4s,v10.4s,v11.4s
  1104. ror w9,w9,#20
  1105. add v14.4s,v14.4s,v15.4s
  1106. ror w10,w10,#20
  1107. add v18.4s,v18.4s,v19.4s
  1108. ror w11,w11,#20
  1109. add v22.4s,v22.4s,v23.4s
  1110. ror w12,w12,#20
  1111. eor v24.16b,v1.16b,v2.16b
  1112. add w5,w5,w9
  1113. eor v25.16b,v5.16b,v6.16b
  1114. add w6,w6,w10
  1115. eor v26.16b,v9.16b,v10.16b
  1116. add w7,w7,w11
  1117. eor v27.16b,v13.16b,v14.16b
  1118. add w8,w8,w12
  1119. eor v28.16b,v17.16b,v18.16b
  1120. eor w17,w17,w5
  1121. eor v29.16b,v21.16b,v22.16b
  1122. eor w19,w19,w6
  1123. ushr v1.4s,v24.4s,#20
  1124. eor w20,w20,w7
  1125. ushr v5.4s,v25.4s,#20
  1126. eor w21,w21,w8
  1127. ushr v9.4s,v26.4s,#20
  1128. ror w17,w17,#24
  1129. ushr v13.4s,v27.4s,#20
  1130. ror w19,w19,#24
  1131. ushr v17.4s,v28.4s,#20
  1132. ror w20,w20,#24
  1133. ushr v21.4s,v29.4s,#20
  1134. ror w21,w21,#24
  1135. sli v1.4s,v24.4s,#12
  1136. add w13,w13,w17
  1137. sli v5.4s,v25.4s,#12
  1138. add w14,w14,w19
  1139. sli v9.4s,v26.4s,#12
  1140. add w15,w15,w20
  1141. sli v13.4s,v27.4s,#12
  1142. add w16,w16,w21
  1143. sli v17.4s,v28.4s,#12
  1144. eor w9,w9,w13
  1145. sli v21.4s,v29.4s,#12
  1146. eor w10,w10,w14
  1147. add v0.4s,v0.4s,v1.4s
  1148. eor w11,w11,w15
  1149. add v4.4s,v4.4s,v5.4s
  1150. eor w12,w12,w16
  1151. add v8.4s,v8.4s,v9.4s
  1152. ror w9,w9,#25
  1153. add v12.4s,v12.4s,v13.4s
  1154. ror w10,w10,#25
  1155. add v16.4s,v16.4s,v17.4s
  1156. ror w11,w11,#25
  1157. add v20.4s,v20.4s,v21.4s
  1158. ror w12,w12,#25
  1159. eor v24.16b,v3.16b,v0.16b
  1160. add w5,w5,w10
  1161. eor v25.16b,v7.16b,v4.16b
  1162. add w6,w6,w11
  1163. eor v26.16b,v11.16b,v8.16b
  1164. add w7,w7,w12
  1165. eor v27.16b,v15.16b,v12.16b
  1166. add w8,w8,w9
  1167. eor v28.16b,v19.16b,v16.16b
  1168. eor w21,w21,w5
  1169. eor v29.16b,v23.16b,v20.16b
  1170. eor w17,w17,w6
  1171. ushr v3.4s,v24.4s,#24
  1172. eor w19,w19,w7
  1173. ushr v7.4s,v25.4s,#24
  1174. eor w20,w20,w8
  1175. ushr v11.4s,v26.4s,#24
  1176. ror w21,w21,#16
  1177. ushr v15.4s,v27.4s,#24
  1178. ror w17,w17,#16
  1179. ushr v19.4s,v28.4s,#24
  1180. ror w19,w19,#16
  1181. ushr v23.4s,v29.4s,#24
  1182. ror w20,w20,#16
  1183. sli v3.4s,v24.4s,#8
  1184. add w15,w15,w21
  1185. sli v7.4s,v25.4s,#8
  1186. add w16,w16,w17
  1187. sli v11.4s,v26.4s,#8
  1188. add w13,w13,w19
  1189. sli v15.4s,v27.4s,#8
  1190. add w14,w14,w20
  1191. sli v19.4s,v28.4s,#8
  1192. eor w10,w10,w15
  1193. sli v23.4s,v29.4s,#8
  1194. eor w11,w11,w16
  1195. add v2.4s,v2.4s,v3.4s
  1196. eor w12,w12,w13
  1197. add v6.4s,v6.4s,v7.4s
  1198. eor w9,w9,w14
  1199. add v10.4s,v10.4s,v11.4s
  1200. ror w10,w10,#20
  1201. add v14.4s,v14.4s,v15.4s
  1202. ror w11,w11,#20
  1203. add v18.4s,v18.4s,v19.4s
  1204. ror w12,w12,#20
  1205. add v22.4s,v22.4s,v23.4s
  1206. ror w9,w9,#20
  1207. eor v24.16b,v1.16b,v2.16b
  1208. add w5,w5,w10
  1209. eor v25.16b,v5.16b,v6.16b
  1210. add w6,w6,w11
  1211. eor v26.16b,v9.16b,v10.16b
  1212. add w7,w7,w12
  1213. eor v27.16b,v13.16b,v14.16b
  1214. add w8,w8,w9
  1215. eor v28.16b,v17.16b,v18.16b
  1216. eor w21,w21,w5
  1217. eor v29.16b,v21.16b,v22.16b
  1218. eor w17,w17,w6
  1219. ushr v1.4s,v24.4s,#25
  1220. eor w19,w19,w7
  1221. ushr v5.4s,v25.4s,#25
  1222. eor w20,w20,w8
  1223. ushr v9.4s,v26.4s,#25
  1224. ror w21,w21,#24
  1225. ushr v13.4s,v27.4s,#25
  1226. ror w17,w17,#24
  1227. ushr v17.4s,v28.4s,#25
  1228. ror w19,w19,#24
  1229. ushr v21.4s,v29.4s,#25
  1230. ror w20,w20,#24
  1231. sli v1.4s,v24.4s,#7
  1232. add w15,w15,w21
  1233. sli v5.4s,v25.4s,#7
  1234. add w16,w16,w17
  1235. sli v9.4s,v26.4s,#7
  1236. add w13,w13,w19
  1237. sli v13.4s,v27.4s,#7
  1238. add w14,w14,w20
  1239. sli v17.4s,v28.4s,#7
  1240. eor w10,w10,w15
  1241. sli v21.4s,v29.4s,#7
  1242. eor w11,w11,w16
  1243. ext v2.16b,v2.16b,v2.16b,#8
  1244. eor w12,w12,w13
  1245. ext v6.16b,v6.16b,v6.16b,#8
  1246. eor w9,w9,w14
  1247. ext v10.16b,v10.16b,v10.16b,#8
  1248. ror w10,w10,#25
  1249. ext v14.16b,v14.16b,v14.16b,#8
  1250. ror w11,w11,#25
  1251. ext v18.16b,v18.16b,v18.16b,#8
  1252. ror w12,w12,#25
  1253. ext v22.16b,v22.16b,v22.16b,#8
  1254. ror w9,w9,#25
  1255. ext v3.16b,v3.16b,v3.16b,#4
  1256. ext v7.16b,v7.16b,v7.16b,#4
  1257. ext v11.16b,v11.16b,v11.16b,#4
  1258. ext v15.16b,v15.16b,v15.16b,#4
  1259. ext v19.16b,v19.16b,v19.16b,#4
  1260. ext v23.16b,v23.16b,v23.16b,#4
  1261. ext v1.16b,v1.16b,v1.16b,#12
  1262. ext v5.16b,v5.16b,v5.16b,#12
  1263. ext v9.16b,v9.16b,v9.16b,#12
  1264. ext v13.16b,v13.16b,v13.16b,#12
  1265. ext v17.16b,v17.16b,v17.16b,#12
  1266. ext v21.16b,v21.16b,v21.16b,#12
  1267. cbnz x4,Loop_upper_neon
  1268. add w5,w5,w22 // accumulate key block
  1269. add x6,x6,x22,lsr#32
  1270. add w7,w7,w23
  1271. add x8,x8,x23,lsr#32
  1272. add w9,w9,w24
  1273. add x10,x10,x24,lsr#32
  1274. add w11,w11,w25
  1275. add x12,x12,x25,lsr#32
  1276. add w13,w13,w26
  1277. add x14,x14,x26,lsr#32
  1278. add w15,w15,w27
  1279. add x16,x16,x27,lsr#32
  1280. add w17,w17,w28
  1281. add x19,x19,x28,lsr#32
  1282. add w20,w20,w30
  1283. add x21,x21,x30,lsr#32
  1284. add x5,x5,x6,lsl#32 // pack
  1285. add x7,x7,x8,lsl#32
  1286. ldp x6,x8,[x1,#0] // load input
  1287. add x9,x9,x10,lsl#32
  1288. add x11,x11,x12,lsl#32
  1289. ldp x10,x12,[x1,#16]
  1290. add x13,x13,x14,lsl#32
  1291. add x15,x15,x16,lsl#32
  1292. ldp x14,x16,[x1,#32]
  1293. add x17,x17,x19,lsl#32
  1294. add x20,x20,x21,lsl#32
  1295. ldp x19,x21,[x1,#48]
  1296. add x1,x1,#64
  1297. #ifdef __ARMEB__
  1298. rev x5,x5
  1299. rev x7,x7
  1300. rev x9,x9
  1301. rev x11,x11
  1302. rev x13,x13
  1303. rev x15,x15
  1304. rev x17,x17
  1305. rev x20,x20
  1306. #endif
  1307. eor x5,x5,x6
  1308. eor x7,x7,x8
  1309. eor x9,x9,x10
  1310. eor x11,x11,x12
  1311. eor x13,x13,x14
  1312. eor x15,x15,x16
  1313. eor x17,x17,x19
  1314. eor x20,x20,x21
  1315. stp x5,x7,[x0,#0] // store output
  1316. add x28,x28,#1 // increment counter
  1317. mov w5,w22 // unpack key block
  1318. lsr x6,x22,#32
  1319. stp x9,x11,[x0,#16]
  1320. mov w7,w23
  1321. lsr x8,x23,#32
  1322. stp x13,x15,[x0,#32]
  1323. mov w9,w24
  1324. lsr x10,x24,#32
  1325. stp x17,x20,[x0,#48]
  1326. add x0,x0,#64
  1327. mov w11,w25
  1328. lsr x12,x25,#32
  1329. mov w13,w26
  1330. lsr x14,x26,#32
  1331. mov w15,w27
  1332. lsr x16,x27,#32
  1333. mov w17,w28
  1334. lsr x19,x28,#32
  1335. mov w20,w30
  1336. lsr x21,x30,#32
  1337. mov x4,#5
  // Loop_lower_neon: inner round loop that follows Loop_upper_neon.
  // x4 is the iteration counter (set to 5 by the mov just before this label);
  // each pass decrements it and the cbnz at the bottom repeats until it is 0.
  //
  // Two instruction streams are interleaved line by line:
  //  * NEON: six 4-register groups v0-v3, v4-v7, v8-v11, v12-v15, v16-v19 and
  //    v20-v23. Within a group the registers act as the a, b, c and d rows of
  //    a 4x4 state of 32-bit words; v24-v29 are scratch for the rotates.
  //    One pass performs one double round (column round, then diagonal round).
  //  * Scalar: one state in w5-w8 (a), w9-w12 (b), w13-w16 (c) and
  //    w17,w19,w20,w21 (d). One pass performs two double rounds, so the
  //    scalar state advances twice as fast as the vector states.
  //
  // Rotates: scalar ror by 16/20/24/25 equals rotate-left by 16/12/8/7.
  // Vector rotate-left by 16 is rev32 on .8h; rotate-left by 12/8/7 is ushr by
  // 20/24/25 into the row register followed by sli by 12/8/7 from the scratch.
  //
  // No instruction in the loop writes the condition flags or accesses memory.
  1338. Loop_lower_neon:
  1339. sub x4,x4,#1 // one pass fewer to go (plain sub: flags untouched)
  // ---- first half: vector column round; scalar column round, then scalar diagonal round ----
  1340. add v0.4s,v0.4s,v1.4s // vector: a += b (all six groups)
  1341. add w5,w5,w9 // scalar column round: a += b
  1342. add v4.4s,v4.4s,v5.4s
  1343. add w6,w6,w10
  1344. add v8.4s,v8.4s,v9.4s
  1345. add w7,w7,w11
  1346. add v12.4s,v12.4s,v13.4s
  1347. add w8,w8,w12
  1348. add v16.4s,v16.4s,v17.4s
  1349. eor w17,w17,w5 // scalar: d ^= a
  1350. add v20.4s,v20.4s,v21.4s
  1351. eor w19,w19,w6
  1352. eor v3.16b,v3.16b,v0.16b // vector: d ^= a
  1353. eor w20,w20,w7
  1354. eor v7.16b,v7.16b,v4.16b
  1355. eor w21,w21,w8
  1356. eor v11.16b,v11.16b,v8.16b
  1357. ror w17,w17,#16 // scalar: d = rotl(d,16)
  1358. eor v15.16b,v15.16b,v12.16b
  1359. ror w19,w19,#16
  1360. eor v19.16b,v19.16b,v16.16b
  1361. ror w20,w20,#16
  1362. eor v23.16b,v23.16b,v20.16b
  1363. ror w21,w21,#16
  1364. rev32 v3.8h,v3.8h // vector: d = rotl(d,16) by swapping 16-bit halves
  1365. add w13,w13,w17 // scalar: c += d
  1366. rev32 v7.8h,v7.8h
  1367. add w14,w14,w19
  1368. rev32 v11.8h,v11.8h
  1369. add w15,w15,w20
  1370. rev32 v15.8h,v15.8h
  1371. add w16,w16,w21
  1372. rev32 v19.8h,v19.8h
  1373. eor w9,w9,w13 // scalar: b ^= c
  1374. rev32 v23.8h,v23.8h
  1375. eor w10,w10,w14
  1376. add v2.4s,v2.4s,v3.4s // vector: c += d
  1377. eor w11,w11,w15
  1378. add v6.4s,v6.4s,v7.4s
  1379. eor w12,w12,w16
  1380. add v10.4s,v10.4s,v11.4s
  1381. ror w9,w9,#20 // scalar: b = rotl(b,12)
  1382. add v14.4s,v14.4s,v15.4s
  1383. ror w10,w10,#20
  1384. add v18.4s,v18.4s,v19.4s
  1385. ror w11,w11,#20
  1386. add v22.4s,v22.4s,v23.4s
  1387. ror w12,w12,#20
  1388. eor v24.16b,v1.16b,v2.16b // vector: scratch = b ^ c
  1389. add w5,w5,w9 // scalar: a += b
  1390. eor v25.16b,v5.16b,v6.16b
  1391. add w6,w6,w10
  1392. eor v26.16b,v9.16b,v10.16b
  1393. add w7,w7,w11
  1394. eor v27.16b,v13.16b,v14.16b
  1395. add w8,w8,w12
  1396. eor v28.16b,v17.16b,v18.16b
  1397. eor w17,w17,w5 // scalar: d ^= a
  1398. eor v29.16b,v21.16b,v22.16b
  1399. eor w19,w19,w6
  1400. ushr v1.4s,v24.4s,#20 // vector: b = scratch >> 20 ...
  1401. eor w20,w20,w7
  1402. ushr v5.4s,v25.4s,#20
  1403. eor w21,w21,w8
  1404. ushr v9.4s,v26.4s,#20
  1405. ror w17,w17,#24 // scalar: d = rotl(d,8)
  1406. ushr v13.4s,v27.4s,#20
  1407. ror w19,w19,#24
  1408. ushr v17.4s,v28.4s,#20
  1409. ror w20,w20,#24
  1410. ushr v21.4s,v29.4s,#20
  1411. ror w21,w21,#24
  1412. sli v1.4s,v24.4s,#12 // ... then insert scratch << 12, giving b = rotl(b ^ c,12)
  1413. add w13,w13,w17 // scalar: c += d
  1414. sli v5.4s,v25.4s,#12
  1415. add w14,w14,w19
  1416. sli v9.4s,v26.4s,#12
  1417. add w15,w15,w20
  1418. sli v13.4s,v27.4s,#12
  1419. add w16,w16,w21
  1420. sli v17.4s,v28.4s,#12
  1421. eor w9,w9,w13 // scalar: b ^= c
  1422. sli v21.4s,v29.4s,#12
  1423. eor w10,w10,w14
  1424. add v0.4s,v0.4s,v1.4s // vector: a += b
  1425. eor w11,w11,w15
  1426. add v4.4s,v4.4s,v5.4s
  1427. eor w12,w12,w16
  1428. add v8.4s,v8.4s,v9.4s
  1429. ror w9,w9,#25 // scalar: b = rotl(b,7), column round complete
  1430. add v12.4s,v12.4s,v13.4s
  1431. ror w10,w10,#25
  1432. add v16.4s,v16.4s,v17.4s
  1433. ror w11,w11,#25
  1434. add v20.4s,v20.4s,v21.4s
  1435. ror w12,w12,#25
  1436. eor v24.16b,v3.16b,v0.16b // vector: scratch = d ^ a
  1437. add w5,w5,w10 // scalar diagonal round on (a[i], b[i+1], c[i+2], d[i+3]), indices mod 4
  1438. eor v25.16b,v7.16b,v4.16b
  1439. add w6,w6,w11
  1440. eor v26.16b,v11.16b,v8.16b
  1441. add w7,w7,w12
  1442. eor v27.16b,v15.16b,v12.16b
  1443. add w8,w8,w9
  1444. eor v28.16b,v19.16b,v16.16b
  1445. eor w21,w21,w5
  1446. eor v29.16b,v23.16b,v20.16b
  1447. eor w17,w17,w6
  1448. ushr v3.4s,v24.4s,#24 // vector: d = rotl(d ^ a,8) via ushr 24 + sli 8
  1449. eor w19,w19,w7
  1450. ushr v7.4s,v25.4s,#24
  1451. eor w20,w20,w8
  1452. ushr v11.4s,v26.4s,#24
  1453. ror w21,w21,#16
  1454. ushr v15.4s,v27.4s,#24
  1455. ror w17,w17,#16
  1456. ushr v19.4s,v28.4s,#24
  1457. ror w19,w19,#16
  1458. ushr v23.4s,v29.4s,#24
  1459. ror w20,w20,#16
  1460. sli v3.4s,v24.4s,#8
  1461. add w15,w15,w21
  1462. sli v7.4s,v25.4s,#8
  1463. add w16,w16,w17
  1464. sli v11.4s,v26.4s,#8
  1465. add w13,w13,w19
  1466. sli v15.4s,v27.4s,#8
  1467. add w14,w14,w20
  1468. sli v19.4s,v28.4s,#8
  1469. eor w10,w10,w15
  1470. sli v23.4s,v29.4s,#8
  1471. eor w11,w11,w16
  1472. add v2.4s,v2.4s,v3.4s // vector: c += d
  1473. eor w12,w12,w13
  1474. add v6.4s,v6.4s,v7.4s
  1475. eor w9,w9,w14
  1476. add v10.4s,v10.4s,v11.4s
  1477. ror w10,w10,#20
  1478. add v14.4s,v14.4s,v15.4s
  1479. ror w11,w11,#20
  1480. add v18.4s,v18.4s,v19.4s
  1481. ror w12,w12,#20
  1482. add v22.4s,v22.4s,v23.4s
  1483. ror w9,w9,#20
  1484. eor v24.16b,v1.16b,v2.16b // vector: scratch = b ^ c
  1485. add w5,w5,w10
  1486. eor v25.16b,v5.16b,v6.16b
  1487. add w6,w6,w11
  1488. eor v26.16b,v9.16b,v10.16b
  1489. add w7,w7,w12
  1490. eor v27.16b,v13.16b,v14.16b
  1491. add w8,w8,w9
  1492. eor v28.16b,v17.16b,v18.16b
  1493. eor w21,w21,w5
  1494. eor v29.16b,v21.16b,v22.16b
  1495. eor w17,w17,w6
  1496. ushr v1.4s,v24.4s,#25 // vector: b = rotl(b ^ c,7) via ushr 25 + sli 7
  1497. eor w19,w19,w7
  1498. ushr v5.4s,v25.4s,#25
  1499. eor w20,w20,w8
  1500. ushr v9.4s,v26.4s,#25
  1501. ror w21,w21,#24
  1502. ushr v13.4s,v27.4s,#25
  1503. ror w17,w17,#24
  1504. ushr v17.4s,v28.4s,#25
  1505. ror w19,w19,#24
  1506. ushr v21.4s,v29.4s,#25
  1507. ror w20,w20,#24
  1508. sli v1.4s,v24.4s,#7
  1509. add w15,w15,w21
  1510. sli v5.4s,v25.4s,#7
  1511. add w16,w16,w17
  1512. sli v9.4s,v26.4s,#7
  1513. add w13,w13,w19
  1514. sli v13.4s,v27.4s,#7
  1515. add w14,w14,w20
  1516. sli v17.4s,v28.4s,#7
  1517. eor w10,w10,w15
  1518. sli v21.4s,v29.4s,#7
  1519. eor w11,w11,w16
  1520. ext v2.16b,v2.16b,v2.16b,#8 // vector: rotate c rows by two lanes to line up the diagonals
  1521. eor w12,w12,w13
  1522. ext v6.16b,v6.16b,v6.16b,#8
  1523. eor w9,w9,w14
  1524. ext v10.16b,v10.16b,v10.16b,#8
  1525. ror w10,w10,#25
  1526. ext v14.16b,v14.16b,v14.16b,#8
  1527. ror w11,w11,#25
  1528. ext v18.16b,v18.16b,v18.16b,#8
  1529. ror w12,w12,#25
  1530. ext v22.16b,v22.16b,v22.16b,#8
  1531. ror w9,w9,#25
  1532. ext v3.16b,v3.16b,v3.16b,#12 // vector: d rows, lane i takes old lane (i+3) mod 4
  1533. ext v7.16b,v7.16b,v7.16b,#12
  1534. ext v11.16b,v11.16b,v11.16b,#12
  1535. ext v15.16b,v15.16b,v15.16b,#12
  1536. ext v19.16b,v19.16b,v19.16b,#12
  1537. ext v23.16b,v23.16b,v23.16b,#12
  1538. ext v1.16b,v1.16b,v1.16b,#4 // vector: b rows, lane i takes old lane (i+1) mod 4
  1539. ext v5.16b,v5.16b,v5.16b,#4
  1540. ext v9.16b,v9.16b,v9.16b,#4
  1541. ext v13.16b,v13.16b,v13.16b,#4
  1542. ext v17.16b,v17.16b,v17.16b,#4
  1543. ext v21.16b,v21.16b,v21.16b,#4
  // ---- second half: vector diagonal round (same lane-wise steps on the rotated rows);
  // ---- scalar runs another column round followed by another diagonal round ----
  1544. add v0.4s,v0.4s,v1.4s // vector: a += b on the rotated rows
  1545. add w5,w5,w9 // scalar column round again: a += b
  1546. add v4.4s,v4.4s,v5.4s
  1547. add w6,w6,w10
  1548. add v8.4s,v8.4s,v9.4s
  1549. add w7,w7,w11
  1550. add v12.4s,v12.4s,v13.4s
  1551. add w8,w8,w12
  1552. add v16.4s,v16.4s,v17.4s
  1553. eor w17,w17,w5
  1554. add v20.4s,v20.4s,v21.4s
  1555. eor w19,w19,w6
  1556. eor v3.16b,v3.16b,v0.16b
  1557. eor w20,w20,w7
  1558. eor v7.16b,v7.16b,v4.16b
  1559. eor w21,w21,w8
  1560. eor v11.16b,v11.16b,v8.16b
  1561. ror w17,w17,#16
  1562. eor v15.16b,v15.16b,v12.16b
  1563. ror w19,w19,#16
  1564. eor v19.16b,v19.16b,v16.16b
  1565. ror w20,w20,#16
  1566. eor v23.16b,v23.16b,v20.16b
  1567. ror w21,w21,#16
  1568. rev32 v3.8h,v3.8h
  1569. add w13,w13,w17
  1570. rev32 v7.8h,v7.8h
  1571. add w14,w14,w19
  1572. rev32 v11.8h,v11.8h
  1573. add w15,w15,w20
  1574. rev32 v15.8h,v15.8h
  1575. add w16,w16,w21
  1576. rev32 v19.8h,v19.8h
  1577. eor w9,w9,w13
  1578. rev32 v23.8h,v23.8h
  1579. eor w10,w10,w14
  1580. add v2.4s,v2.4s,v3.4s
  1581. eor w11,w11,w15
  1582. add v6.4s,v6.4s,v7.4s
  1583. eor w12,w12,w16
  1584. add v10.4s,v10.4s,v11.4s
  1585. ror w9,w9,#20
  1586. add v14.4s,v14.4s,v15.4s
  1587. ror w10,w10,#20
  1588. add v18.4s,v18.4s,v19.4s
  1589. ror w11,w11,#20
  1590. add v22.4s,v22.4s,v23.4s
  1591. ror w12,w12,#20
  1592. eor v24.16b,v1.16b,v2.16b
  1593. add w5,w5,w9
  1594. eor v25.16b,v5.16b,v6.16b
  1595. add w6,w6,w10
  1596. eor v26.16b,v9.16b,v10.16b
  1597. add w7,w7,w11
  1598. eor v27.16b,v13.16b,v14.16b
  1599. add w8,w8,w12
  1600. eor v28.16b,v17.16b,v18.16b
  1601. eor w17,w17,w5
  1602. eor v29.16b,v21.16b,v22.16b
  1603. eor w19,w19,w6
  1604. ushr v1.4s,v24.4s,#20
  1605. eor w20,w20,w7
  1606. ushr v5.4s,v25.4s,#20
  1607. eor w21,w21,w8
  1608. ushr v9.4s,v26.4s,#20
  1609. ror w17,w17,#24
  1610. ushr v13.4s,v27.4s,#20
  1611. ror w19,w19,#24
  1612. ushr v17.4s,v28.4s,#20
  1613. ror w20,w20,#24
  1614. ushr v21.4s,v29.4s,#20
  1615. ror w21,w21,#24
  1616. sli v1.4s,v24.4s,#12
  1617. add w13,w13,w17
  1618. sli v5.4s,v25.4s,#12
  1619. add w14,w14,w19
  1620. sli v9.4s,v26.4s,#12
  1621. add w15,w15,w20
  1622. sli v13.4s,v27.4s,#12
  1623. add w16,w16,w21
  1624. sli v17.4s,v28.4s,#12
  1625. eor w9,w9,w13
  1626. sli v21.4s,v29.4s,#12
  1627. eor w10,w10,w14
  1628. add v0.4s,v0.4s,v1.4s
  1629. eor w11,w11,w15
  1630. add v4.4s,v4.4s,v5.4s
  1631. eor w12,w12,w16
  1632. add v8.4s,v8.4s,v9.4s
  1633. ror w9,w9,#25
  1634. add v12.4s,v12.4s,v13.4s
  1635. ror w10,w10,#25
  1636. add v16.4s,v16.4s,v17.4s
  1637. ror w11,w11,#25
  1638. add v20.4s,v20.4s,v21.4s
  1639. ror w12,w12,#25
  1640. eor v24.16b,v3.16b,v0.16b
  1641. add w5,w5,w10 // scalar diagonal round again
  1642. eor v25.16b,v7.16b,v4.16b
  1643. add w6,w6,w11
  1644. eor v26.16b,v11.16b,v8.16b
  1645. add w7,w7,w12
  1646. eor v27.16b,v15.16b,v12.16b
  1647. add w8,w8,w9
  1648. eor v28.16b,v19.16b,v16.16b
  1649. eor w21,w21,w5
  1650. eor v29.16b,v23.16b,v20.16b
  1651. eor w17,w17,w6
  1652. ushr v3.4s,v24.4s,#24
  1653. eor w19,w19,w7
  1654. ushr v7.4s,v25.4s,#24
  1655. eor w20,w20,w8
  1656. ushr v11.4s,v26.4s,#24
  1657. ror w21,w21,#16
  1658. ushr v15.4s,v27.4s,#24
  1659. ror w17,w17,#16
  1660. ushr v19.4s,v28.4s,#24
  1661. ror w19,w19,#16
  1662. ushr v23.4s,v29.4s,#24
  1663. ror w20,w20,#16
  1664. sli v3.4s,v24.4s,#8
  1665. add w15,w15,w21
  1666. sli v7.4s,v25.4s,#8
  1667. add w16,w16,w17
  1668. sli v11.4s,v26.4s,#8
  1669. add w13,w13,w19
  1670. sli v15.4s,v27.4s,#8
  1671. add w14,w14,w20
  1672. sli v19.4s,v28.4s,#8
  1673. eor w10,w10,w15
  1674. sli v23.4s,v29.4s,#8
  1675. eor w11,w11,w16
  1676. add v2.4s,v2.4s,v3.4s
  1677. eor w12,w12,w13
  1678. add v6.4s,v6.4s,v7.4s
  1679. eor w9,w9,w14
  1680. add v10.4s,v10.4s,v11.4s
  1681. ror w10,w10,#20
  1682. add v14.4s,v14.4s,v15.4s
  1683. ror w11,w11,#20
  1684. add v18.4s,v18.4s,v19.4s
  1685. ror w12,w12,#20
  1686. add v22.4s,v22.4s,v23.4s
  1687. ror w9,w9,#20
  1688. eor v24.16b,v1.16b,v2.16b
  1689. add w5,w5,w10
  1690. eor v25.16b,v5.16b,v6.16b
  1691. add w6,w6,w11
  1692. eor v26.16b,v9.16b,v10.16b
  1693. add w7,w7,w12
  1694. eor v27.16b,v13.16b,v14.16b
  1695. add w8,w8,w9
  1696. eor v28.16b,v17.16b,v18.16b
  1697. eor w21,w21,w5
  1698. eor v29.16b,v21.16b,v22.16b
  1699. eor w17,w17,w6
  1700. ushr v1.4s,v24.4s,#25
  1701. eor w19,w19,w7
  1702. ushr v5.4s,v25.4s,#25
  1703. eor w20,w20,w8
  1704. ushr v9.4s,v26.4s,#25
  1705. ror w21,w21,#24
  1706. ushr v13.4s,v27.4s,#25
  1707. ror w17,w17,#24
  1708. ushr v17.4s,v28.4s,#25
  1709. ror w19,w19,#24
  1710. ushr v21.4s,v29.4s,#25
  1711. ror w20,w20,#24
  1712. sli v1.4s,v24.4s,#7
  1713. add w15,w15,w21
  1714. sli v5.4s,v25.4s,#7
  1715. add w16,w16,w17
  1716. sli v9.4s,v26.4s,#7
  1717. add w13,w13,w19
  1718. sli v13.4s,v27.4s,#7
  1719. add w14,w14,w20
  1720. sli v17.4s,v28.4s,#7
  1721. eor w10,w10,w15
  1722. sli v21.4s,v29.4s,#7
  1723. eor w11,w11,w16
  1724. ext v2.16b,v2.16b,v2.16b,#8 // vector: rotate c rows by two lanes again, back to the original order
  1725. eor w12,w12,w13
  1726. ext v6.16b,v6.16b,v6.16b,#8
  1727. eor w9,w9,w14
  1728. ext v10.16b,v10.16b,v10.16b,#8
  1729. ror w10,w10,#25
  1730. ext v14.16b,v14.16b,v14.16b,#8
  1731. ror w11,w11,#25
  1732. ext v18.16b,v18.16b,v18.16b,#8
  1733. ror w12,w12,#25
  1734. ext v22.16b,v22.16b,v22.16b,#8
  1735. ror w9,w9,#25
  1736. ext v3.16b,v3.16b,v3.16b,#4 // vector: undo the 12-byte rotation of the d rows
  1737. ext v7.16b,v7.16b,v7.16b,#4
  1738. ext v11.16b,v11.16b,v11.16b,#4
  1739. ext v15.16b,v15.16b,v15.16b,#4
  1740. ext v19.16b,v19.16b,v19.16b,#4
  1741. ext v23.16b,v23.16b,v23.16b,#4
  1742. ext v1.16b,v1.16b,v1.16b,#12 // vector: undo the 4-byte rotation of the b rows
  1743. ext v5.16b,v5.16b,v5.16b,#12
  1744. ext v9.16b,v9.16b,v9.16b,#12
  1745. ext v13.16b,v13.16b,v13.16b,#12
  1746. ext v17.16b,v17.16b,v17.16b,#12
  1747. ext v21.16b,v21.16b,v21.16b,#12
  1748. cbnz x4,Loop_lower_neon // repeat until the counter in x4 reaches zero
  // Straight-line code reached when Loop_lower_neon falls through (x4 == 0).
  // It finishes the scalar state and the six vector states: the saved key
  // block is added back in, the result is XORed with 7 x 64 bytes read from
  // [x1] and 7 x 64 bytes are written to [x0]. It then steps the counters and
  // picks the next code path from the byte count in x2.
  // x22-x28 and x30 hold the scalar key block, two 32-bit words per register;
  // [sp,#0..95] is the off-load area that holds vector rows for v24-v29.
  1749. add w5,w5,w22 // accumulate key block
  1750. ldp q24,q25,[sp,#0] // reload v24-v29 from the off-load area
  1751. add x6,x6,x22,lsr#32
  1752. ldp q26,q27,[sp,#32]
  1753. add w7,w7,w23
  1754. ldp q28,q29,[sp,#64]
  1755. add x8,x8,x23,lsr#32
  1756. add v0.4s,v0.4s,v24.4s // a rows of all six groups += v24
  1757. add w9,w9,w24
  1758. add v4.4s,v4.4s,v24.4s
  1759. add x10,x10,x24,lsr#32
  1760. add v8.4s,v8.4s,v24.4s
  1761. add w11,w11,w25
  1762. add v12.4s,v12.4s,v24.4s
  1763. add x12,x12,x25,lsr#32
  1764. add v16.4s,v16.4s,v24.4s
  1765. add w13,w13,w26
  1766. add v20.4s,v20.4s,v24.4s
  1767. add x14,x14,x26,lsr#32
  1768. add v2.4s,v2.4s,v26.4s // c rows of all six groups += v26
  1769. add w15,w15,w27
  1770. add v6.4s,v6.4s,v26.4s
  1771. add x16,x16,x27,lsr#32
  1772. add v10.4s,v10.4s,v26.4s
  1773. add w17,w17,w28
  1774. add v14.4s,v14.4s,v26.4s
  1775. add x19,x19,x28,lsr#32
  1776. add v18.4s,v18.4s,v26.4s
  1777. add w20,w20,w30
  1778. add v22.4s,v22.4s,v26.4s
  1779. add x21,x21,x30,lsr#32
  1780. add v19.4s,v19.4s,v31.4s // +4
  1781. add x5,x5,x6,lsl#32 // pack
  1782. add v23.4s,v23.4s,v31.4s // +4
  1783. add x7,x7,x8,lsl#32
  1784. add v3.4s,v3.4s,v27.4s // d rows: groups 1-4 add v27-v30, groups 5-6 add v27/v28 on top of the +4
  1785. ldp x6,x8,[x1,#0] // load input
  1786. add v7.4s,v7.4s,v28.4s
  1787. add x9,x9,x10,lsl#32
  1788. add v11.4s,v11.4s,v29.4s
  1789. add x11,x11,x12,lsl#32
  1790. add v15.4s,v15.4s,v30.4s
  1791. ldp x10,x12,[x1,#16]
  1792. add v19.4s,v19.4s,v27.4s
  1793. add x13,x13,x14,lsl#32
  1794. add v23.4s,v23.4s,v28.4s
  1795. add x15,x15,x16,lsl#32
  1796. add v1.4s,v1.4s,v25.4s // b rows of all six groups += v25
  1797. ldp x14,x16,[x1,#32]
  1798. add v5.4s,v5.4s,v25.4s
  1799. add x17,x17,x19,lsl#32
  1800. add v9.4s,v9.4s,v25.4s
  1801. add x20,x20,x21,lsl#32
  1802. add v13.4s,v13.4s,v25.4s
  1803. ldp x19,x21,[x1,#48]
  1804. add v17.4s,v17.4s,v25.4s
  1805. add x1,x1,#64 // input pointer past the 64 bytes just loaded
  1806. add v21.4s,v21.4s,v25.4s
  // Big-endian builds only: byte-reverse the packed scalar words before the XOR.
  1807. #ifdef __ARMEB__
  1808. rev x5,x5
  1809. rev x7,x7
  1810. rev x9,x9
  1811. rev x11,x11
  1812. rev x13,x13
  1813. rev x15,x15
  1814. rev x17,x17
  1815. rev x20,x20
  1816. #endif
  1817. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 // next 64 input bytes, for the first vector group
  1818. eor x5,x5,x6 // scalar block ^= input
  1819. eor x7,x7,x8
  1820. eor x9,x9,x10
  1821. eor x11,x11,x12
  1822. eor x13,x13,x14
  1823. eor v0.16b,v0.16b,v24.16b // first vector group ^= input
  1824. eor x15,x15,x16
  1825. eor v1.16b,v1.16b,v25.16b
  1826. eor x17,x17,x19
  1827. eor v2.16b,v2.16b,v26.16b
  1828. eor x20,x20,x21
  1829. eor v3.16b,v3.16b,v27.16b
  1830. ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 // input for the second vector group
  1831. stp x5,x7,[x0,#0] // store output
  1832. add x28,x28,#7 // increment counter
  1833. stp x9,x11,[x0,#16]
  1834. stp x13,x15,[x0,#32]
  1835. stp x17,x20,[x0,#48]
  1836. add x0,x0,#64
  1837. st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 // store first vector group
  1838. ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 // v0-v3 are stored, reuse them as input buffer
  1839. eor v4.16b,v4.16b,v24.16b
  1840. eor v5.16b,v5.16b,v25.16b
  1841. eor v6.16b,v6.16b,v26.16b
  1842. eor v7.16b,v7.16b,v27.16b
  1843. st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
  1844. ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
  1845. eor v8.16b,v8.16b,v0.16b
  1846. ldp q24,q25,[sp,#0] // v24-v27 held input bytes: restore them from the off-load area
  1847. eor v9.16b,v9.16b,v1.16b
  1848. ldp q26,q27,[sp,#32]
  1849. eor v10.16b,v10.16b,v2.16b
  1850. eor v11.16b,v11.16b,v3.16b
  1851. st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
  1852. ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
  1853. eor v12.16b,v12.16b,v4.16b
  1854. eor v13.16b,v13.16b,v5.16b
  1855. eor v14.16b,v14.16b,v6.16b
  1856. eor v15.16b,v15.16b,v7.16b
  1857. st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
  1858. ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
  1859. eor v16.16b,v16.16b,v8.16b
  1860. eor v17.16b,v17.16b,v9.16b
  1861. eor v18.16b,v18.16b,v10.16b
  1862. eor v19.16b,v19.16b,v11.16b
  1863. st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
  1864. shl v0.4s,v31.4s,#1 // 4 -> 8
  1865. eor v20.16b,v20.16b,v12.16b
  1866. eor v21.16b,v21.16b,v13.16b
  1867. eor v22.16b,v22.16b,v14.16b
  1868. eor v23.16b,v23.16b,v15.16b
  1869. st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
  1870. add v27.4s,v27.4s,v0.4s // += 8
  1871. add v28.4s,v28.4s,v0.4s
  1872. add v29.4s,v29.4s,v0.4s
  1873. add v30.4s,v30.4s,v0.4s
  // Nothing from Loop_lower_neon down to here writes the condition flags, so
  // this branch tests flags set earlier.
  // NOTE(review): presumably a subs on x2 at the top of the outer loop - confirm.
  1874. b.hs Loop_outer_512_neon
  1875. adds x2,x2,#512 // x2 += 512 and set flags; Z means x2 is now zero
  1876. ushr v0.4s,v31.4s,#2 // 4 -> 1
  1877. ldp d8,d9,[sp,#128+0] // meet ABI requirements
  1878. ldp d10,d11,[sp,#128+16]
  1879. ldp d12,d13,[sp,#128+32]
  1880. ldp d14,d15,[sp,#128+48]
  1881. stp q24,q31,[sp,#0] // wipe off-load area
  1882. stp q24,q31,[sp,#32]
  1883. stp q24,q31,[sp,#64]
  1884. b.eq Ldone_512_neon // flags still come from the adds: x2 == 0, nothing left
  1885. cmp x2,#192
  1886. sub v27.4s,v27.4s,v0.4s // -= 1
  1887. sub v28.4s,v28.4s,v0.4s
  1888. sub v29.4s,v29.4s,v0.4s
  1889. add sp,sp,#128 // release the 128 bytes below the d8-d15 save slots
  1890. b.hs Loop_outer_neon // x2 >= 192 (unsigned): continue at Loop_outer_neon
  // Fewer than 192 bytes left: clear v25-v30 and continue at Loop_outer.
  // NOTE(review): presumably clears key and counter rows before leaving the NEON path - confirm.
  1891. eor v25.16b,v25.16b,v25.16b
  1892. eor v26.16b,v26.16b,v26.16b
  1893. eor v27.16b,v27.16b,v27.16b
  1894. eor v28.16b,v28.16b,v28.16b
  1895. eor v29.16b,v29.16b,v29.16b
  1896. eor v30.16b,v30.16b,v30.16b
  1897. b Loop_outer
  // Ldone_512_neon: exit path taken by the b.eq above once x2 has reached zero.
  // Reloads x19-x28 from the frame addressed by x29, releases the stack and
  // returns. d8-d15 were already reloaded before the branch to this label.
  1898. Ldone_512_neon:
  1899. ldp x19,x20,[x29,#16] // x29-relative loads, so the sp update below does not affect them
  1900. add sp,sp,#128+64 // release 128+64 bytes of stack
  1901. ldp x21,x22,[x29,#32]
  1902. ldp x23,x24,[x29,#48]
  1903. ldp x25,x26,[x29,#64]
  1904. ldp x27,x28,[x29,#80]
  1905. ldp x29,x30,[sp],#96 // reload x29 and x30, then pop 96 bytes (post-index)
  // Macro from <openssl/arm_arch.h>.
  // NOTE(review): presumably checks the return address signed at function entry - confirm.
  1906. AARCH64_VALIDATE_LINK_REGISTER
  1907. ret
  1908. #endif // !OPENSSL_NO_ASM