// aesv8-armx64.S — hardware-accelerated AES (ARMv8 Crypto Extensions) for AArch64.
  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(__aarch64__)
  11. #if defined(BORINGSSL_PREFIX)
  12. #include <boringssl_prefix_symbols_asm.h>
  13. #endif
  14. #include <openssl/arm_arch.h>
  15. #if __ARM_MAX_ARCH__>=7
.text
.arch	armv8-a+crypto
.section	.rodata
.align	5
// Key-expansion constants, addressed below via adrp/add :lo12:.
Lrcon:
.long	0x01,0x01,0x01,0x01	// initial rcon; doubled each round with shl
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b	// rcon after wraparound in GF(2^8) (0x80*2 -> 0x1b)
.text
.globl	aes_hw_set_encrypt_key
.def aes_hw_set_encrypt_key
.type 32
.endef
.align	5
// int aes_hw_set_encrypt_key(const uint8_t *user_key, unsigned bits,
//                            AES_KEY *key)
// In:   x0 = user key, w1 = key length in bits, x2 = output key schedule.
// Out:  x0 = 0 on success, -1 if x0 or x2 is NULL, -2 if bits is not a
//       multiple of 64 in [128,256].  The round count (10/12/14) is
//       stored at key+240.
// Uses v0-v6; w12 carries the round count to Ldone.
aes_hw_set_encrypt_key:
Lenc_key:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// default return: -1 (NULL argument)
	cmp	x0,#0			// user key pointer NULL?
	b.eq	Lenc_key_abort
	cmp	x2,#0			// output pointer NULL?
	b.eq	Lenc_key_abort
	mov	x3,#-2			// return -2 for a bad bit length
	cmp	w1,#128
	b.lt	Lenc_key_abort
	cmp	w1,#256
	b.gt	Lenc_key_abort
	tst	w1,#0x3f		// must be a multiple of 64
	b.ne	Lenc_key_abort
	adrp	x3,Lrcon		// x3 -> round-constant table
	add	x3,x3,:lo12:Lrcon
	cmp	w1,#192
	eor	v0.16b,v0.16b,v0.16b	// v0 = 0: zero round key for aese, zero lane for ext
	ld1	{v3.16b},[x0],#16	// first 16 key bytes
	mov	w1,#8		// reuse w1 as the loop counter
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon, v2 = rotate-n-splat mask
	b.lt	Loop128
	b.eq	L192
	b	L256
.align	4
// AES-128 expansion: each pass derives the next round key from the
// previous one.  SubWord/RotWord are done with tbl (rotate+splat) plus
// aese against the all-zero key in v0 (SubBytes only); the ext/eor
// chain implements the sliding XOR of the schedule words.
Loop128:
	tbl	v6.16b,{v3.16b},v2.16b	// rotate-and-splat last word
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16	// emit current round key
	aese	v6.16b,v0.16b		// SubBytes (zero round key)
	subs	w1,w1,#1
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b	// add round constant
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// rcon *= 2
	eor	v3.16b,v3.16b,v6.16b
	b.ne	Loop128
	// Two final expansions with the reloaded 0x1b round constant.
	ld1	{v1.4s},[x3]
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// rcon 0x1b -> 0x36 for the last round
	eor	v3.16b,v3.16b,v6.16b
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// 11th (final) round key
	add	x2,x2,#0x50		// advance x2 to key+240 (rounds field)
	mov	w12,#10			// 10 rounds for AES-128
	b	Ldone
.align	4
// AES-192: the remaining 8 key bytes arrive in v4; the schedule grows in
// 1.5-block steps, so the splat mask is shifted down by 8 bytes.
L192:
	ld1	{v4.8b},[x0],#8
	movi	v6.16b,#8		// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8		// half-block of schedule material
	aese	v6.16b,v0.16b
	subs	w1,w1,#1
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	dup	v5.4s,v3.s[3]		// propagate last word into the 1.5th block
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1	// rcon *= 2
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	Loop192
	mov	w12,#12			// 12 rounds for AES-192
	add	x2,x2,#0x20		// advance x2 to key+240
	b	Ldone
.align	4
// AES-256: second half of the key in v4; odd iterations derive the next
// even key with rcon, even ones use a plain splat (no rcon).
L256:
	ld1	{v4.16b},[x0]
	mov	w1,#7			// 7 double-steps
	mov	w12,#14			// 14 rounds for AES-256
	st1	{v3.4s},[x2],#16
Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// rcon *= 2
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	Ldone			// counter exhausted: schedule complete
	dup	v6.4s,v3.s[3]	// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b		// SubBytes, no rotate and no rcon
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	eor	v4.16b,v4.16b,v6.16b
	b	Loop256
Ldone:
	str	w12,[x2]		// store round count at key+240
	mov	x3,#0			// success
Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16
	ret
.globl	aes_hw_set_decrypt_key
.def aes_hw_set_decrypt_key
.type 32
.endef
.align	5
// int aes_hw_set_decrypt_key(const uint8_t *user_key, unsigned bits,
//                            AES_KEY *key)
// Builds the encryption schedule via Lenc_key, then converts it in place
// for the equivalent inverse cipher: round keys are reversed end-for-end
// and every key except the outermost pair gets AESIMC (InvMixColumns).
// Returns Lenc_key's error code on failure, 0 on success (in x0).
aes_hw_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	Lenc_key		// x0 = status, w12 = rounds on success
	cmp	x0,#0
	b.ne	Ldec_key_abort		// propagate Lenc_key's error
	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// downward stride for the tail pointer
	add	x0,x2,x12,lsl#4	// end of key schedule
	// Swap first and last round keys (no AESIMC on these two).
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
Loop_imc:
	// Walk inward from both ends: InvMixColumns each key, then swap.
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	Loop_imc
	// Middle key (schedules have an odd number of keys): transform in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]
	eor	x0,x0,x0		// return value 0 (success)
Ldec_key_abort:
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.globl	aes_hw_encrypt
.def aes_hw_encrypt
.type 32
.endef
.align	5
// void aes_hw_encrypt(const uint8_t in[16], uint8_t out[16],
//                     const AES_KEY *key)
// Single-block AES encryption.  x0 = in, x1 = out, x2 = key schedule
// (round count at key+240).  Leaf function: no stack frame.
aes_hw_encrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = rounds
	ld1	{v0.4s},[x2],#16	// round key 0
	ld1	{v2.16b},[x0]		// plaintext block
	sub	w3,w3,#2		// loop body consumes two rounds per pass
	ld1	{v1.4s},[x2],#16	// round key 1
Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16	// prefetch next even round key
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16	// prefetch next odd round key
	b.gt	Loop_enc
	aese	v2.16b,v0.16b		// penultimate round
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// final round key
	aese	v2.16b,v1.16b		// last round: no MixColumns
	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey
	st1	{v2.16b},[x1]
	ret
.globl	aes_hw_decrypt
.def aes_hw_decrypt
.type 32
.endef
.align	5
// void aes_hw_decrypt(const uint8_t in[16], uint8_t out[16],
//                     const AES_KEY *key)
// Single-block AES decryption; mirror of aes_hw_encrypt using
// aesd/aesimc with the inverse schedule from aes_hw_set_decrypt_key.
// x0 = in, x1 = out, x2 = key schedule.  Leaf function: no stack frame.
aes_hw_decrypt:
	AARCH64_VALID_CALL_TARGET
	ldr	w3,[x2,#240]		// w3 = rounds
	ld1	{v0.4s},[x2],#16	// round key 0
	ld1	{v2.16b},[x0]		// ciphertext block
	sub	w3,w3,#2		// loop body consumes two rounds per pass
	ld1	{v1.4s},[x2],#16	// round key 1
Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16	// prefetch next even round key
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16	// prefetch next odd round key
	b.gt	Loop_dec
	aesd	v2.16b,v0.16b		// penultimate round
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// final round key
	aesd	v2.16b,v1.16b		// last round: no InvMixColumns
	eor	v2.16b,v2.16b,v0.16b	// final AddRoundKey
	st1	{v2.16b},[x1]
	ret
.globl	aes_hw_cbc_encrypt
.def aes_hw_cbc_encrypt
.type 32
.endef
.align	5
// void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
//                         const AES_KEY *key, uint8_t ivec[16], int enc)
// x0 = in, x1 = out, x2 = len (bytes; rounded down to whole blocks),
// x3 = key schedule, x4 = IV (updated on exit), w5 = enc flag
// (nonzero: encrypt, zero: decrypt).  len < 16 returns immediately.
// Encryption is serial (CBC chaining); decryption runs 3 blocks at a
// time with a 1-2 block tail.  The last 7 round keys stay resident in
// v18-v23/v7 while v16/v17 stream the earlier ones.
aes_hw_cbc_encrypt:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16			// input post-increment step
	b.lo	Lcbc_abort		// less than one block: nothing to do
	csel	x8,xzr,x8,eq		// exactly one block: stop advancing input
	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]		// w5 = rounds
	and	x2,x2,#-16		// whole blocks only
	ld1	{v6.16b},[x4]		// v6 = IV / running chain value
	ld1	{v0.16b},[x0],x8	// first input block
	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32	// keep the last 7 round keys
	ld1	{v20.4s,v21.4s},[x7],#32	// resident in v18-v23 and v7
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32		// x7 -> round key 2
	mov	w6,w5
	b.eq	Lcbc_dec
	// --- encryption path ---
	cmp	w5,#2			// AES-128?
	eor	v0.16b,v0.16b,v6.16b	// CBC: XOR plaintext with IV
	eor	v5.16b,v16.16b,v7.16b	// pre-fold key0 into the last key
	b.eq	Lcbc_enc128
	// 192/256-bit: stage pointers to mid-schedule keys so the loop can
	// reload v16/v17 without recomputing addresses.
	ld1	{v2.4s,v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	Lenter_cbc_enc
.align	4
Loop_cbc_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// write previous ciphertext block
Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]
	cmp	w5,#4			// AES-192?
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]
	b.eq	Lcbc_enc192
	// AES-256: two extra rounds.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]
	nop
Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block: stop advancing input
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// next plaintext block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b	// fold key0 in early (v5 = key0^lastkey)
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// ciphertext = state ^ last key
	b.hs	Loop_cbc_enc
	st1	{v6.16b},[x1],#16	// final ciphertext block
	b	Lcbc_done
.align	5
// AES-128 encryption: the whole schedule fits in registers.
Lcbc_enc128:
	ld1	{v2.4s,v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	Lenter_cbc_enc128
Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16	// write previous ciphertext block
Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block: stop advancing input
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// next plaintext block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b	// fold key0 in early
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// ciphertext = state ^ last key
	b.hs	Loop_cbc_enc128
	st1	{v6.16b},[x1],#16
	b	Lcbc_done
.align	5
// --- decryption path: interleave three blocks (v0, v1, v18) ---
Lcbc_dec:
	ld1	{v18.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b	// keep ciphertext copies for chaining
	orr	v1.16b,v0.16b,v0.16b
	orr	v19.16b,v18.16b,v18.16b
	b.lo	Lcbc_dec_tail		// fewer than 3 blocks remain
	orr	v1.16b,v18.16b,v18.16b
	ld1	{v18.16b},[x0],#16	// third block
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v19.16b,v18.16b,v18.16b
Loop3x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_cbc_dec
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	eor	v4.16b,v6.16b,v7.16b	// chain values pre-folded with last key
	subs	x2,x2,#0x30
	eor	v5.16b,v2.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v18.16b
					// are loaded with last "words"
	orr	v6.16b,v19.16b,v19.16b	// next chain value = last ciphertext
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	ld1	{v2.16b},[x0],#16	// prefetch next 3 ciphertext blocks
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aesd	v0.16b,v23.16b		// last round: no InvMixColumns
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b	// plaintext = state ^ (chain ^ last key)
	eor	v5.16b,v5.16b,v1.16b
	eor	v18.16b,v18.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b	// rotate prefetched blocks into place
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v18.16b},[x1],#16
	orr	v18.16b,v19.16b,v19.16b
	b.hs	Loop3x_cbc_dec
	cmn	x2,#0x30		// all blocks consumed?
	b.eq	Lcbc_done
	nop
// Tail: 1 or 2 blocks remain in v1/v18.
Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lcbc_dec_tail
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	cmn	x2,#0x20		// two blocks, or just one?
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	b.eq	Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b	// new IV = last ciphertext
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	Lcbc_done
Lcbc_dec_one:
	eor	v5.16b,v5.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b	// new IV = last ciphertext
	st1	{v5.16b},[x1],#16
Lcbc_done:
	st1	{v6.16b},[x4]		// write back updated IV
Lcbc_abort:
	ldr	x29,[sp],#16
	ret
.globl	aes_hw_ctr32_encrypt_blocks
.def aes_hw_ctr32_encrypt_blocks
.type 32
.endef
.align	5
// void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
//                                  size_t blocks, const AES_KEY *key,
//                                  const uint8_t ivec[16])
// x0 = in, x1 = out, x2 = number of 16-byte blocks, x3 = key schedule,
// x4 = counter block (big-endian 32-bit counter in the last word).
// Main loop keystreams 3 blocks per iteration (v0/v1/v18); 1-2 block
// inputs go straight to Lctr32_tail.  Last 5 round keys stay resident
// in v20-v23/v7.
aes_hw_ctr32_encrypt_blocks:
// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	AARCH64_VALID_CALL_TARGET
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	ldr	w5,[x3,#240]		// w5 = rounds
	ldr	w8, [x4, #12]		// w8 = big-endian counter word
	ld1	{v0.4s},[x4]		// initial counter block
	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16			// input advance for the tail's 2nd block
	cmp	x2,#2
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32		// x7 -> round key 2
	mov	w6,w5
	csel	x12,xzr,x12,lo		// single block: don't read a 2nd one
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips this bug.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
	rev	w8, w8			// counter to host (little) endian
#endif
	// Prepare counter+1 (v1) and, if >2 blocks, counter+2 (v18).
	add	w10, w8, #1
	orr	v6.16b,v0.16b,v0.16b
	rev	w10, w10
	mov	v6.s[3],w10
	add	w8, w8, #2
	orr	v1.16b,v6.16b,v6.16b
	b.ls	Lctr32_tail		// 1 or 2 blocks only
	rev	w12, w8
	mov	v6.s[3],w12
	sub	x2,x2,#3		// bias
	orr	v18.16b,v6.16b,v6.16b
	b	Loop3x_ctr32
.align	4
Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_ctr32
	// Final rounds, interleaved with input loads and next-counter setup.
	// Keystream moves to v4/v5/v17 so v0/v1/v18 can take new counters.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16	// input block 0
	add	w9,w8,#1
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16	// input block 1
	rev	w9,w9
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16	// input block 2
	mov	x7,x3			// rewind key pointer for next iteration
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b	// fold last round key into the input
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
	// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
	// 32-bit mode. See the comment above.
	eor	v19.16b,v19.16b,v7.16b
	mov	v6.s[3], w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	orr	v0.16b,v6.16b,v6.16b	// counter+0 for next iteration
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	mov	v6.s[3], w10
	rev	w12,w8
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	orr	v1.16b,v6.16b,v6.16b	// counter+1
	mov	v6.s[3], w12
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	orr	v18.16b,v6.16b,v6.16b	// counter+2
	subs	x2,x2,#3
	aese	v4.16b,v23.16b		// last round: no MixColumns
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b
	eor	v2.16b,v2.16b,v4.16b	// out = in ^ keystream
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	Loop3x_ctr32
	// Undo the bias; 0, 1 or 2 blocks remain.
	adds	x2,x2,#3
	b.eq	Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// single block: don't read a 2nd one
Lctr32_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lctr32_tail
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12	// input block 0
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]		// input block 1 (may re-read block 0)
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b	// fold last round key into the input
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b		// last round: no MixColumns
	aese	v1.16b,v23.16b
	cmp	x2,#1			// one block or two?
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	Lctr32_done
	st1	{v3.16b},[x1]
Lctr32_done:
	ldr	x29,[sp],#16
	ret
  727. #endif
  728. #endif
  729. #endif // !OPENSSL_NO_ASM