aesv8-armx64.S

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
#if __ARM_MAX_ARCH__>=7
.text
.section __TEXT,__const
.align 5
Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.text
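// Expected C prototype (a hedged sketch inferred from the register usage
// below; the authoritative declaration lives in BoringSSL's C headers):
//
//   int aes_hw_set_encrypt_key(const uint8_t *user_key,  // x0
//                              unsigned bits,            // w1: 128, 192 or 256
//                              AES_KEY *key);            // x2: key schedule out
//
// Returns 0 on success, -1 if either pointer is NULL, and -2 if the bit
// length is unsupported. The word at offset 240 of the schedule receives
// the round count (10, 12 or 14).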
.globl _aes_hw_set_encrypt_key
.private_extern _aes_hw_set_encrypt_key
.align 5
_aes_hw_set_encrypt_key:
Lenc_key:
// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
mov x3,#-1
cmp x0,#0
b.eq Lenc_key_abort
cmp x2,#0
b.eq Lenc_key_abort
mov x3,#-2
cmp w1,#128
b.lt Lenc_key_abort
cmp w1,#256
b.gt Lenc_key_abort
tst w1,#0x3f
b.ne Lenc_key_abort
adrp x3,Lrcon@PAGE
add x3,x3,Lrcon@PAGEOFF
cmp w1,#192
eor v0.16b,v0.16b,v0.16b
ld1 {v3.16b},[x0],#16
mov w1,#8 // reuse w1
ld1 {v1.4s,v2.4s},[x3],#32
b.lt Loop128
b.eq L192
b L256
.align 4
Loop128:
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
b.ne Loop128
ld1 {v1.4s},[x3]
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
tbl v6.16b,{v3.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v3.4s},[x2],#16
aese v6.16b,v0.16b
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2]
add x2,x2,#0x50
mov w12,#10
b Ldone
.align 4
L192:
ld1 {v4.8b},[x0],#8
movi v6.16b,#8 // borrow v6.16b
st1 {v3.4s},[x2],#16
sub v2.16b,v2.16b,v6.16b // adjust the mask
Loop192:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.8b},[x2],#8
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
dup v5.4s,v3.s[3]
eor v5.16b,v5.16b,v4.16b
eor v6.16b,v6.16b,v1.16b
ext v4.16b,v0.16b,v4.16b,#12
shl v1.16b,v1.16b,#1
eor v4.16b,v4.16b,v5.16b
eor v3.16b,v3.16b,v6.16b
eor v4.16b,v4.16b,v6.16b
st1 {v3.4s},[x2],#16
b.ne Loop192
mov w12,#12
add x2,x2,#0x20
b Ldone
.align 4
L256:
ld1 {v4.16b},[x0]
mov w1,#7
mov w12,#14
st1 {v3.4s},[x2],#16
Loop256:
tbl v6.16b,{v4.16b},v2.16b
ext v5.16b,v0.16b,v3.16b,#12
st1 {v4.4s},[x2],#16
aese v6.16b,v0.16b
subs w1,w1,#1
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v3.16b,v3.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v6.16b,v6.16b,v1.16b
eor v3.16b,v3.16b,v5.16b
shl v1.16b,v1.16b,#1
eor v3.16b,v3.16b,v6.16b
st1 {v3.4s},[x2],#16
b.eq Ldone
dup v6.4s,v3.s[3] // just splat
ext v5.16b,v0.16b,v4.16b,#12
aese v6.16b,v0.16b
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
ext v5.16b,v0.16b,v5.16b,#12
eor v4.16b,v4.16b,v5.16b
eor v4.16b,v4.16b,v6.16b
b Loop256
Ldone:
str w12,[x2]
mov x3,#0
Lenc_key_abort:
mov x0,x3 // return value
ldr x29,[sp],#16
ret
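// Expected C prototype (a hedged sketch; it mirrors the encrypt-key variant):
//
//   int aes_hw_set_decrypt_key(const uint8_t *user_key,  // x0
//                              unsigned bits,            // w1
//                              AES_KEY *key);            // x2
//
// Builds the encryption schedule via Lenc_key, then swaps the round keys
// end-for-end and applies AESIMC (InvMixColumns) to the inner ones so the
// result can be used by the aesd/aesimc decryption path.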
.globl _aes_hw_set_decrypt_key
.private_extern _aes_hw_set_decrypt_key
.align 5
_aes_hw_set_decrypt_key:
AARCH64_SIGN_LINK_REGISTER
stp x29,x30,[sp,#-16]!
add x29,sp,#0
bl Lenc_key
cmp x0,#0
b.ne Ldec_key_abort
sub x2,x2,#240 // restore original x2
mov x4,#-16
add x0,x2,x12,lsl#4 // end of key schedule
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
Loop_imc:
ld1 {v0.4s},[x2]
ld1 {v1.4s},[x0]
aesimc v0.16b,v0.16b
aesimc v1.16b,v1.16b
st1 {v0.4s},[x0],x4
st1 {v1.4s},[x2],#16
cmp x0,x2
b.hi Loop_imc
ld1 {v0.4s},[x2]
aesimc v0.16b,v0.16b
st1 {v0.4s},[x0]
eor x0,x0,x0 // return value
Ldec_key_abort:
ldp x29,x30,[sp],#16
AARCH64_VALIDATE_LINK_REGISTER
ret
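// Expected C prototype (a hedged sketch; single-block encryption primitive):
//
//   void aes_hw_encrypt(const uint8_t in[16],  // x0
//                       uint8_t out[16],       // x1
//                       const AES_KEY *key);   // x2: round count at offset 240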
.globl _aes_hw_encrypt
.private_extern _aes_hw_encrypt
.align 5
_aes_hw_encrypt:
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
Loop_enc:
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aese v2.16b,v1.16b
aesmc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt Loop_enc
aese v2.16b,v0.16b
aesmc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aese v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
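// Expected C prototype (a hedged sketch; single-block decryption primitive,
// expecting a schedule produced by aes_hw_set_decrypt_key):
//
//   void aes_hw_decrypt(const uint8_t in[16],  // x0
//                       uint8_t out[16],       // x1
//                       const AES_KEY *key);   // x2: round count at offset 240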
.globl _aes_hw_decrypt
.private_extern _aes_hw_decrypt
.align 5
_aes_hw_decrypt:
AARCH64_VALID_CALL_TARGET
ldr w3,[x2,#240]
ld1 {v0.4s},[x2],#16
ld1 {v2.16b},[x0]
sub w3,w3,#2
ld1 {v1.4s},[x2],#16
Loop_dec:
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2],#16
subs w3,w3,#2
aesd v2.16b,v1.16b
aesimc v2.16b,v2.16b
ld1 {v1.4s},[x2],#16
b.gt Loop_dec
aesd v2.16b,v0.16b
aesimc v2.16b,v2.16b
ld1 {v0.4s},[x2]
aesd v2.16b,v1.16b
eor v2.16b,v2.16b,v0.16b
st1 {v2.16b},[x1]
ret
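// Expected C prototype (a hedged sketch inferred from the registers consumed
// below):
//
//   void aes_hw_cbc_encrypt(const uint8_t *in,   // x0
//                           uint8_t *out,        // x1
//                           size_t length,       // x2: bytes, rounded down to 16
//                           const AES_KEY *key,  // x3
//                           uint8_t ivec[16],    // x4: updated on return
//                           int enc);            // w5: nonzero = encrypt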
.globl _aes_hw_cbc_encrypt
.private_extern _aes_hw_cbc_encrypt
.align 5
_aes_hw_cbc_encrypt:
// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
subs x2,x2,#16
mov x8,#16
b.lo Lcbc_abort
csel x8,xzr,x8,eq
cmp w5,#0 // en- or decrypting?
ldr w5,[x3,#240]
and x2,x2,#-16
ld1 {v6.16b},[x4]
ld1 {v0.16b},[x0],x8
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#6
add x7,x3,x5,lsl#4 // pointer to last 7 round keys
sub w5,w5,#2
ld1 {v18.4s,v19.4s},[x7],#32
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
b.eq Lcbc_dec
cmp w5,#2
eor v0.16b,v0.16b,v6.16b
eor v5.16b,v16.16b,v7.16b
b.eq Lcbc_enc128
ld1 {v2.4s,v3.4s},[x7]
add x7,x3,#16
add x6,x3,#16*4
add x12,x3,#16*5
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
add x14,x3,#16*6
add x3,x3,#16*7
b Lenter_cbc_enc
.align 4
Loop_cbc_enc:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
Lenter_cbc_enc:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x6]
cmp w5,#4
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x12]
b.eq Lcbc_enc192
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
ld1 {v16.4s},[x14]
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x3]
nop
Lcbc_enc192:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs Loop_cbc_enc
st1 {v6.16b},[x1],#16
b Lcbc_done
.align 5
Lcbc_enc128:
ld1 {v2.4s,v3.4s},[x7]
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
b Lenter_cbc_enc128
Loop_cbc_enc128:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
st1 {v6.16b},[x1],#16
Lenter_cbc_enc128:
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
subs x2,x2,#16
aese v0.16b,v2.16b
aesmc v0.16b,v0.16b
csel x8,xzr,x8,eq
aese v0.16b,v3.16b
aesmc v0.16b,v0.16b
aese v0.16b,v18.16b
aesmc v0.16b,v0.16b
aese v0.16b,v19.16b
aesmc v0.16b,v0.16b
ld1 {v16.16b},[x0],x8
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
eor v16.16b,v16.16b,v5.16b
aese v0.16b,v23.16b
eor v6.16b,v0.16b,v7.16b
b.hs Loop_cbc_enc128
st1 {v6.16b},[x1],#16
b Lcbc_done
.align 5
Lcbc_dec:
ld1 {v18.16b},[x0],#16
subs x2,x2,#32 // bias
add w6,w5,#2
orr v3.16b,v0.16b,v0.16b
orr v1.16b,v0.16b,v0.16b
orr v19.16b,v18.16b,v18.16b
b.lo Lcbc_dec_tail
orr v1.16b,v18.16b,v18.16b
ld1 {v18.16b},[x0],#16
orr v2.16b,v0.16b,v0.16b
orr v3.16b,v1.16b,v1.16b
orr v19.16b,v18.16b,v18.16b
Loop3x_cbc_dec:
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Loop3x_cbc_dec
aesd v0.16b,v16.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
eor v4.16b,v6.16b,v7.16b
subs x2,x2,#0x30
eor v5.16b,v2.16b,v7.16b
csel x6,x2,x6,lo // x6, w6, is zero at this point
aesd v0.16b,v17.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
add x0,x0,x6 // x0 is adjusted in such a way that
// at exit from the loop v1.16b-v18.16b
// are loaded with the last "words"
orr v6.16b,v19.16b,v19.16b
mov x7,x3
aesd v0.16b,v20.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
ld1 {v2.16b},[x0],#16
aesd v0.16b,v21.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
aesd v0.16b,v22.16b
aesimc v0.16b,v0.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
ld1 {v19.16b},[x0],#16
aesd v0.16b,v23.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
add w6,w5,#2
eor v4.16b,v4.16b,v0.16b
eor v5.16b,v5.16b,v1.16b
eor v18.16b,v18.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v4.16b},[x1],#16
orr v0.16b,v2.16b,v2.16b
st1 {v5.16b},[x1],#16
orr v1.16b,v3.16b,v3.16b
st1 {v18.16b},[x1],#16
orr v18.16b,v19.16b,v19.16b
b.hs Loop3x_cbc_dec
cmn x2,#0x30
b.eq Lcbc_done
nop
Lcbc_dec_tail:
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Lcbc_dec_tail
aesd v1.16b,v16.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v16.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v17.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v17.16b
aesimc v18.16b,v18.16b
aesd v1.16b,v20.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v20.16b
aesimc v18.16b,v18.16b
cmn x2,#0x20
aesd v1.16b,v21.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v21.16b
aesimc v18.16b,v18.16b
eor v5.16b,v6.16b,v7.16b
aesd v1.16b,v22.16b
aesimc v1.16b,v1.16b
aesd v18.16b,v22.16b
aesimc v18.16b,v18.16b
eor v17.16b,v3.16b,v7.16b
aesd v1.16b,v23.16b
aesd v18.16b,v23.16b
b.eq Lcbc_dec_one
eor v5.16b,v5.16b,v1.16b
eor v17.16b,v17.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
st1 {v17.16b},[x1],#16
b Lcbc_done
Lcbc_dec_one:
eor v5.16b,v5.16b,v18.16b
orr v6.16b,v19.16b,v19.16b
st1 {v5.16b},[x1],#16
Lcbc_done:
st1 {v6.16b},[x4]
Lcbc_abort:
ldr x29,[sp],#16
ret
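// Expected C prototype (a hedged sketch inferred from the registers consumed
// below):
//
//   void aes_hw_ctr32_encrypt_blocks(const uint8_t *in,        // x0
//                                    uint8_t *out,             // x1
//                                    size_t blocks,            // x2: 16-byte blocks
//                                    const AES_KEY *key,       // x3
//                                    const uint8_t ivec[16]);  // x4
//
// The last 32-bit word of ivec is treated as a big-endian block counter.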
.globl _aes_hw_ctr32_encrypt_blocks
.private_extern _aes_hw_ctr32_encrypt_blocks
.align 5
_aes_hw_ctr32_encrypt_blocks:
// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
AARCH64_VALID_CALL_TARGET
stp x29,x30,[sp,#-16]!
add x29,sp,#0
ldr w5,[x3,#240]
ldr w8, [x4, #12]
ld1 {v0.4s},[x4]
ld1 {v16.4s,v17.4s},[x3] // load key schedule...
sub w5,w5,#4
mov x12,#16
cmp x2,#2
add x7,x3,x5,lsl#4 // pointer to last 5 round keys
sub w5,w5,#2
ld1 {v20.4s,v21.4s},[x7],#32
ld1 {v22.4s,v23.4s},[x7],#32
ld1 {v7.4s},[x7]
add x7,x3,#32
mov w6,w5
csel x12,xzr,x12,lo
// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
// affected by silicon errata #1742098 [0] and #1655431 [1],
// respectively, where the second instruction of an aese/aesmc
// instruction pair may execute twice if an interrupt is taken right
// after the first instruction consumes an input register of which a
// single 32-bit lane has been updated the last time it was modified.
//
// This function uses a counter in one 32-bit lane. The vmov lines
// could write to v1.16b and v18.16b directly, but that trips these bugs.
// We write to v6.16b and copy to the final register as a workaround.
//
// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
#ifndef __ARMEB__
rev w8, w8
#endif
add w10, w8, #1
orr v6.16b,v0.16b,v0.16b
rev w10, w10
mov v6.s[3],w10
add w8, w8, #2
orr v1.16b,v6.16b,v6.16b
b.ls Lctr32_tail
rev w12, w8
mov v6.s[3],w12
sub x2,x2,#3 // bias
orr v18.16b,v6.16b,v6.16b
b Loop3x_ctr32
.align 4
Loop3x_ctr32:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
aese v18.16b,v17.16b
aesmc v18.16b,v18.16b
ld1 {v17.4s},[x7],#16
b.gt Loop3x_ctr32
aese v0.16b,v16.16b
aesmc v4.16b,v0.16b
aese v1.16b,v16.16b
aesmc v5.16b,v1.16b
ld1 {v2.16b},[x0],#16
add w9,w8,#1
aese v18.16b,v16.16b
aesmc v18.16b,v18.16b
ld1 {v3.16b},[x0],#16
rev w9,w9
aese v4.16b,v17.16b
aesmc v4.16b,v4.16b
aese v5.16b,v17.16b
aesmc v5.16b,v5.16b
ld1 {v19.16b},[x0],#16
mov x7,x3
aese v18.16b,v17.16b
aesmc v17.16b,v18.16b
aese v4.16b,v20.16b
aesmc v4.16b,v4.16b
aese v5.16b,v20.16b
aesmc v5.16b,v5.16b
eor v2.16b,v2.16b,v7.16b
add w10,w8,#2
aese v17.16b,v20.16b
aesmc v17.16b,v17.16b
eor v3.16b,v3.16b,v7.16b
add w8,w8,#3
aese v4.16b,v21.16b
aesmc v4.16b,v4.16b
aese v5.16b,v21.16b
aesmc v5.16b,v5.16b
// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
// around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
// 32-bit mode. See the comment above.
eor v19.16b,v19.16b,v7.16b
mov v6.s[3], w9
aese v17.16b,v21.16b
aesmc v17.16b,v17.16b
orr v0.16b,v6.16b,v6.16b
rev w10,w10
aese v4.16b,v22.16b
aesmc v4.16b,v4.16b
mov v6.s[3], w10
rev w12,w8
aese v5.16b,v22.16b
aesmc v5.16b,v5.16b
orr v1.16b,v6.16b,v6.16b
mov v6.s[3], w12
aese v17.16b,v22.16b
aesmc v17.16b,v17.16b
orr v18.16b,v6.16b,v6.16b
subs x2,x2,#3
aese v4.16b,v23.16b
aese v5.16b,v23.16b
aese v17.16b,v23.16b
eor v2.16b,v2.16b,v4.16b
ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
st1 {v2.16b},[x1],#16
eor v3.16b,v3.16b,v5.16b
mov w6,w5
st1 {v3.16b},[x1],#16
eor v19.16b,v19.16b,v17.16b
ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
st1 {v19.16b},[x1],#16
b.hs Loop3x_ctr32
adds x2,x2,#3
b.eq Lctr32_done
cmp x2,#1
mov x12,#16
csel x12,xzr,x12,eq
Lctr32_tail:
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
ld1 {v16.4s},[x7],#16
subs w6,w6,#2
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v17.4s},[x7],#16
b.gt Lctr32_tail
aese v0.16b,v16.16b
aesmc v0.16b,v0.16b
aese v1.16b,v16.16b
aesmc v1.16b,v1.16b
aese v0.16b,v17.16b
aesmc v0.16b,v0.16b
aese v1.16b,v17.16b
aesmc v1.16b,v1.16b
ld1 {v2.16b},[x0],x12
aese v0.16b,v20.16b
aesmc v0.16b,v0.16b
aese v1.16b,v20.16b
aesmc v1.16b,v1.16b
ld1 {v3.16b},[x0]
aese v0.16b,v21.16b
aesmc v0.16b,v0.16b
aese v1.16b,v21.16b
aesmc v1.16b,v1.16b
eor v2.16b,v2.16b,v7.16b
aese v0.16b,v22.16b
aesmc v0.16b,v0.16b
aese v1.16b,v22.16b
aesmc v1.16b,v1.16b
eor v3.16b,v3.16b,v7.16b
aese v0.16b,v23.16b
aese v1.16b,v23.16b
cmp x2,#1
eor v2.16b,v2.16b,v0.16b
eor v3.16b,v3.16b,v1.16b
st1 {v2.16b},[x1],#16
b.eq Lctr32_done
st1 {v3.16b},[x1]
Lctr32_done:
ldr x29,[sp],#16
ret
#endif
#endif // !OPENSSL_NO_ASM