vpaes-armv8.S

  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(__aarch64__)
  11. #if defined(BORINGSSL_PREFIX)
  12. #include <boringssl_prefix_symbols_asm.h>
  13. #endif
  14. #include <openssl/arm_arch.h>
  15. .section .rodata
  16. .align 7 // totally strategic alignment
  17. _vpaes_consts:
  18. Lk_mc_forward: // mc_forward
  19. .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
  20. .quad 0x080B0A0904070605, 0x000302010C0F0E0D
  21. .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
  22. .quad 0x000302010C0F0E0D, 0x080B0A0904070605
  23. Lk_mc_backward: // mc_backward
  24. .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
  25. .quad 0x020100030E0D0C0F, 0x0A09080B06050407
  26. .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
  27. .quad 0x0A09080B06050407, 0x020100030E0D0C0F
  28. Lk_sr: // sr
  29. .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
  30. .quad 0x030E09040F0A0500, 0x0B06010C07020D08
  31. .quad 0x0F060D040B020900, 0x070E050C030A0108
  32. .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
  33. //
  34. // "Hot" constants
  35. //
  36. Lk_inv: // inv, inva
  37. .quad 0x0E05060F0D080180, 0x040703090A0B0C02
  38. .quad 0x01040A060F0B0780, 0x030D0E0C02050809
  39. Lk_ipt: // input transform (lo, hi)
  40. .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
  41. .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
  42. Lk_sbo: // sbou, sbot
  43. .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
  44. .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
  45. Lk_sb1: // sb1u, sb1t
  46. .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
  47. .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
  48. Lk_sb2: // sb2u, sb2t
  49. .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
  50. .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
  51. //
  52. // Decryption stuff
  53. //
  54. Lk_dipt: // decryption input transform
  55. .quad 0x0F505B040B545F00, 0x154A411E114E451A
  56. .quad 0x86E383E660056500, 0x12771772F491F194
  57. Lk_dsbo: // decryption sbox final output
  58. .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
  59. .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
  60. Lk_dsb9: // decryption sbox output *9*u, *9*t
  61. .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
  62. .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
  63. Lk_dsbd: // decryption sbox output *D*u, *D*t
  64. .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
  65. .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
  66. Lk_dsbb: // decryption sbox output *B*u, *B*t
  67. .quad 0xD022649296B44200, 0x602646F6B0F2D404
  68. .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
  69. Lk_dsbe: // decryption sbox output *E*u, *E*t
  70. .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
  71. .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
  72. //
  73. // Key schedule constants
  74. //
  75. Lk_dksd: // decryption key schedule: invskew x*D
  76. .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
  77. .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
  78. Lk_dksb: // decryption key schedule: invskew x*B
  79. .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
  80. .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
  81. Lk_dkse: // decryption key schedule: invskew x*E + 0x63
  82. .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
  83. .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
  84. Lk_dks9: // decryption key schedule: invskew x*9
  85. .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
  86. .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
  87. Lk_rcon: // rcon
  88. .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
  89. Lk_opt: // output transform
  90. .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
  91. .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
  92. Lk_deskew: // deskew tables: inverts the sbox's "skew"
  93. .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
  94. .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
  95. .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
  96. .align 2
  97. .align 6
  98. .text
  99. ##
  100. ## _aes_preheat
  101. ##
  102. ## Fills register %r10 -> .aes_consts (so you can -fPIC)
  103. ## and %xmm9-%xmm15 as specified below.
  104. ##
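// Note: the block comment above is inherited from the x86-64 Perl source. In this
// AArch64 translation the constant pointer is x10, and the "hot" constants are
// preloaded into vector registers rather than %xmm9-%xmm15: v17 = 0x0F nibble mask,
// v18-v19 = Lk_inv, v20-v21 = Lk_ipt, v22-v23 = Lk_sbo, v24-v27 = Lk_sb1/Lk_sb2.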
  105. .def _vpaes_encrypt_preheat
  106. .type 32
  107. .endef
  108. .align 4
  109. _vpaes_encrypt_preheat:
  110. adrp x10, Lk_inv
  111. add x10, x10, :lo12:Lk_inv
  112. movi v17.16b, #0x0f
  113. ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
  114. ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
  115. ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
  116. ret
  117. ##
  118. ## _aes_encrypt_core
  119. ##
  120. ## AES-encrypt %xmm0.
  121. ##
  122. ## Inputs:
  123. ## %xmm0 = input
  124. ## %xmm9-%xmm15 as in _vpaes_preheat
  125. ## (%rdx) = scheduled keys
  126. ##
  127. ## Output in %xmm0
  128. ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
  129. ## Preserves %xmm6 - %xmm8 so you get some local vectors
  130. ##
  131. ##
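// AArch64 register usage for this translation (the comment above still speaks in x86
// terms): input block in v7, scheduled keys at x2 with the round count at [x2,#240],
// result returned in v0; v1-v5, v16, x9-x11 and w8 are clobbered.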
  132. .def _vpaes_encrypt_core
  133. .type 32
  134. .endef
  135. .align 4
  136. _vpaes_encrypt_core:
  137. mov x9, x2
  138. ldr w8, [x2,#240] // pull rounds
  139. adrp x11, Lk_mc_forward+16
  140. add x11, x11, :lo12:Lk_mc_forward+16
  141. // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
  142. ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
  143. and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  144. ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  145. tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
  146. // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
  147. tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
  148. eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
  149. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  150. b Lenc_entry
  151. .align 4
  152. Lenc_loop:
  153. // middle of middle round
  154. add x10, x11, #0x40
  155. tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  156. ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
  157. tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  158. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  159. tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  160. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  161. tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  162. ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
  163. tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  164. eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  165. tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  166. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  167. tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  168. eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  169. and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
  170. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  171. sub w8, w8, #1 // nr--
  172. Lenc_entry:
  173. // top of round
  174. and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
  175. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  176. tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  177. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  178. tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  179. tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  180. eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  181. eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  182. tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  183. tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  184. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  185. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  186. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
  187. cbnz w8, Lenc_loop
  188. // middle of last round
  189. add x10, x11, #0x80
  190. // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
  191. // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  192. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  193. ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
  194. tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  195. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  196. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  197. tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
  198. ret
  199. .globl vpaes_encrypt
  200. .def vpaes_encrypt
  201. .type 32
  202. .endef
  203. .align 4
  204. vpaes_encrypt:
  205. AARCH64_SIGN_LINK_REGISTER
  206. stp x29,x30,[sp,#-16]!
  207. add x29,sp,#0
  208. ld1 {v7.16b}, [x0]
  209. bl _vpaes_encrypt_preheat
  210. bl _vpaes_encrypt_core
  211. st1 {v0.16b}, [x1]
  212. ldp x29,x30,[sp],#16
  213. AARCH64_VALIDATE_LINK_REGISTER
  214. ret
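// Single-block usage sketch (hedged; assumes the usual BoringSSL prototype
//   void vpaes_encrypt(const uint8_t in[16], uint8_t out[16], const AES_KEY *key);
// which matches the register use above: x0 = in, x1 = out, x2 = key):
//   AES_KEY ks;
//   vpaes_set_encrypt_key(user_key, 128, &ks);
//   vpaes_encrypt(plaintext, ciphertext, &ks);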
  215. .def _vpaes_encrypt_2x
  216. .type 32
  217. .endef
  218. .align 4
  219. _vpaes_encrypt_2x:
  220. mov x9, x2
  221. ldr w8, [x2,#240] // pull rounds
  222. adrp x11, Lk_mc_forward+16
  223. add x11, x11, :lo12:Lk_mc_forward+16
  224. // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
  225. ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
  226. and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  227. ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  228. and v9.16b, v15.16b, v17.16b
  229. ushr v8.16b, v15.16b, #4
  230. tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
  231. tbl v9.16b, {v20.16b}, v9.16b
  232. // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
  233. tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
  234. tbl v10.16b, {v21.16b}, v8.16b
  235. eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
  236. eor v8.16b, v9.16b, v16.16b
  237. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  238. eor v8.16b, v8.16b, v10.16b
  239. b Lenc_2x_entry
  240. .align 4
  241. Lenc_2x_loop:
  242. // middle of middle round
  243. add x10, x11, #0x40
  244. tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  245. tbl v12.16b, {v25.16b}, v10.16b
  246. ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
  247. tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  248. tbl v8.16b, {v24.16b}, v11.16b
  249. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  250. eor v12.16b, v12.16b, v16.16b
  251. tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  252. tbl v13.16b, {v27.16b}, v10.16b
  253. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  254. eor v8.16b, v8.16b, v12.16b
  255. tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  256. tbl v10.16b, {v26.16b}, v11.16b
  257. ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
  258. tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  259. tbl v11.16b, {v8.16b}, v1.16b
  260. eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  261. eor v10.16b, v10.16b, v13.16b
  262. tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  263. tbl v8.16b, {v8.16b}, v4.16b
  264. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  265. eor v11.16b, v11.16b, v10.16b
  266. tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  267. tbl v12.16b, {v11.16b},v1.16b
  268. eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  269. eor v8.16b, v8.16b, v11.16b
  270. and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
  271. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  272. eor v8.16b, v8.16b, v12.16b
  273. sub w8, w8, #1 // nr--
  274. Lenc_2x_entry:
  275. // top of round
  276. and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
  277. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  278. and v9.16b, v8.16b, v17.16b
  279. ushr v8.16b, v8.16b, #4
  280. tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  281. tbl v13.16b, {v19.16b},v9.16b
  282. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  283. eor v9.16b, v9.16b, v8.16b
  284. tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  285. tbl v11.16b, {v18.16b},v8.16b
  286. tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  287. tbl v12.16b, {v18.16b},v9.16b
  288. eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  289. eor v11.16b, v11.16b, v13.16b
  290. eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  291. eor v12.16b, v12.16b, v13.16b
  292. tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  293. tbl v10.16b, {v18.16b},v11.16b
  294. tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  295. tbl v11.16b, {v18.16b},v12.16b
  296. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  297. eor v10.16b, v10.16b, v9.16b
  298. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  299. eor v11.16b, v11.16b, v8.16b
  300. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
  301. cbnz w8, Lenc_2x_loop
  302. // middle of last round
  303. add x10, x11, #0x80
  304. // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
  305. // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  306. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  307. tbl v12.16b, {v22.16b}, v10.16b
  308. ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
  309. tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  310. tbl v8.16b, {v23.16b}, v11.16b
  311. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  312. eor v12.16b, v12.16b, v16.16b
  313. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  314. eor v8.16b, v8.16b, v12.16b
  315. tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
  316. tbl v1.16b, {v8.16b},v1.16b
  317. ret
  318. .def _vpaes_decrypt_preheat
  319. .type 32
  320. .endef
  321. .align 4
  322. _vpaes_decrypt_preheat:
  323. adrp x10, Lk_inv
  324. add x10, x10, :lo12:Lk_inv
  325. movi v17.16b, #0x0f
  326. adrp x11, Lk_dipt
  327. add x11, x11, :lo12:Lk_dipt
  328. ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
  329. ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
  330. ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
  331. ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
  332. ret
  333. ##
  334. ## Decryption core
  335. ##
  336. ## Same API as encryption core.
  337. ##
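// As with the encryption core: input block in v7, key schedule pointer in x2,
// result in v0.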
  338. .def _vpaes_decrypt_core
  339. .type 32
  340. .endef
  341. .align 4
  342. _vpaes_decrypt_core:
  343. mov x9, x2
  344. ldr w8, [x2,#240] // pull rounds
  345. // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
  346. lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
  347. eor x11, x11, #0x30 // xor $0x30, %r11
  348. adrp x10, Lk_sr
  349. add x10, x10, :lo12:Lk_sr
  350. and x11, x11, #0x30 // and $0x30, %r11
  351. add x11, x11, x10
  352. adrp x10, Lk_mc_forward+48
  353. add x10, x10, :lo12:Lk_mc_forward+48
  354. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
  355. and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  356. ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  357. tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
  358. ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
  359. // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
  360. tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
  361. eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
  362. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  363. b Ldec_entry
  364. .align 4
  365. Ldec_loop:
  366. //
  367. // Inverse mix columns
  368. //
  369. // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  370. // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  371. tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  372. tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  373. eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
  374. // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  375. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  376. // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  377. tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  378. tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  379. tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  380. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  381. // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  382. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  383. // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  384. tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  385. tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  386. tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  387. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  388. // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
  389. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  390. // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
  391. tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  392. tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  393. tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  394. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  395. ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
  396. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  397. sub w8, w8, #1 // sub $1,%rax # nr--
  398. Ldec_entry:
  399. // top of round
  400. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
  401. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  402. tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  403. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  404. tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  405. tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  406. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  407. eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  408. tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  409. tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  410. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  411. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  412. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
  413. cbnz w8, Ldec_loop
  414. // middle of last round
  415. // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  416. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  417. // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  418. ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
  419. tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  420. eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  421. eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  422. tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
  423. ret
  424. .globl vpaes_decrypt
  425. .def vpaes_decrypt
  426. .type 32
  427. .endef
  428. .align 4
  429. vpaes_decrypt:
  430. AARCH64_SIGN_LINK_REGISTER
  431. stp x29,x30,[sp,#-16]!
  432. add x29,sp,#0
  433. ld1 {v7.16b}, [x0]
  434. bl _vpaes_decrypt_preheat
  435. bl _vpaes_decrypt_core
  436. st1 {v0.16b}, [x1]
  437. ldp x29,x30,[sp],#16
  438. AARCH64_VALIDATE_LINK_REGISTER
  439. ret
  440. // v14-v15 input, v0-v1 output
  441. .def _vpaes_decrypt_2x
  442. .type 32
  443. .endef
  444. .align 4
  445. _vpaes_decrypt_2x:
  446. mov x9, x2
  447. ldr w8, [x2,#240] // pull rounds
  448. // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
  449. lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
  450. eor x11, x11, #0x30 // xor $0x30, %r11
  451. adrp x10, Lk_sr
  452. add x10, x10, :lo12:Lk_sr
  453. and x11, x11, #0x30 // and $0x30, %r11
  454. add x11, x11, x10
  455. adrp x10, Lk_mc_forward+48
  456. add x10, x10, :lo12:Lk_mc_forward+48
  457. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
  458. and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  459. ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  460. and v9.16b, v15.16b, v17.16b
  461. ushr v8.16b, v15.16b, #4
  462. tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
  463. tbl v10.16b, {v20.16b},v9.16b
  464. ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
  465. // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
  466. tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
  467. tbl v8.16b, {v21.16b},v8.16b
  468. eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
  469. eor v10.16b, v10.16b, v16.16b
  470. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  471. eor v8.16b, v8.16b, v10.16b
  472. b Ldec_2x_entry
  473. .align 4
  474. Ldec_2x_loop:
  475. //
  476. // Inverse mix columns
  477. //
  478. // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  479. // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  480. tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  481. tbl v12.16b, {v24.16b}, v10.16b
  482. tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  483. tbl v9.16b, {v25.16b}, v11.16b
  484. eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
  485. eor v8.16b, v12.16b, v16.16b
  486. // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  487. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  488. eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  489. // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  490. tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  491. tbl v12.16b, {v26.16b}, v10.16b
  492. tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  493. tbl v8.16b, {v8.16b},v5.16b
  494. tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  495. tbl v9.16b, {v27.16b}, v11.16b
  496. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  497. eor v8.16b, v8.16b, v12.16b
  498. // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  499. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  500. eor v8.16b, v8.16b, v9.16b
  501. // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  502. tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  503. tbl v12.16b, {v28.16b}, v10.16b
  504. tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  505. tbl v8.16b, {v8.16b},v5.16b
  506. tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  507. tbl v9.16b, {v29.16b}, v11.16b
  508. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  509. eor v8.16b, v8.16b, v12.16b
  510. // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
  511. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  512. eor v8.16b, v8.16b, v9.16b
  513. // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
  514. tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  515. tbl v12.16b, {v30.16b}, v10.16b
  516. tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  517. tbl v8.16b, {v8.16b},v5.16b
  518. tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  519. tbl v9.16b, {v31.16b}, v11.16b
  520. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  521. eor v8.16b, v8.16b, v12.16b
  522. ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
  523. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  524. eor v8.16b, v8.16b, v9.16b
  525. sub w8, w8, #1 // sub $1,%rax # nr--
  526. Ldec_2x_entry:
  527. // top of round
  528. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
  529. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  530. and v9.16b, v8.16b, v17.16b
  531. ushr v8.16b, v8.16b, #4
  532. tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  533. tbl v10.16b, {v19.16b},v9.16b
  534. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  535. eor v9.16b, v9.16b, v8.16b
  536. tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  537. tbl v11.16b, {v18.16b},v8.16b
  538. tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  539. tbl v12.16b, {v18.16b},v9.16b
  540. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  541. eor v11.16b, v11.16b, v10.16b
  542. eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  543. eor v12.16b, v12.16b, v10.16b
  544. tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  545. tbl v10.16b, {v18.16b},v11.16b
  546. tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  547. tbl v11.16b, {v18.16b},v12.16b
  548. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  549. eor v10.16b, v10.16b, v9.16b
  550. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  551. eor v11.16b, v11.16b, v8.16b
  552. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
  553. cbnz w8, Ldec_2x_loop
  554. // middle of last round
  555. // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  556. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  557. tbl v12.16b, {v22.16b}, v10.16b
  558. // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  559. tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  560. tbl v9.16b, {v23.16b}, v11.16b
  561. ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
  562. eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  563. eor v12.16b, v12.16b, v16.16b
  564. eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  565. eor v8.16b, v9.16b, v12.16b
  566. tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
  567. tbl v1.16b, {v8.16b},v2.16b
  568. ret
  569. ########################################################
  570. ## ##
  571. ## AES key schedule ##
  572. ## ##
  573. ########################################################
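// The schedule below is driven by vpaes_set_encrypt_key / vpaes_set_decrypt_key
// further down: x0 = user key, w1 = key size in bits, x2 = round-key output,
// w3 = 0 for encryption scheduling, 1 for decryption scheduling.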
  574. .def _vpaes_key_preheat
  575. .type 32
  576. .endef
  577. .align 4
  578. _vpaes_key_preheat:
  579. adrp x10, Lk_inv
  580. add x10, x10, :lo12:Lk_inv
  581. movi v16.16b, #0x5b // Lk_s63
  582. adrp x11, Lk_sb1
  583. add x11, x11, :lo12:Lk_sb1
  584. movi v17.16b, #0x0f // Lk_s0F
  585. ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
  586. adrp x10, Lk_dksd
  587. add x10, x10, :lo12:Lk_dksd
  588. ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
  589. adrp x11, Lk_mc_forward
  590. add x11, x11, :lo12:Lk_mc_forward
  591. ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
  592. ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
  593. ld1 {v8.2d}, [x10] // Lk_rcon
  594. ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
  595. ret
  596. .def _vpaes_schedule_core
  597. .type 32
  598. .endef
  599. .align 4
  600. _vpaes_schedule_core:
  601. AARCH64_SIGN_LINK_REGISTER
  602. stp x29, x30, [sp,#-16]!
  603. add x29,sp,#0
  604. bl _vpaes_key_preheat // load the tables
  605. ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
  606. // input transform
  607. mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
  608. bl _vpaes_schedule_transform
  609. mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
  610. adrp x10, Lk_sr // lea Lk_sr(%rip),%r10
  611. add x10, x10, :lo12:Lk_sr
  612. add x8, x8, x10
  613. cbnz w3, Lschedule_am_decrypting
  614. // encrypting, output zeroth round key after transform
  615. st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
  616. b Lschedule_go
  617. Lschedule_am_decrypting:
  618. // decrypting, output zeroth round key after shiftrows
  619. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
  620. tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  621. st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
  622. eor x8, x8, #0x30 // xor $0x30, %r8
  623. Lschedule_go:
  624. cmp w1, #192 // cmp $192, %esi
  625. b.hi Lschedule_256
  626. b.eq Lschedule_192
  627. // 128: fall through
  628. ##
  629. ## .schedule_128
  630. ##
  631. ## 128-bit specific part of key schedule.
  632. ##
  633. ## This schedule is really simple, because all its parts
  634. ## are accomplished by the subroutines.
  635. ##
  636. Lschedule_128:
  637. mov x0, #10 // mov $10, %esi
  638. Loop_schedule_128:
  639. sub x0, x0, #1 // dec %esi
  640. bl _vpaes_schedule_round
  641. cbz x0, Lschedule_mangle_last
  642. bl _vpaes_schedule_mangle // write output
  643. b Loop_schedule_128
  644. ##
  645. ## .aes_schedule_192
  646. ##
  647. ## 192-bit specific part of key schedule.
  648. ##
  649. ## The main body of this schedule is the same as the 128-bit
  650. ## schedule, but with more smearing. The long, high side is
  651. ## stored in %xmm7 as before, and the short, low side is in
  652. ## the high bits of %xmm6.
  653. ##
  654. ## This schedule is somewhat nastier, however, because each
  655. ## round produces 192 bits of key material, or 1.5 round keys.
  656. ## Therefore, on each cycle we do 2 rounds and produce 3 round
  657. ## keys.
  658. ##
  659. .align 4
  660. Lschedule_192:
  661. sub x0, x0, #8
  662. ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
  663. bl _vpaes_schedule_transform // input transform
  664. mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
  665. eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
  666. ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
  667. mov x0, #4 // mov $4, %esi
  668. Loop_schedule_192:
  669. sub x0, x0, #1 // dec %esi
  670. bl _vpaes_schedule_round
  671. ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
  672. bl _vpaes_schedule_mangle // save key n
  673. bl _vpaes_schedule_192_smear
  674. bl _vpaes_schedule_mangle // save key n+1
  675. bl _vpaes_schedule_round
  676. cbz x0, Lschedule_mangle_last
  677. bl _vpaes_schedule_mangle // save key n+2
  678. bl _vpaes_schedule_192_smear
  679. b Loop_schedule_192
  680. ##
  681. ## .aes_schedule_256
  682. ##
  683. ## 256-bit specific part of key schedule.
  684. ##
  685. ## The structure here is very similar to the 128-bit
  686. ## schedule, but with an additional "low side" in
  687. ## %xmm6. The low side's rounds are the same as the
  688. ## high side's, except no rcon and no rotation.
  689. ##
  690. .align 4
  691. Lschedule_256:
  692. ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
  693. bl _vpaes_schedule_transform // input transform
  694. mov x0, #7 // mov $7, %esi
  695. Loop_schedule_256:
  696. sub x0, x0, #1 // dec %esi
  697. bl _vpaes_schedule_mangle // output low result
  698. mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
  699. // high round
  700. bl _vpaes_schedule_round
  701. cbz x0, Lschedule_mangle_last
  702. bl _vpaes_schedule_mangle
  703. // low round. swap xmm7 and xmm6
  704. dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
  705. movi v4.16b, #0
  706. mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
  707. mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
  708. bl _vpaes_schedule_low_round
  709. mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
  710. b Loop_schedule_256
  711. ##
  712. ## .aes_schedule_mangle_last
  713. ##
  714. ## Mangler for last round of key schedule
  715. ## Mangles %xmm0
  716. ## when encrypting, outputs out(%xmm0) ^ 63
  717. ## when decrypting, outputs unskew(%xmm0)
  718. ##
  719. ## Always called right before return... jumps to cleanup and exits
  720. ##
  721. .align 4
  722. Lschedule_mangle_last:
  723. // schedule last round key from xmm0
  724. adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew
  725. add x11, x11, :lo12:Lk_deskew
  726. cbnz w3, Lschedule_mangle_last_dec
  727. // encrypting
  728. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
  729. adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform
  730. add x11, x11, :lo12:Lk_opt
  731. add x2, x2, #32 // add $32, %rdx
  732. tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
  733. Lschedule_mangle_last_dec:
  734. ld1 {v20.2d,v21.2d}, [x11] // reload constants
  735. sub x2, x2, #16 // add $-16, %rdx
  736. eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
  737. bl _vpaes_schedule_transform // output transform
  738. st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
  739. // cleanup
  740. eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
  741. eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
  742. eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
  743. eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
  744. eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
  745. eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
  746. eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
  747. eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
  748. ldp x29, x30, [sp],#16
  749. AARCH64_VALIDATE_LINK_REGISTER
  750. ret
  751. ##
  752. ## .aes_schedule_192_smear
  753. ##
  754. ## Smear the short, low side in the 192-bit key schedule.
  755. ##
  756. ## Inputs:
  757. ## %xmm7: high side, b a x y
  758. ## %xmm6: low side, d c 0 0
  759. ## %xmm13: 0
  760. ##
  761. ## Outputs:
  762. ## %xmm6: b+c+d b+c 0 0
  763. ## %xmm0: b+c+d b+c b a
  764. ##
  765. .def _vpaes_schedule_192_smear
  766. .type 32
  767. .endef
  768. .align 4
  769. _vpaes_schedule_192_smear:
  770. movi v1.16b, #0
  771. dup v0.4s, v7.s[3]
  772. ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
  773. ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
  774. eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
  775. eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
  776. eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
  777. mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
  778. ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
  779. ret
  780. ##
  781. ## .aes_schedule_round
  782. ##
  783. ## Runs one main round of the key schedule on %xmm0, %xmm7
  784. ##
  785. ## Specifically, runs subbytes on the high dword of %xmm0
  786. ## then rotates it by one byte and xors into the low dword of
  787. ## %xmm7.
  788. ##
  789. ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
  790. ## next rcon.
  791. ##
  792. ## Smears the dwords of %xmm7 by xoring the low into the
  793. ## second low, result into third, result into highest.
  794. ##
  795. ## Returns results in %xmm7 = %xmm0.
  796. ## Clobbers %xmm1-%xmm4, %r11.
  797. ##
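// AArch64 mapping (set up by _vpaes_key_preheat): rcon in v8, round state in v7/v0,
// Lk_s63 in v16, 0x0F mask in v17; v1-v4 are scratch and the lookups use the tables
// preloaded in v18-v19 (Lk_inv) and v22-v23 (Lk_sb1).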
  798. .def _vpaes_schedule_round
  799. .type 32
  800. .endef
  801. .align 4
  802. _vpaes_schedule_round:
  803. // extract rcon from xmm8
  804. movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
  805. ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
  806. ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
  807. eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
  808. // rotate
  809. dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
  810. ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
  811. // fall through...
  812. // low round: same as high round, but no rotation and no rcon.
  813. _vpaes_schedule_low_round:
  814. // smear xmm7
  815. ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
  816. eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
  817. ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
  818. // subbytes
  819. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
  820. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  821. eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
  822. tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  823. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  824. tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  825. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  826. tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  827. eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
  828. tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
  829. eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  830. tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
  831. eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
  832. eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
  833. tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
  834. tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
  835. eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
  836. // add in smeared stuff
  837. eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
  838. eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
  839. ret
  840. ##
  841. ## .aes_schedule_transform
  842. ##
  843. ## Linear-transform %xmm0 according to tables at (%r11)
  844. ##
  845. ## Requires that %xmm9 = 0x0F0F... as in preheat
  846. ## Output in %xmm0
  847. ## Clobbers %xmm1, %xmm2
  848. ##
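// In this translation the "(%r11)" lo/hi tables are the pair preloaded in v20/v21 by
// the callers (Lk_ipt from _vpaes_key_preheat, or Lk_opt / Lk_deskew reloaded at
// Lschedule_mangle_last_dec), with the 0x0F mask in v17.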
  849. .def _vpaes_schedule_transform
  850. .type 32
  851. .endef
  852. .align 4
  853. _vpaes_schedule_transform:
  854. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  855. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  856. // vmovdqa (%r11), %xmm2 # lo
  857. tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
  858. // vmovdqa 16(%r11), %xmm1 # hi
  859. tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
  860. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  861. ret
  862. ##
  863. ## .aes_schedule_mangle
  864. ##
  865. ## Mangle xmm0 from (basis-transformed) standard version
  866. ## to our version.
  867. ##
  868. ## On encrypt,
  869. ## xor with 0x63
  870. ## multiply by circulant 0,1,1,1
  871. ## apply shiftrows transform
  872. ##
  873. ## On decrypt,
  874. ## xor with 0x63
  875. ## multiply by "inverse mixcolumns" circulant E,B,D,9
  876. ## deskew
  877. ## apply shiftrows transform
  878. ##
  879. ##
  880. ## Writes out to (%rdx), and increments or decrements it
  881. ## Keeps track of round number mod 4 in %r8
  882. ## Preserves xmm0
  883. ## Clobbers xmm1-xmm5
  884. ##
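// AArch64 mapping: output pointer in x2, round-number-mod-4 state in x8 (indexing
// Lk_sr), Lk_mc_forward[0] preloaded in v9 and Lk_s63 in v16.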
  885. .def _vpaes_schedule_mangle
  886. .type 32
  887. .endef
  888. .align 4
  889. _vpaes_schedule_mangle:
  890. mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
  891. // vmovdqa .Lk_mc_forward(%rip),%xmm5
  892. cbnz w3, Lschedule_mangle_dec
  893. // encrypting
  894. eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
  895. add x2, x2, #16 // add $16, %rdx
  896. tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
  897. tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
  898. tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
  899. eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
  900. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
  901. eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
  902. b Lschedule_mangle_both
  903. .align 4
  904. Lschedule_mangle_dec:
  905. // inverse mix columns
  906. // lea .Lk_dksd(%rip),%r11
  907. ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
  908. and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
  909. // vmovdqa 0x00(%r11), %xmm2
  910. tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  911. // vmovdqa 0x10(%r11), %xmm3
  912. tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  913. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
  914. tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
  915. // vmovdqa 0x20(%r11), %xmm2
  916. tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  917. eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
  918. // vmovdqa 0x30(%r11), %xmm3
  919. tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  920. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
  921. tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
  922. // vmovdqa 0x40(%r11), %xmm2
  923. tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  924. eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
  925. // vmovdqa 0x50(%r11), %xmm3
  926. tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  927. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
  928. // vmovdqa 0x60(%r11), %xmm2
  929. tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  930. tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
  931. // vmovdqa 0x70(%r11), %xmm4
  932. tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
  933. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
  934. eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
  935. eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
  936. sub x2, x2, #16 // add $-16, %rdx
  937. Lschedule_mangle_both:
  938. tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  939. add x8, x8, #48 // add $-16, %r8
  940. and x8, x8, #~(1<<6) // and $0x30, %r8
  941. st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
  942. ret
  943. .globl vpaes_set_encrypt_key
  944. .def vpaes_set_encrypt_key
  945. .type 32
  946. .endef
  947. .align 4
  948. vpaes_set_encrypt_key:
  949. AARCH64_SIGN_LINK_REGISTER
  950. stp x29,x30,[sp,#-16]!
  951. add x29,sp,#0
  952. stp d8,d9,[sp,#-16]! // ABI spec says so
  953. lsr w9, w1, #5 // shr $5,%eax
  954. add w9, w9, #5 // $5,%eax
  955. str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  956. mov w3, #0 // mov $0,%ecx
  957. mov x8, #0x30 // mov $0x30,%r8d
  958. bl _vpaes_schedule_core
  959. eor x0, x0, x0
  960. ldp d8,d9,[sp],#16
  961. ldp x29,x30,[sp],#16
  962. AARCH64_VALIDATE_LINK_REGISTER
  963. ret
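// Key-setup sketch (hedged; assumes the usual BoringSSL prototype
//   int vpaes_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *out);
// with x0 = key, w1 = bits, x2 = out). Per the comment above, out->rounds at offset
// 240 is set to bits/32 + 5, i.e. 9/11/13 for 128/192/256-bit keys, and the cores
// read it back with "ldr w8, [x2,#240]".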
  964. .globl vpaes_set_decrypt_key
  965. .def vpaes_set_decrypt_key
  966. .type 32
  967. .endef
  968. .align 4
  969. vpaes_set_decrypt_key:
  970. AARCH64_SIGN_LINK_REGISTER
  971. stp x29,x30,[sp,#-16]!
  972. add x29,sp,#0
  973. stp d8,d9,[sp,#-16]! // ABI spec says so
  974. lsr w9, w1, #5 // shr $5,%eax
  975. add w9, w9, #5 // $5,%eax
  976. str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  977. lsl w9, w9, #4 // shl $4,%eax
  978. add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
  979. add x2, x2, x9
  980. mov w3, #1 // mov $1,%ecx
  981. lsr w8, w1, #1 // shr $1,%r8d
  982. and x8, x8, #32 // and $32,%r8d
  983. eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
  984. bl _vpaes_schedule_core
  985. ldp d8,d9,[sp],#16
  986. ldp x29,x30,[sp],#16
  987. AARCH64_VALIDATE_LINK_REGISTER
  988. ret
  989. .globl vpaes_cbc_encrypt
  990. .def vpaes_cbc_encrypt
  991. .type 32
  992. .endef
  993. .align 4
  994. vpaes_cbc_encrypt:
  995. AARCH64_SIGN_LINK_REGISTER
  996. cbz x2, Lcbc_abort
  997. cmp w5, #0 // check direction
  998. b.eq vpaes_cbc_decrypt
  999. stp x29,x30,[sp,#-16]!
  1000. add x29,sp,#0
  1001. mov x17, x2 // reassign
  1002. mov x2, x3 // reassign
  1003. ld1 {v0.16b}, [x4] // load ivec
  1004. bl _vpaes_encrypt_preheat
  1005. b Lcbc_enc_loop
  1006. .align 4
  1007. Lcbc_enc_loop:
  1008. ld1 {v7.16b}, [x0],#16 // load input
  1009. eor v7.16b, v7.16b, v0.16b // xor with ivec
  1010. bl _vpaes_encrypt_core
  1011. st1 {v0.16b}, [x1],#16 // save output
  1012. subs x17, x17, #16
  1013. b.hi Lcbc_enc_loop
  1014. st1 {v0.16b}, [x4] // write ivec
  1015. ldp x29,x30,[sp],#16
  1016. Lcbc_abort:
  1017. AARCH64_VALIDATE_LINK_REGISTER
  1018. ret
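// CBC usage sketch (hedged; assumes the usual prototype
//   void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len,
//                          const AES_KEY *key, uint8_t ivec[16], int enc);
// matching the registers above: x0 = in, x1 = out, x2 = len in bytes, x3 = key,
// x4 = ivec, w5 = enc; enc == 0 branches to vpaes_cbc_decrypt below).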
  1019. .def vpaes_cbc_decrypt
  1020. .type 32
  1021. .endef
  1022. .align 4
  1023. vpaes_cbc_decrypt:
  1024. // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
  1025. // only from vpaes_cbc_encrypt which has already signed the return address.
  1026. stp x29,x30,[sp,#-16]!
  1027. add x29,sp,#0
  1028. stp d8,d9,[sp,#-16]! // ABI spec says so
  1029. stp d10,d11,[sp,#-16]!
  1030. stp d12,d13,[sp,#-16]!
  1031. stp d14,d15,[sp,#-16]!
  1032. mov x17, x2 // reassign
  1033. mov x2, x3 // reassign
  1034. ld1 {v6.16b}, [x4] // load ivec
  1035. bl _vpaes_decrypt_preheat
  1036. tst x17, #16
  1037. b.eq Lcbc_dec_loop2x
  1038. ld1 {v7.16b}, [x0], #16 // load input
  1039. bl _vpaes_decrypt_core
  1040. eor v0.16b, v0.16b, v6.16b // xor with ivec
  1041. orr v6.16b, v7.16b, v7.16b // next ivec value
  1042. st1 {v0.16b}, [x1], #16
  1043. subs x17, x17, #16
  1044. b.ls Lcbc_dec_done
  1045. .align 4
  1046. Lcbc_dec_loop2x:
  1047. ld1 {v14.16b,v15.16b}, [x0], #32
  1048. bl _vpaes_decrypt_2x
  1049. eor v0.16b, v0.16b, v6.16b // xor with ivec
  1050. eor v1.16b, v1.16b, v14.16b
  1051. orr v6.16b, v15.16b, v15.16b
  1052. st1 {v0.16b,v1.16b}, [x1], #32
  1053. subs x17, x17, #32
  1054. b.hi Lcbc_dec_loop2x
  1055. Lcbc_dec_done:
  1056. st1 {v6.16b}, [x4]
  1057. ldp d14,d15,[sp],#16
  1058. ldp d12,d13,[sp],#16
  1059. ldp d10,d11,[sp],#16
  1060. ldp d8,d9,[sp],#16
  1061. ldp x29,x30,[sp],#16
  1062. AARCH64_VALIDATE_LINK_REGISTER
  1063. ret
  1064. .globl vpaes_ctr32_encrypt_blocks
  1065. .def vpaes_ctr32_encrypt_blocks
  1066. .type 32
  1067. .endef
  1068. .align 4
  1069. vpaes_ctr32_encrypt_blocks:
  1070. AARCH64_SIGN_LINK_REGISTER
  1071. stp x29,x30,[sp,#-16]!
  1072. add x29,sp,#0
  1073. stp d8,d9,[sp,#-16]! // ABI spec says so
  1074. stp d10,d11,[sp,#-16]!
  1075. stp d12,d13,[sp,#-16]!
  1076. stp d14,d15,[sp,#-16]!
  1077. cbz x2, Lctr32_done
  1078. // Note, unlike the other functions, x2 here is measured in blocks,
  1079. // not bytes.
  1080. mov x17, x2
  1081. mov x2, x3
  1082. // Load the IV and counter portion.
  1083. ldr w6, [x4, #12]
  1084. ld1 {v7.16b}, [x4]
  1085. bl _vpaes_encrypt_preheat
  1086. tst x17, #1
  1087. rev w6, w6 // The counter is big-endian.
  1088. b.eq Lctr32_prep_loop
  1089. // Handle one block so the remaining block count is even for
  1090. // _vpaes_encrypt_2x.
  1091. ld1 {v6.16b}, [x0], #16 // Load input ahead of time
  1092. bl _vpaes_encrypt_core
  1093. eor v0.16b, v0.16b, v6.16b // XOR input and result
  1094. st1 {v0.16b}, [x1], #16
  1095. subs x17, x17, #1
  1096. // Update the counter.
  1097. add w6, w6, #1
  1098. rev w7, w6
  1099. mov v7.s[3], w7
  1100. b.ls Lctr32_done
  1101. Lctr32_prep_loop:
  1102. // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
  1103. // uses v14 and v15.
  1104. mov v15.16b, v7.16b
  1105. mov v14.16b, v7.16b
  1106. add w6, w6, #1
  1107. rev w7, w6
  1108. mov v15.s[3], w7
  1109. Lctr32_loop:
  1110. ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
  1111. bl _vpaes_encrypt_2x
  1112. eor v0.16b, v0.16b, v6.16b // XOR input and result
  1113. eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
  1114. st1 {v0.16b,v1.16b}, [x1], #32
  1115. subs x17, x17, #2
  1116. // Update the counter.
  1117. add w7, w6, #1
  1118. add w6, w6, #2
  1119. rev w7, w7
  1120. mov v14.s[3], w7
  1121. rev w7, w6
  1122. mov v15.s[3], w7
  1123. b.hi Lctr32_loop
  1124. Lctr32_done:
  1125. ldp d14,d15,[sp],#16
  1126. ldp d12,d13,[sp],#16
  1127. ldp d10,d11,[sp],#16
  1128. ldp d8,d9,[sp],#16
  1129. ldp x29,x30,[sp],#16
  1130. AARCH64_VALIDATE_LINK_REGISTER
  1131. ret
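// CTR usage sketch (hedged; assumes the usual prototype
//   void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks,
//                                   const AES_KEY *key, const uint8_t ivec[16]);
// note that x2 is a block count here, and only the low 32 counter bits, kept
// big-endian in the last word of the IV, are incremented).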
  1132. #endif
  1133. #endif // !OPENSSL_NO_ASM