vpaes-armv7.S

  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(__arm__)
  11. #if defined(BORINGSSL_PREFIX)
  12. #include <boringssl_prefix_symbols_asm.h>
  13. #endif
  14. .syntax unified
  15. .arch armv7-a
  16. .fpu neon
  17. #if defined(__thumb2__)
  18. .thumb
  19. #else
  20. .code 32
  21. #endif
  22. .text
  23. .type _vpaes_consts,%object
  24. .align 7 @ totally strategic alignment
  25. _vpaes_consts:
  26. .Lk_mc_forward:@ mc_forward
  27. .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
  28. .quad 0x080B0A0904070605, 0x000302010C0F0E0D
  29. .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
  30. .quad 0x000302010C0F0E0D, 0x080B0A0904070605
  31. .Lk_mc_backward:@ mc_backward
  32. .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
  33. .quad 0x020100030E0D0C0F, 0x0A09080B06050407
  34. .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
  35. .quad 0x0A09080B06050407, 0x020100030E0D0C0F
  36. .Lk_sr:@ sr
  37. .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
  38. .quad 0x030E09040F0A0500, 0x0B06010C07020D08
  39. .quad 0x0F060D040B020900, 0x070E050C030A0108
  40. .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
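@
@ As an informal reading of .Lk_sr (a sketch, not part of the original
@ source): used as a vtbl.8/vpshufb control, entry k maps output byte i to
@ input byte (4*k + 1)*i mod 16, so entry 0 is the identity and entry 1 is
@ ShiftRows for a column-major state. A quick Python check of the bytes above:
@
@ for k in range(4):
@     print([(4 * k + 1) * i % 16 for i in range(16)])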
  41. @
  42. @ "Hot" constants
  43. @
  44. .Lk_inv:@ inv, inva
  45. .quad 0x0E05060F0D080180, 0x040703090A0B0C02
  46. .quad 0x01040A060F0B0780, 0x030D0E0C02050809
  47. .Lk_ipt:@ input transform (lo, hi)
  48. .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
  49. .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
  50. .Lk_sbo:@ sbou, sbot
  51. .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
  52. .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
  53. .Lk_sb1:@ sb1u, sb1t
  54. .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
  55. .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
  56. .Lk_sb2:@ sb2u, sb2t
  57. .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
  58. .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
  59. .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
  60. .align 2
  61. .size _vpaes_consts,.-_vpaes_consts
  62. .align 6
  63. @@
  64. @@ _vpaes_preheat
  65. @@
  66. @@ Fills q9-q15 as specified below.
  67. @@
  68. .type _vpaes_preheat,%function
  69. .align 4
  70. _vpaes_preheat:
  71. adr r10, .Lk_inv
  72. vmov.i8 q9, #0x0f @ .Lk_s0F
  73. vld1.64 {q10,q11}, [r10]! @ .Lk_inv
  74. add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo
  75. vld1.64 {q12,q13}, [r10]! @ .Lk_sb1
  76. vld1.64 {q14,q15}, [r10] @ .Lk_sb2
  77. bx lr
  78. @@
  79. @@ _vpaes_encrypt_core
  80. @@
  81. @@ AES-encrypt q0.
  82. @@
  83. @@ Inputs:
  84. @@ q0 = input
  85. @@ q9-q15 as in _vpaes_preheat
  86. @@ [r2] = scheduled keys
  87. @@
  88. @@ Output in q0
  89. @@ Clobbers q1-q5, r8-r11
  90. @@ Preserves q6-q8 so you get some local vectors
  91. @@
  92. @@
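@
@ The building block used throughout this routine is a pair of 16-byte table
@ lookups keyed by the low and high nibble of each byte (q9 holds the 0x0f
@ mask; each pair of vtbl.8 instructions stands in for one vpshufb). As a
@ rough Python sketch of what one paired lookup computes per byte (the helper
@ name is illustrative; the same idea appears as apply_table in the script
@ near .Lk_opt_then_skew below):
@
@ def nibble_lookup(lo_tbl, hi_tbl, x):
@     # lo_tbl and hi_tbl are 16-entry byte tables; x is one input byte.
@     return lo_tbl[x & 0x0f] ^ hi_tbl[x >> 4]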
  93. .type _vpaes_encrypt_core,%function
  94. .align 4
  95. _vpaes_encrypt_core:
  96. mov r9, r2
  97. ldr r8, [r2,#240] @ pull rounds
  98. adr r11, .Lk_ipt
  99. @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
  100. @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
  101. vld1.64 {q2, q3}, [r11]
  102. adr r11, .Lk_mc_forward+16
  103. vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
  104. vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
  105. vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
  106. vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
  107. vtbl.8 d3, {q2}, d3
  108. vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
  109. vtbl.8 d5, {q3}, d1
  110. veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
  111. veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
  112. @ .Lenc_entry ends with a bne instruction which is normally paired with
  113. @ subs in .Lenc_loop.
  114. tst r8, r8
  115. b .Lenc_entry
  116. .align 4
  117. .Lenc_loop:
  118. @ middle of middle round
  119. add r10, r11, #0x40
  120. vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  121. vtbl.8 d9, {q13}, d5
  122. vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
  123. vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  124. vtbl.8 d1, {q12}, d7
  125. veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  126. vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  127. vtbl.8 d11, {q15}, d5
  128. veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  129. vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  130. vtbl.8 d5, {q14}, d7
  131. vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
  132. vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  133. vtbl.8 d7, {q0}, d3
  134. veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  135. @ Write to q5 instead of q0, so the table and destination registers do
  136. @ not overlap.
  137. vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  138. vtbl.8 d11, {q0}, d9
  139. veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  140. vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  141. vtbl.8 d9, {q3}, d3
  142. @ Here we restore the original q0/q5 usage.
  143. veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  144. and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
  145. veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  146. subs r8, r8, #1 @ nr--
  147. .Lenc_entry:
  148. @ top of round
  149. vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
  150. vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
  151. vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  152. vtbl.8 d11, {q11}, d3
  153. veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  154. vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  155. vtbl.8 d7, {q10}, d1
  156. vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  157. vtbl.8 d9, {q10}, d3
  158. veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  159. veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  160. vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  161. vtbl.8 d5, {q10}, d7
  162. vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  163. vtbl.8 d7, {q10}, d9
  164. veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  165. veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  166. vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
  167. bne .Lenc_loop
  168. @ middle of last round
  169. add r10, r11, #0x80
  170. adr r11, .Lk_sbo
  171. @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
  172. @ overlap table and destination registers.
  173. vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
  174. vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  175. vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  176. vtbl.8 d9, {q1}, d5
  177. vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
  178. @ Write to q2 instead of q0 below, to avoid overlapping table and
  179. @ destination registers.
  180. vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  181. vtbl.8 d5, {q0}, d7
  182. veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  183. veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  184. @ Here we restore the original q0/q2 usage.
  185. vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
  186. vtbl.8 d1, {q2}, d3
  187. bx lr
  188. .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
  189. .globl vpaes_encrypt
  190. .hidden vpaes_encrypt
  191. .type vpaes_encrypt,%function
  192. .align 4
  193. vpaes_encrypt:
  194. @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
  195. @ alignment.
  196. stmdb sp!, {r7,r8,r9,r10,r11,lr}
  197. @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
  198. vstmdb sp!, {d8,d9,d10,d11}
  199. vld1.64 {q0}, [r0]
  200. bl _vpaes_preheat
  201. bl _vpaes_encrypt_core
  202. vst1.64 {q0}, [r1]
  203. vldmia sp!, {d8,d9,d10,d11}
  204. ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
  205. .size vpaes_encrypt,.-vpaes_encrypt
  206. @
  207. @ Decryption stuff
  208. @
  209. .type _vpaes_decrypt_consts,%object
  210. .align 4
  211. _vpaes_decrypt_consts:
  212. .Lk_dipt:@ decryption input transform
  213. .quad 0x0F505B040B545F00, 0x154A411E114E451A
  214. .quad 0x86E383E660056500, 0x12771772F491F194
  215. .Lk_dsbo:@ decryption sbox final output
  216. .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
  217. .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
  218. .Lk_dsb9:@ decryption sbox output *9*u, *9*t
  219. .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
  220. .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
  221. .Lk_dsbd:@ decryption sbox output *D*u, *D*t
  222. .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
  223. .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
  224. .Lk_dsbb:@ decryption sbox output *B*u, *B*t
  225. .quad 0xD022649296B44200, 0x602646F6B0F2D404
  226. .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
  227. .Lk_dsbe:@ decryption sbox output *E*u, *E*t
  228. .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
  229. .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
  230. .size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts
  231. @@
  232. @@ Decryption core
  233. @@
  234. @@ Same API as encryption core, except it clobbers q12-q15 rather than using
  235. @@ the values from _vpaes_preheat. q9-q11 must still be set from
  236. @@ _vpaes_preheat.
  237. @@
  238. .type _vpaes_decrypt_core,%function
  239. .align 4
  240. _vpaes_decrypt_core:
  241. mov r9, r2
  242. ldr r8, [r2,#240] @ pull rounds
  243. @ This function performs shuffles with various constants. The x86_64
  244. @ version loads them on-demand into %xmm0-%xmm5. This does not work well
  245. @ for ARMv7 because those registers are shuffle destinations. The ARMv8
  246. @ version preloads those constants into registers, but ARMv7 has half
  247. @ the registers to work with. Instead, we load them on-demand into
  248. @ q12-q15, registers normally used for preloaded constants. This is fine
  249. @ because decryption doesn't use those constants. The values are
  250. @ constant, so this does not interfere with potential 2x optimizations.
  251. adr r7, .Lk_dipt
  252. vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
  253. lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11
  254. eor r11, r11, #0x30 @ xor $0x30, %r11
  255. adr r10, .Lk_sr
  256. and r11, r11, #0x30 @ and $0x30, %r11
  257. add r11, r11, r10
  258. adr r10, .Lk_mc_forward+48
  259. vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key
  260. vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
  261. vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
  262. vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
  263. vtbl.8 d5, {q12}, d3
  264. vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5
  265. @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
  266. vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
  267. vtbl.8 d1, {q13}, d1
  268. veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2
  269. veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
  270. @ .Ldec_entry ends with a bne instruction which is normally paired with
  271. @ subs in .Ldec_loop.
  272. tst r8, r8
  273. b .Ldec_entry
  274. .align 4
  275. .Ldec_loop:
  276. @
  277. @ Inverse mix columns
  278. @
  279. @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
  280. @ the function.
  281. adr r10, .Lk_dsb9
  282. vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  283. @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  284. @ Load sbd* ahead of time.
  285. vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  286. @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  287. vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  288. vtbl.8 d9, {q12}, d5
  289. vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  290. vtbl.8 d3, {q13}, d7
  291. veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0
  292. veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  293. @ Load sbb* ahead of time.
  294. vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
  295. @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt
  296. vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  297. vtbl.8 d9, {q14}, d5
  298. @ Write to q1 instead of q0, so the table and destination registers do
  299. @ not overlap.
  300. vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  301. vtbl.8 d3, {q0}, d11
  302. @ Here we restore the original q0/q1 usage. This instruction is
  303. @ reordered from the ARMv8 version so we do not clobber the vtbl.8
  304. @ below.
  305. veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  306. vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  307. vtbl.8 d3, {q15}, d7
  308. @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  309. veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  310. @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  311. @ Load sbe* ahead of time.
  312. vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
  313. @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet
  314. vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  315. vtbl.8 d9, {q12}, d5
  316. @ Write to q1 instead of q0, so the table and destination registers do
  317. @ not overlap.
  318. vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  319. vtbl.8 d3, {q0}, d11
  320. @ Here we restore the original q0/q1 usage. This instruction is
  321. @ reordered from the ARMv8 version so we do not clobber the vtbl.8
  322. @ below.
  323. veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  324. vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  325. vtbl.8 d3, {q13}, d7
  326. veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  327. vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  328. vtbl.8 d9, {q14}, d5
  329. @ Write to q1 instead of q0, so the table and destination registers do
  330. @ not overlap.
  331. vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  332. vtbl.8 d3, {q0}, d11
  333. @ Here we restore the original q0/q1 usage. This instruction is
  334. @ reordered from the ARMv8 version so we do not clobber the vtbl.8
  335. @ below.
  336. veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  337. vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  338. vtbl.8 d3, {q15}, d7
  339. vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5
  340. veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  341. subs r8, r8, #1 @ sub $1,%rax # nr--
  342. .Ldec_entry:
  343. @ top of round
  344. vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
  345. vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
  346. vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  347. vtbl.8 d5, {q11}, d3
  348. veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  349. vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  350. vtbl.8 d7, {q10}, d1
  351. vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  352. vtbl.8 d9, {q10}, d3
  353. veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  354. veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  355. vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  356. vtbl.8 d5, {q10}, d7
  357. vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  358. vtbl.8 d7, {q10}, d9
  359. veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  360. veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  361. vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0
  362. bne .Ldec_loop
  363. @ middle of last round
  364. adr r10, .Lk_dsbo
  365. @ Write to q1 rather than q4 to avoid overlapping table and destination.
  366. vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  367. vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  368. vtbl.8 d9, {q1}, d5
  369. @ Write to q2 rather than q1 to avoid overlapping table and destination.
  370. vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  371. vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  372. vtbl.8 d3, {q2}, d7
  373. vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
  374. veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  375. @ Write to q1 rather than q0 so the table and destination registers
  376. @ below do not overlap.
  377. veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  378. vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0
  379. vtbl.8 d1, {q1}, d5
  380. bx lr
  381. .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
  382. .globl vpaes_decrypt
  383. .hidden vpaes_decrypt
  384. .type vpaes_decrypt,%function
  385. .align 4
  386. vpaes_decrypt:
  387. @ _vpaes_decrypt_core uses r7-r11.
  388. stmdb sp!, {r7,r8,r9,r10,r11,lr}
  389. @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
  390. vstmdb sp!, {d8,d9,d10,d11}
  391. vld1.64 {q0}, [r0]
  392. bl _vpaes_preheat
  393. bl _vpaes_decrypt_core
  394. vst1.64 {q0}, [r1]
  395. vldmia sp!, {d8,d9,d10,d11}
  396. ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
  397. .size vpaes_decrypt,.-vpaes_decrypt
  398. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  399. @@ @@
  400. @@ AES key schedule @@
  401. @@ @@
  402. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  403. @ This function diverges from both x86_64 and aarch64 in which constants are
  404. @ pinned. x86_64 has a common preheat function for all operations. aarch64
  405. @ separates them because it has enough registers to pin nearly all constants.
  406. @ armv7 does not have enough registers, and the explicit loads and stores it
  407. @ needs also make it awkward to reuse x86_64's register allocation directly.
  408. @
  409. @ We pin some constants for convenience and leave q14 and q15 free to load
  410. @ others on demand.
  411. @
  412. @ Key schedule constants
  413. @
  414. .type _vpaes_key_consts,%object
  415. .align 4
  416. _vpaes_key_consts:
  417. .Lk_dksd:@ decryption key schedule: invskew x*D
  418. .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
  419. .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
  420. .Lk_dksb:@ decryption key schedule: invskew x*B
  421. .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
  422. .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
  423. .Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
  424. .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
  425. .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
  426. .Lk_dks9:@ decryption key schedule: invskew x*9
  427. .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
  428. .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
  429. .Lk_rcon:@ rcon
  430. .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
  431. .Lk_opt:@ output transform
  432. .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
  433. .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
  434. .Lk_deskew:@ deskew tables: inverts the sbox's "skew"
  435. .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
  436. .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
  437. .size _vpaes_key_consts,.-_vpaes_key_consts
  438. .type _vpaes_key_preheat,%function
  439. .align 4
  440. _vpaes_key_preheat:
  441. adr r11, .Lk_rcon
  442. vmov.i8 q12, #0x5b @ .Lk_s63
  443. adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
  444. vmov.i8 q9, #0x0f @ .Lk_s0F
  445. vld1.64 {q10,q11}, [r10] @ .Lk_inv
  446. vld1.64 {q8}, [r11] @ .Lk_rcon
  447. bx lr
  448. .size _vpaes_key_preheat,.-_vpaes_key_preheat
  449. .type _vpaes_schedule_core,%function
  450. .align 4
  451. _vpaes_schedule_core:
  452. @ We only need to save lr, but ARM requires an 8-byte stack alignment,
  453. @ so save an extra register.
  454. stmdb sp!, {r3,lr}
  455. bl _vpaes_key_preheat @ load the tables
  456. adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
  457. vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
  458. @ input transform
  459. @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
  460. @ overlap table and destination.
  461. vmov q4, q0 @ vmovdqa %xmm0, %xmm3
  462. bl _vpaes_schedule_transform
  463. adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
  464. vmov q7, q0 @ vmovdqa %xmm0, %xmm7
  465. add r8, r8, r10
  466. tst r3, r3
  467. bne .Lschedule_am_decrypting
  468. @ encrypting, output zeroth round key after transform
  469. vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)
  470. b .Lschedule_go
  471. .Lschedule_am_decrypting:
  472. @ decrypting, output zeroth round key after shiftrows
  473. vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
  474. vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
  475. vtbl.8 d7, {q4}, d3
  476. vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx)
  477. eor r8, r8, #0x30 @ xor $0x30, %r8
  478. .Lschedule_go:
  479. cmp r1, #192 @ cmp $192, %esi
  480. bhi .Lschedule_256
  481. beq .Lschedule_192
  482. @ 128: fall through
  483. @@
  484. @@ .schedule_128
  485. @@
  486. @@ 128-bit specific part of key schedule.
  487. @@
  488. @@ This schedule is really simple, because all its parts
  489. @@ are accomplished by the subroutines.
  490. @@
  491. .Lschedule_128:
  492. mov r0, #10 @ mov $10, %esi
  493. .Loop_schedule_128:
  494. bl _vpaes_schedule_round
  495. subs r0, r0, #1 @ dec %esi
  496. beq .Lschedule_mangle_last
  497. bl _vpaes_schedule_mangle @ write output
  498. b .Loop_schedule_128
  499. @@
  500. @@ .aes_schedule_192
  501. @@
  502. @@ 192-bit specific part of key schedule.
  503. @@
  504. @@ The main body of this schedule is the same as the 128-bit
  505. @@ schedule, but with more smearing. The long, high side is
  506. @@ stored in q7 as before, and the short, low side is in
  507. @@ the high bits of q6.
  508. @@
  509. @@ This schedule is somewhat nastier, however, because each
  510. @@ round produces 192 bits of key material, or 1.5 round keys.
  511. @@ Therefore, on each cycle we do 2 rounds and produce 3 round
  512. @@ keys.
  513. @@
  514. .align 4
  515. .Lschedule_192:
  516. sub r0, r0, #8
  517. vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
  518. bl _vpaes_schedule_transform @ input transform
  519. vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part
  520. vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4
  521. @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
  522. mov r0, #4 @ mov $4, %esi
  523. .Loop_schedule_192:
  524. bl _vpaes_schedule_round
  525. vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0
  526. bl _vpaes_schedule_mangle @ save key n
  527. bl _vpaes_schedule_192_smear
  528. bl _vpaes_schedule_mangle @ save key n+1
  529. bl _vpaes_schedule_round
  530. subs r0, r0, #1 @ dec %esi
  531. beq .Lschedule_mangle_last
  532. bl _vpaes_schedule_mangle @ save key n+2
  533. bl _vpaes_schedule_192_smear
  534. b .Loop_schedule_192
  535. @@
  536. @@ .aes_schedule_256
  537. @@
  538. @@ 256-bit specific part of key schedule.
  539. @@
  540. @@ The structure here is very similar to the 128-bit
  541. @@ schedule, but with an additional "low side" in
  542. @@ q6. The low side's rounds are the same as the
  543. @@ high side's, except no rcon and no rotation.
  544. @@
  545. .align 4
  546. .Lschedule_256:
  547. vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
  548. bl _vpaes_schedule_transform @ input transform
  549. mov r0, #7 @ mov $7, %esi
  550. .Loop_schedule_256:
  551. bl _vpaes_schedule_mangle @ output low result
  552. vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
  553. @ high round
  554. bl _vpaes_schedule_round
  555. subs r0, r0, #1 @ dec %esi
  556. beq .Lschedule_mangle_last
  557. bl _vpaes_schedule_mangle
  558. @ low round. swap xmm7 and xmm6
  559. vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
  560. vmov.i8 q4, #0
  561. vmov q5, q7 @ vmovdqa %xmm7, %xmm5
  562. vmov q7, q6 @ vmovdqa %xmm6, %xmm7
  563. bl _vpaes_schedule_low_round
  564. vmov q7, q5 @ vmovdqa %xmm5, %xmm7
  565. b .Loop_schedule_256
  566. @@
  567. @@ .aes_schedule_mangle_last
  568. @@
  569. @@ Mangler for last round of key schedule
  570. @@ Mangles q0
  571. @@ when encrypting, outputs out(q0) ^ 63
  572. @@ when decrypting, outputs unskew(q0)
  573. @@
  574. @@ Always called right before return... jumps to cleanup and exits
  575. @@
  576. .align 4
  577. .Lschedule_mangle_last:
  578. @ schedule last round key from xmm0
  579. adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew
  580. tst r3, r3
  581. bne .Lschedule_mangle_last_dec
  582. @ encrypting
  583. vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
  584. adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform
  585. add r2, r2, #32 @ add $32, %rdx
  586. vmov q2, q0
  587. vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
  588. vtbl.8 d1, {q2}, d3
  589. .Lschedule_mangle_last_dec:
  590. sub r2, r2, #16 @ add $-16, %rdx
  591. veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0
  592. bl _vpaes_schedule_transform @ output transform
  593. vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key
  594. @ cleanup
  595. veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
  596. veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
  597. veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
  598. veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
  599. veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
  600. veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
  601. veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
  602. veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
  603. ldmia sp!, {r3,pc} @ return
  604. .size _vpaes_schedule_core,.-_vpaes_schedule_core
  605. @@
  606. @@ .aes_schedule_192_smear
  607. @@
  608. @@ Smear the short, low side in the 192-bit key schedule.
  609. @@
  610. @@ Inputs:
  611. @@ q7: high side, b a x y
  612. @@ q6: low side, d c 0 0
  613. @@
  614. @@ Outputs:
  615. @@ q6: b+c+d b+c 0 0
  616. @@ q0: b+c+d b+c b a
  617. @@
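@
@ As an informal sketch of the word-level effect (Python; words are listed
@ low to high, "+" in the comments above is XOR, and the helper name is
@ illustrative only):
@
@ def smear_192(q7, q6):
@     y, x, a, b = q7            # q7 = "b a x y", read high to low
@     _, _, c, d = q6            # q6 = "d c 0 0", read high to low
@     q6 = [0, 0, b ^ c, b ^ c ^ d]
@     q0 = [a, b, b ^ c, b ^ c ^ d]
@     return q6, q0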
  618. .type _vpaes_schedule_192_smear,%function
  619. .align 4
  620. _vpaes_schedule_192_smear:
  621. vmov.i8 q1, #0
  622. vdup.32 q0, d15[1]
  623. vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
  624. vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
  625. veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
  626. veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
  627. veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
  628. vmov q0, q6 @ vmovdqa %xmm6, %xmm0
  629. vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
  630. bx lr
  631. .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
  632. @@
  633. @@ .aes_schedule_round
  634. @@
  635. @@ Runs one main round of the key schedule on q0, q7
  636. @@
  637. @@ Specifically, runs subbytes on the high dword of q0
  638. @@ then rotates it by one byte and xors into the low dword of
  639. @@ q7.
  640. @@
  641. @@ Adds rcon from low byte of q8, then rotates q8 for
  642. @@ next rcon.
  643. @@
  644. @@ Smears the dwords of q7 by xoring the low into the
  645. @@ second low, result into third, result into highest.
  646. @@
  647. @@ Returns results in q7 = q0.
  648. @@ Clobbers q1-q4, r11.
  649. @@
  650. .type _vpaes_schedule_round,%function
  651. .align 4
  652. _vpaes_schedule_round:
  653. @ extract rcon from xmm8
  654. vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
  655. vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
  656. vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
  657. veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
  658. @ rotate
  659. vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
  660. vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0
  661. @ fall through...
  662. @ low round: same as high round, but no rotation and no rcon.
  663. _vpaes_schedule_low_round:
  664. @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
  665. @ We pin other values in _vpaes_key_preheat, so load them now.
  666. adr r11, .Lk_sb1
  667. vld1.64 {q14,q15}, [r11]
  668. @ smear xmm7
  669. vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
  670. veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
  671. vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4
  672. @ subbytes
  673. vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
  674. vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
  675. veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
  676. vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  677. vtbl.8 d5, {q11}, d3
  678. veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  679. vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  680. vtbl.8 d7, {q10}, d1
  681. veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  682. vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  683. vtbl.8 d9, {q10}, d3
  684. veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7
  685. vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
  686. vtbl.8 d7, {q10}, d7
  687. veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  688. vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
  689. vtbl.8 d5, {q10}, d9
  690. veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
  691. veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
  692. vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
  693. vtbl.8 d9, {q15}, d7
  694. vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
  695. vtbl.8 d3, {q14}, d5
  696. veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
  697. @ add in smeared stuff
  698. veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
  699. veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
  700. bx lr
  701. .size _vpaes_schedule_round,.-_vpaes_schedule_round
  702. @@
  703. @@ .aes_schedule_transform
  704. @@
  705. @@ Linear-transform q0 according to tables at [r11]
  706. @@
  707. @@ Requires that q9 = 0x0F0F... as in preheat
  708. @@ Output in q0
  709. @@ Clobbers q1, q2, q14, q15
  710. @@
  711. .type _vpaes_schedule_transform,%function
  712. .align 4
  713. _vpaes_schedule_transform:
  714. vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
  715. @ vmovdqa 16(%r11), %xmm1 # hi
  716. vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
  717. vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
  718. vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
  719. vtbl.8 d5, {q14}, d3
  720. vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
  721. vtbl.8 d1, {q15}, d1
  722. veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
  723. bx lr
  724. .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
  725. @@
  726. @@ .aes_schedule_mangle
  727. @@
  728. @@ Mangles q0 from (basis-transformed) standard version
  729. @@ to our version.
  730. @@
  731. @@ On encrypt,
  732. @@ xor with 0x63
  733. @@ multiply by circulant 0,1,1,1
  734. @@ apply shiftrows transform
  735. @@
  736. @@ On decrypt,
  737. @@ xor with 0x63
  738. @@ multiply by "inverse mixcolumns" circulant E,B,D,9
  739. @@ deskew
  740. @@ apply shiftrows transform
  741. @@
  742. @@
  743. @@ Writes out to [r2], and increments or decrements it
  744. @@ Keeps track of round number mod 4 in r8
  745. @@ Preserves q0
  746. @@ Clobbers q1-q5
  747. @@
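@
@ On the encrypt path, "multiply by circulant 0,1,1,1" works out to: each
@ byte of a column becomes the XOR of the other three bytes of that column
@ (of the round key after the 0x63 XOR described above), before the ShiftRows
@ permute is applied. A small Python sketch of that per-column step
@ (illustrative only):
@
@ def mangle_enc_column(col):
@     # col is one 4-byte column of the XORed round key.
@     total = col[0] ^ col[1] ^ col[2] ^ col[3]
@     return [total ^ b for b in col]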
  748. .type _vpaes_schedule_mangle,%function
  749. .align 4
  750. _vpaes_schedule_mangle:
  751. tst r3, r3
  752. vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
  753. adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
  754. vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
  755. bne .Lschedule_mangle_dec
  756. @ encrypting
  757. @ Write to q2 so we do not overlap table and destination below.
  758. veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4
  759. add r2, r2, #16 @ add $16, %rdx
  760. vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
  761. vtbl.8 d9, {q2}, d11
  762. vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
  763. vtbl.8 d3, {q4}, d11
  764. vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
  765. vtbl.8 d7, {q1}, d11
  766. veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
  767. vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
  768. veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
  769. b .Lschedule_mangle_both
  770. .align 4
  771. .Lschedule_mangle_dec:
  772. @ inverse mix columns
  773. adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11
  774. vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi
  775. vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo
  776. vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2
  777. @ vmovdqa 0x10(%r11), %xmm3
  778. vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
  779. vtbl.8 d5, {q14}, d9
  780. vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
  781. vtbl.8 d7, {q15}, d3
  782. @ Load .Lk_dksb ahead of time.
  783. vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2
  784. @ vmovdqa 0x30(%r11), %xmm3
  785. @ Write to q13 so we do not overlap table and destination.
  786. veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
  787. vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
  788. vtbl.8 d7, {q13}, d11
  789. vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
  790. vtbl.8 d5, {q14}, d9
  791. veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
  792. vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
  793. vtbl.8 d7, {q15}, d3
  794. @ Load .Lk_dkse ahead of time.
  795. vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2
  796. @ vmovdqa 0x50(%r11), %xmm3
  797. @ Write to q13 so we do not overlap table and destination.
  798. veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
  799. vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
  800. vtbl.8 d7, {q13}, d11
  801. vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
  802. vtbl.8 d5, {q14}, d9
  803. veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
  804. vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
  805. vtbl.8 d7, {q15}, d3
  806. @ Load .Lk_dks9 ahead of time.
  807. vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2
  808. @ vmovdqa 0x70(%r11), %xmm4
  809. @ Write to q13 so we do not overlap table and destination.
  810. veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
  811. vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
  812. vtbl.8 d5, {q14}, d9
  813. vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
  814. vtbl.8 d7, {q13}, d11
  815. vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4
  816. vtbl.8 d9, {q15}, d3
  817. vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
  818. veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
  819. veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3
  820. sub r2, r2, #16 @ add $-16, %rdx
  821. .Lschedule_mangle_both:
  822. @ Write to q2 so table and destination do not overlap.
  823. vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
  824. vtbl.8 d5, {q3}, d3
  825. add r8, r8, #64-16 @ add $-16, %r8
  826. and r8, r8, #~(1<<6) @ and $0x30, %r8
  827. vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
  828. bx lr
  829. .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
  830. .globl vpaes_set_encrypt_key
  831. .hidden vpaes_set_encrypt_key
  832. .type vpaes_set_encrypt_key,%function
  833. .align 4
  834. vpaes_set_encrypt_key:
  835. stmdb sp!, {r7,r8,r9,r10,r11, lr}
  836. vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
  837. lsr r9, r1, #5 @ shr $5,%eax
  838. add r9, r9, #5 @ $5,%eax
  839. str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
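@ A quick check of the formula in the comment above (illustrative Python, not
@ part of the build); as noted further down, vpaes stores one fewer round
@ than the usual 10/12/14:
@
@ for nbits in (128, 192, 256):
@     print(nbits, (nbits >> 5) + 5)    # -> 128 9, 192 11, 256 13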
  840. mov r3, #0 @ mov $0,%ecx
  841. mov r8, #0x30 @ mov $0x30,%r8d
  842. bl _vpaes_schedule_core
  843. eor r0, r0, r0
  844. vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
  845. ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
  846. .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
  847. .globl vpaes_set_decrypt_key
  848. .hidden vpaes_set_decrypt_key
  849. .type vpaes_set_decrypt_key,%function
  850. .align 4
  851. vpaes_set_decrypt_key:
  852. stmdb sp!, {r7,r8,r9,r10,r11, lr}
  853. vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
  854. lsr r9, r1, #5 @ shr $5,%eax
  855. add r9, r9, #5 @ $5,%eax
  856. str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  857. lsl r9, r9, #4 @ shl $4,%eax
  858. add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx
  859. add r2, r2, r9
  860. mov r3, #1 @ mov $1,%ecx
  861. lsr r8, r1, #1 @ shr $1,%r8d
  862. and r8, r8, #32 @ and $32,%r8d
  863. eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32
  864. bl _vpaes_schedule_core
  865. vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
  866. ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
  867. .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
  868. @ Additional constants for converting to bsaes.
  869. .type _vpaes_convert_consts,%object
  870. .align 4
  871. _vpaes_convert_consts:
  872. @ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
  873. @ transform in the AES S-box. 0x63 is incorporated into the low half of the
  874. @ table. This was computed with the following script:
  875. @
  876. @ def u64s_to_u128(x, y):
  877. @     return x | (y << 64)
  878. @ def u128_to_u64s(w):
  879. @     return w & ((1<<64)-1), w >> 64
  880. @ def get_byte(w, i):
  881. @     return (w >> (i*8)) & 0xff
  882. @ def apply_table(table, b):
  883. @     lo = b & 0xf
  884. @     hi = b >> 4
  885. @     return get_byte(table[0], lo) ^ get_byte(table[1], hi)
  886. @ def opt(b):
  887. @     table = [
  888. @         u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
  889. @         u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
  890. @     ]
  891. @     return apply_table(table, b)
  892. @ def rot_byte(b, n):
  893. @     return 0xff & ((b << n) | (b >> (8-n)))
  894. @ def skew(x):
  895. @     return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
  896. @             rot_byte(x, 4))
  897. @ table = [0, 0]
  898. @ for i in range(16):
  899. @     table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
  900. @     table[1] |= skew(opt(i<<4)) << (i*8)
  901. @ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
  902. @ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
  903. .Lk_opt_then_skew:
  904. .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
  905. .quad 0x1f30062936192f00, 0xb49bad829db284ab
  906. @ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
  907. @ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
  908. @ becomes 0x22334411 and then 0x11443322.
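@ An informal check of that description (Python; the helper name is
@ illustrative only). Read as vtbl.8/vpshufb indices, the bytes of the two
@ .quad values below should come out as:
@
@ def decrypt_transform_indices():
@     # rotl8 maps [b0,b1,b2,b3] -> [b3,b0,b1,b2]; the byte-swap then
@     # reverses that, giving [b2,b1,b0,b3] within each 32-bit word.
@     src = [2, 1, 0, 3]
@     return [4 * w + src[i] for w in range(4) for i in range(4)]
@
@ # -> [2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15]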
  909. .Lk_decrypt_transform:
  910. .quad 0x0704050603000102, 0x0f0c0d0e0b08090a
  911. .size _vpaes_convert_consts,.-_vpaes_convert_consts
  912. @ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
  913. .globl vpaes_encrypt_key_to_bsaes
  914. .hidden vpaes_encrypt_key_to_bsaes
  915. .type vpaes_encrypt_key_to_bsaes,%function
  916. .align 4
  917. vpaes_encrypt_key_to_bsaes:
  918. stmdb sp!, {r11, lr}
  919. @ See _vpaes_schedule_core for the key schedule logic. In particular,
  920. @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
  921. @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
  922. @ contain the transformations not in the bsaes representation. This
  923. @ function inverts those transforms.
  924. @
  925. @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
  926. @ representation, which does not match the other aes_nohw_*
  927. @ implementations. The ARM aes_nohw_* stores each 32-bit word
  928. @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
  929. @ cost of extra REV and VREV32 operations in little-endian ARM.
  930. vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
  931. adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
  932. add r3, r2, #0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
  933. vld1.64 {q12}, [r2]
  934. vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64
  935. adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
  936. vmov.i8 q11, #0x63 @ .Lk_s63 without .Lk_ipt applied
  937. @ vpaes stores one fewer round count than bsaes, but the number of keys
  938. @ is the same.
  939. ldr r2, [r1,#240]
  940. add r2, r2, #1
  941. str r2, [r0,#240]
  942. @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
  943. @ Invert this with .Lk_opt.
  944. vld1.64 {q0}, [r1]!
  945. bl _vpaes_schedule_transform
  946. vrev32.8 q0, q0
  947. vst1.64 {q0}, [r0]!
  948. @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
  949. @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
  950. @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
  951. .Loop_enc_key_to_bsaes:
  952. vld1.64 {q0}, [r1]!
  953. @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
  954. @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
  955. @ We use r3 rather than r8 to avoid a callee-saved register.
  956. vld1.64 {q1}, [r3]
  957. vtbl.8 d4, {q0}, d2
  958. vtbl.8 d5, {q0}, d3
  959. add r3, r3, #16
  960. and r3, r3, #~(1<<6)
  961. vmov q0, q2
  962. @ Handle the last key differently.
  963. subs r2, r2, #1
  964. beq .Loop_enc_key_to_bsaes_last
  965. @ Multiply by the circulant. This is its own inverse.
  966. vtbl.8 d2, {q0}, d24
  967. vtbl.8 d3, {q0}, d25
  968. vmov q0, q1
  969. vtbl.8 d4, {q1}, d24
  970. vtbl.8 d5, {q1}, d25
  971. veor q0, q0, q2
  972. vtbl.8 d2, {q2}, d24
  973. vtbl.8 d3, {q2}, d25
  974. veor q0, q0, q1
  975. @ XOR and finish.
  976. veor q0, q0, q10
  977. bl _vpaes_schedule_transform
  978. vrev32.8 q0, q0
  979. vst1.64 {q0}, [r0]!
  980. b .Loop_enc_key_to_bsaes
  981. .Loop_enc_key_to_bsaes_last:
  982. @ The final key does not have a basis transform (note
  983. @ .Lschedule_mangle_last inverts the original transform). It only XORs
  984. @ 0x63 and applies ShiftRows. The latter was already inverted in the
  985. @ loop. Note that, because we act on the original representation, we use
  986. @ q11, not q10.
  987. veor q0, q0, q11
  988. vrev32.8 q0, q0
  989. vst1.64 {q0}, [r0]
  990. @ Wipe registers which contained key material.
  991. veor q0, q0, q0
  992. veor q1, q1, q1
  993. veor q2, q2, q2
  994. ldmia sp!, {r11, pc} @ return
  995. .size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
  996. @ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
  997. .globl vpaes_decrypt_key_to_bsaes
  998. .hidden vpaes_decrypt_key_to_bsaes
  999. .type vpaes_decrypt_key_to_bsaes,%function
  1000. .align 4
  1001. vpaes_decrypt_key_to_bsaes:
  1002. stmdb sp!, {r11, lr}
  1003. @ See _vpaes_schedule_core for the key schedule logic. Note vpaes
  1004. @ computes the decryption key schedule in reverse. Additionally,
  1005. @ aes-x86_64.pl shares some transformations, so we must only partially
  1006. @ invert vpaes's transformations. In general, vpaes computes in a
  1007. @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
  1008. @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
  1009. @ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
  1010. @
  1011. @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
  1012. @ representation, which does not match the other aes_nohw_*
  1013. @ implementations. The ARM aes_nohw_* stores each 32-bit word
  1014. @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
  1015. @ cost of extra REV and VREV32 operations in little-endian ARM.
  1016. adr r2, .Lk_decrypt_transform
  1017. adr r3, .Lk_sr+0x30
  1018. adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform.
  1019. vld1.64 {q12}, [r2] @ Reuse q12 from encryption.
  1020. vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
  1021. @ vpaes stores one fewer round count than bsaes, but the number of keys
  1022. @ is the same.
  1023. ldr r2, [r1,#240]
  1024. add r2, r2, #1
  1025. str r2, [r0,#240]
  1026. @ Undo the basis change and reapply the S-box affine transform. See
  1027. @ .Lschedule_mangle_last.
  1028. vld1.64 {q0}, [r1]!
  1029. bl _vpaes_schedule_transform
  1030. vrev32.8 q0, q0
  1031. vst1.64 {q0}, [r0]!
  1032. @ See _vpaes_schedule_mangle for the transform on the middle keys. Note
  1033. @ it simultaneously inverts MixColumns and the S-box affine transform.
  1034. @ See .Lk_dksd through .Lk_dks9.
  1035. .Loop_dec_key_to_bsaes:
  1036. vld1.64 {q0}, [r1]!
  1037. @ Invert the ShiftRows step (see .Lschedule_mangle_both). Because the
  1038. @ decryption schedule was written in reverse, reading it forwards keeps r3
  1039. @ cycling in the same direction. We use r3 rather than r8 to avoid a callee-saved register.
  1040. vld1.64 {q1}, [r3]
  1041. vtbl.8 d4, {q0}, d2
  1042. vtbl.8 d5, {q0}, d3
  1043. add r3, r3, #64-16
  1044. and r3, r3, #~(1<<6)
  1045. vmov q0, q2
  1046. @ Handle the last key differently.
  1047. subs r2, r2, #1
  1048. beq .Loop_dec_key_to_bsaes_last
  1049. @ Undo the basis change and reapply the S-box affine transform.
  1050. bl _vpaes_schedule_transform
  1051. @ Rotate each word left by 8 bits (cycle the rows) and then byte-swap. We
  1052. @ combine the two operations in .Lk_decrypt_transform.
  1053. @
  1054. @ TODO(davidben): Where does the rotation come from?
  1055. vtbl.8 d2, {q0}, d24
  1056. vtbl.8 d3, {q0}, d25
  1057. vst1.64 {q1}, [r0]!
  1058. b .Loop_dec_key_to_bsaes
  1059. .Loop_dec_key_to_bsaes_last:
  1060. @ The final key only inverts ShiftRows (already done in the loop). See
  1061. @ .Lschedule_am_decrypting. Its basis is not transformed.
  1062. vrev32.8 q0, q0
  1063. vst1.64 {q0}, [r0]!
  1064. @ Wipe registers which contained key material.
  1065. veor q0, q0, q0
  1066. veor q1, q1, q1
  1067. veor q2, q2, q2
  1068. ldmia sp!, {r11, pc} @ return
  1069. .size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
  1070. .globl vpaes_ctr32_encrypt_blocks
  1071. .hidden vpaes_ctr32_encrypt_blocks
  1072. .type vpaes_ctr32_encrypt_blocks,%function
  1073. .align 4
  1074. vpaes_ctr32_encrypt_blocks:
  1075. mov ip, sp
  1076. stmdb sp!, {r7,r8,r9,r10,r11, lr}
  1077. @ This function uses q4-q7 (d8-d15), which are callee-saved.
  1078. vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
  1079. cmp r2, #0
  1080. @ r8 is passed on the stack.
  1081. ldr r8, [ip]
  1082. beq .Lctr32_done
  1083. @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
  1084. mov r9, r3
  1085. mov r3, r2
  1086. mov r2, r9
  1087. @ Load the IV and counter portion.
  1088. ldr r7, [r8, #12]
  1089. vld1.8 {q7}, [r8]
  1090. bl _vpaes_preheat
  1091. rev r7, r7 @ The counter is big-endian.
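@ As an informal sketch of the keystream blocks this loop produces (Python;
@ struct is from the standard library, the helper name is illustrative only):
@ block i encrypts the IV with its last 32-bit word replaced by a big-endian
@ counter incremented by i, wrapping modulo 2^32.
@
@ import struct
@ def counter_block(iv, i):
@     ctr = (struct.unpack(">I", iv[12:16])[0] + i) & 0xffffffff
@     return iv[:12] + struct.pack(">I", ctr)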
  1092. .Lctr32_loop:
  1093. vmov q0, q7
  1094. vld1.8 {q6}, [r0]! @ Load input ahead of time
  1095. bl _vpaes_encrypt_core
  1096. veor q0, q0, q6 @ XOR input and result
  1097. vst1.8 {q0}, [r1]!
  1098. subs r3, r3, #1
  1099. @ Update the counter.
  1100. add r7, r7, #1
  1101. rev r9, r7
  1102. vmov.32 d15[1], r9
  1103. bne .Lctr32_loop
  1104. .Lctr32_done:
  1105. vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
  1106. ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
  1107. .size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
  1108. #endif
  1109. #endif // !OPENSSL_NO_ASM
  1110. .section .note.GNU-stack,"",%progbits