// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.syntax unified
#if defined(__thumb2__)
.thumb
#else
.code 32
#endif
.text
.align 7 @ totally strategic alignment
_vpaes_consts:
Lk_mc_forward:@ mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
Lk_mc_backward:@ mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
Lk_sr:@ sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
@
@ "Hot" constants
@
Lk_inv:@ inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
Lk_ipt:@ input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
Lk_sbo:@ sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
Lk_sb1:@ sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
Lk_sb2:@ sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2
.align 6
@@
@@ _aes_preheat
@@
@@ Fills q9-q15 as specified below.
@@
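@@ (For reference, the layout established by the loads below is: q9 = 0x0F
@@ nibble mask, q10-q11 = Lk_inv, q12-q13 = Lk_sb1, q14-q15 = Lk_sb2.)
@@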
#ifdef __thumb2__
.thumb_func _vpaes_preheat
#endif
.align 4
_vpaes_preheat:
adr r10, Lk_inv
vmov.i8 q9, #0x0f @ Lk_s0F
vld1.64 {q10,q11}, [r10]! @ Lk_inv
add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo
vld1.64 {q12,q13}, [r10]! @ Lk_sb1
vld1.64 {q14,q15}, [r10] @ Lk_sb2
bx lr
@@
@@ _aes_encrypt_core
@@
@@ AES-encrypt q0.
@@
@@ Inputs:
@@ q0 = input
@@ q9-q15 as in _vpaes_preheat
@@ [r2] = scheduled keys
@@
@@ Output in q0
@@ Clobbers q1-q5, r8-r11
@@ Preserves q6-q8 so you get some local vectors
@@
@@
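@@ (Note on the translation: each 128-bit x86_64 vpshufb quoted in the
@@ comments is emulated here by a pair of vtbl.8 instructions, one per
@@ 64-bit half of the destination q register.)
@@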
#ifdef __thumb2__
.thumb_func _vpaes_encrypt_core
#endif
.align 4
_vpaes_encrypt_core:
mov r9, r2
ldr r8, [r2,#240] @ pull rounds
adr r11, Lk_ipt
@ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
@ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
vld1.64 {q2, q3}, [r11]
adr r11, Lk_mc_forward+16
vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
vtbl.8 d3, {q2}, d3
vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
vtbl.8 d5, {q3}, d1
veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
@ .Lenc_entry ends with a bnz instruction which is normally paired with
@ subs in .Lenc_loop.
tst r8, r8
b Lenc_entry
.align 4
Lenc_loop:
@ middle of middle round
add r10, r11, #0x40
vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
vtbl.8 d9, {q13}, d5
vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
vtbl.8 d1, {q12}, d7
veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
vtbl.8 d11, {q15}, d5
veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
vtbl.8 d5, {q14}, d7
vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
vtbl.8 d7, {q0}, d3
veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
@ Write to q5 instead of q0, so the table and destination registers do
@ not overlap.
vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
vtbl.8 d11, {q0}, d9
veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
vtbl.8 d9, {q3}, d3
@ Here we restore the original q0/q5 usage.
veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
subs r8, r8, #1 @ nr--
Lenc_entry:
@ top of round
vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
vtbl.8 d11, {q11}, d3
veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
vtbl.8 d7, {q10}, d1
vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
vtbl.8 d9, {q10}, d3
veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
vtbl.8 d5, {q10}, d7
vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
vtbl.8 d7, {q10}, d9
veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
bne Lenc_loop
@ middle of last round
add r10, r11, #0x80
adr r11, Lk_sbo
@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
@ overlap table and destination registers.
vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16
vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
vtbl.8 d9, {q1}, d5
vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
@ Write to q2 instead of q0 below, to avoid overlapping table and
@ destination registers.
vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
vtbl.8 d5, {q0}, d7
veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
@ Here we restore the original q0/q2 usage.
vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
vtbl.8 d1, {q2}, d3
bx lr
.globl _vpaes_encrypt
.private_extern _vpaes_encrypt
#ifdef __thumb2__
.thumb_func _vpaes_encrypt
#endif
.align 4
_vpaes_encrypt:
@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
@ alignment.
stmdb sp!, {r7,r8,r9,r10,r11,lr}
@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
vstmdb sp!, {d8,d9,d10,d11}
vld1.64 {q0}, [r0]
bl _vpaes_preheat
bl _vpaes_encrypt_core
vst1.64 {q0}, [r1]
vldmia sp!, {d8,d9,d10,d11}
ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
@
@ Decryption stuff
@
.align 4
_vpaes_decrypt_consts:
Lk_dipt:@ decryption input transform
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
Lk_dsbo:@ decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
Lk_dsb9:@ decryption sbox output *9*u, *9*t
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
Lk_dsbd:@ decryption sbox output *D*u, *D*t
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
Lk_dsbb:@ decryption sbox output *B*u, *B*t
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
Lk_dsbe:@ decryption sbox output *E*u, *E*t
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
@@
@@ Decryption core
@@
@@ Same API as encryption core, except it clobbers q12-q15 rather than using
@@ the values from _vpaes_preheat. q9-q11 must still be set from
@@ _vpaes_preheat.
@@
#ifdef __thumb2__
.thumb_func _vpaes_decrypt_core
#endif
.align 4
_vpaes_decrypt_core:
mov r9, r2
ldr r8, [r2,#240] @ pull rounds
@ This function performs shuffles with various constants. The x86_64
@ version loads them on-demand into %xmm0-%xmm5. This does not work well
@ for ARMv7 because those registers are shuffle destinations. The ARMv8
@ version preloads those constants into registers, but ARMv7 has half
@ the registers to work with. Instead, we load them on-demand into
@ q12-q15, registers normally used for preloaded constants. This is fine
@ because decryption doesn't use those constants. The values are
@ constant, so this does not interfere with potential 2x optimizations.
adr r7, Lk_dipt
vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo
lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11
eor r11, r11, #0x30 @ xor $0x30, %r11
adr r10, Lk_sr
and r11, r11, #0x30 @ and $0x30, %r11
add r11, r11, r10
adr r10, Lk_mc_forward+48
vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key
vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
vtbl.8 d5, {q12}, d3
vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5
@ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
vtbl.8 d1, {q13}, d1
veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2
veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
@ .Ldec_entry ends with a bnz instruction which is normally paired with
@ subs in .Ldec_loop.
tst r8, r8
b Ldec_entry
.align 4
Ldec_loop:
@
@ Inverse mix columns
@
@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
@ the function.
adr r10, Lk_dsb9
vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
@ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
@ Load sbd* ahead of time.
vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
@ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
vtbl.8 d9, {q12}, d5
vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
vtbl.8 d3, {q13}, d7
veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0
veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
@ Load sbb* ahead of time.
vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
@ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt
vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
vtbl.8 d9, {q14}, d5
@ Write to q1 instead of q0, so the table and destination registers do
@ not overlap.
vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
vtbl.8 d3, {q0}, d11
@ Here we restore the original q0/q1 usage. This instruction is
@ reordered from the ARMv8 version so we do not clobber the vtbl.8
@ below.
veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
vtbl.8 d3, {q15}, d7
@ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
@ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
@ Load sbe* ahead of time.
vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
@ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet
vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
vtbl.8 d9, {q12}, d5
@ Write to q1 instead of q0, so the table and destination registers do
@ not overlap.
vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
vtbl.8 d3, {q0}, d11
@ Here we restore the original q0/q1 usage. This instruction is
@ reordered from the ARMv8 version so we do not clobber the vtbl.8
@ below.
veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
vtbl.8 d3, {q13}, d7
veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
vtbl.8 d9, {q14}, d5
@ Write to q1 instead of q0, so the table and destination registers do
@ not overlap.
vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
vtbl.8 d3, {q0}, d11
@ Here we restore the original q0/q1 usage. This instruction is
@ reordered from the ARMv8 version so we do not clobber the vtbl.8
@ below.
veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
vtbl.8 d3, {q15}, d7
vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5
veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
subs r8, r8, #1 @ sub $1,%rax # nr--
Ldec_entry:
@ top of round
vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
vtbl.8 d5, {q11}, d3
veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
vtbl.8 d7, {q10}, d1
vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
vtbl.8 d9, {q10}, d3
veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
vtbl.8 d5, {q10}, d7
vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
vtbl.8 d7, {q10}, d9
veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0
bne Ldec_loop
@ middle of last round
adr r10, Lk_dsbo
@ Write to q1 rather than q4 to avoid overlapping table and destination.
vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
vtbl.8 d9, {q1}, d5
@ Write to q2 rather than q1 to avoid overlapping table and destination.
vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
vtbl.8 d3, {q2}, d7
vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
@ Write to q1 rather than q0 so the table and destination registers
@ below do not overlap.
veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0
vtbl.8 d1, {q1}, d5
bx lr
.globl _vpaes_decrypt
.private_extern _vpaes_decrypt
#ifdef __thumb2__
.thumb_func _vpaes_decrypt
#endif
.align 4
_vpaes_decrypt:
@ _vpaes_decrypt_core uses r7-r11.
stmdb sp!, {r7,r8,r9,r10,r11,lr}
@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
vstmdb sp!, {d8,d9,d10,d11}
vld1.64 {q0}, [r0]
bl _vpaes_preheat
bl _vpaes_decrypt_core
vst1.64 {q0}, [r1]
vldmia sp!, {d8,d9,d10,d11}
ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@                                                    @@
@@                  AES key schedule                  @@
@@                                                    @@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ This function diverges from both x86_64 and aarch64 in which constants are
@ pinned. x86_64 has a common preheat function for all operations. aarch64
@ separates them because it has enough registers to pin nearly all constants.
@ armv7 does not have enough registers, but needing explicit loads and stores
@ also complicates using x86_64's register allocation directly.
@
@ We pin some constants for convenience and leave q14 and q15 free to load
@ others on demand.
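@ (For reference, the pins set up by _vpaes_key_preheat below are: q8 =
@ Lk_rcon, q9 = 0x0F nibble mask, q10-q11 = Lk_inv, q12 = Lk_s63.)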
@
@ Key schedule constants
@
.align 4
_vpaes_key_consts:
Lk_dksd:@ decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
Lk_dksb:@ decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
Lk_dks9:@ decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
Lk_rcon:@ rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
Lk_opt:@ output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
Lk_deskew:@ deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
#ifdef __thumb2__
.thumb_func _vpaes_key_preheat
#endif
.align 4
_vpaes_key_preheat:
adr r11, Lk_rcon
vmov.i8 q12, #0x5b @ Lk_s63
adr r10, Lk_inv @ Must be aligned to 8 mod 16.
vmov.i8 q9, #0x0f @ Lk_s0F
vld1.64 {q10,q11}, [r10] @ Lk_inv
vld1.64 {q8}, [r11] @ Lk_rcon
bx lr
#ifdef __thumb2__
.thumb_func _vpaes_schedule_core
#endif
.align 4
_vpaes_schedule_core:
@ We only need to save lr, but ARM requires an 8-byte stack alignment,
@ so save an extra register.
stmdb sp!, {r3,lr}
bl _vpaes_key_preheat @ load the tables
adr r11, Lk_ipt @ Must be aligned to 8 mod 16.
vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
@ input transform
@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
@ overlap table and destination.
vmov q4, q0 @ vmovdqa %xmm0, %xmm3
bl _vpaes_schedule_transform
adr r10, Lk_sr @ Must be aligned to 8 mod 16.
vmov q7, q0 @ vmovdqa %xmm0, %xmm7
add r8, r8, r10
tst r3, r3
bne Lschedule_am_decrypting
@ encrypting, output zeroth round key after transform
vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)
b Lschedule_go
Lschedule_am_decrypting:
@ decrypting, output zeroth round key after shiftrows
vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
vtbl.8 d7, {q4}, d3
vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx)
eor r8, r8, #0x30 @ xor $0x30, %r8
Lschedule_go:
cmp r1, #192 @ cmp $192, %esi
bhi Lschedule_256
beq Lschedule_192
@ 128: fall through
@@
@@ .schedule_128
@@
@@ 128-bit specific part of key schedule.
@@
@@ This schedule is really simple, because all its parts
@@ are accomplished by the subroutines.
@@
Lschedule_128:
mov r0, #10 @ mov $10, %esi
Loop_schedule_128:
bl _vpaes_schedule_round
subs r0, r0, #1 @ dec %esi
beq Lschedule_mangle_last
bl _vpaes_schedule_mangle @ write output
b Loop_schedule_128
@@
@@ .aes_schedule_192
@@
@@ 192-bit specific part of key schedule.
@@
@@ The main body of this schedule is the same as the 128-bit
@@ schedule, but with more smearing. The long, high side is
@@ stored in q7 as before, and the short, low side is in
@@ the high bits of q6.
@@
@@ This schedule is somewhat nastier, however, because each
@@ round produces 192 bits of key material, or 1.5 round keys.
@@ Therefore, on each cycle we do 2 rounds and produce 3 round
@@ keys.
@@
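@@ (Concretely: the loop below runs 4 times and each pass writes 3 round
@@ keys, the last pass writing its final key via .Lschedule_mangle_last;
@@ with the zeroth key written earlier, that is the 13 round keys of
@@ AES-192.)
@@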
.align 4
Lschedule_192:
sub r0, r0, #8
vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
bl _vpaes_schedule_transform @ input transform
vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part
vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4
@ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
mov r0, #4 @ mov $4, %esi
Loop_schedule_192:
bl _vpaes_schedule_round
vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0
bl _vpaes_schedule_mangle @ save key n
bl _vpaes_schedule_192_smear
bl _vpaes_schedule_mangle @ save key n+1
bl _vpaes_schedule_round
subs r0, r0, #1 @ dec %esi
beq Lschedule_mangle_last
bl _vpaes_schedule_mangle @ save key n+2
bl _vpaes_schedule_192_smear
b Loop_schedule_192
@@
@@ .aes_schedule_256
@@
@@ 256-bit specific part of key schedule.
@@
@@ The structure here is very similar to the 128-bit
@@ schedule, but with an additional "low side" in
@@ q6. The low side's rounds are the same as the
@@ high side's, except no rcon and no rotation.
@@
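@@ (Concretely: the loop below runs 7 times, doing a high round each pass
@@ and a low round on all but the last; together with the zeroth key
@@ written earlier and the final key from .Lschedule_mangle_last, that is
@@ the 15 round keys of AES-256.)
@@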
.align 4
Lschedule_256:
vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform @ input transform
mov r0, #7 @ mov $7, %esi
Loop_schedule_256:
bl _vpaes_schedule_mangle @ output low result
vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
@ high round
bl _vpaes_schedule_round
subs r0, r0, #1 @ dec %esi
beq Lschedule_mangle_last
bl _vpaes_schedule_mangle
@ low round. swap xmm7 and xmm6
vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
vmov.i8 q4, #0
vmov q5, q7 @ vmovdqa %xmm7, %xmm5
vmov q7, q6 @ vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
vmov q7, q5 @ vmovdqa %xmm5, %xmm7
b Loop_schedule_256
@@
@@ .aes_schedule_mangle_last
@@
@@ Mangler for last round of key schedule
@@ Mangles q0
@@ when encrypting, outputs out(q0) ^ 63
@@ when decrypting, outputs unskew(q0)
@@
@@ Always called right before return... jumps to cleanup and exits
@@
.align 4
Lschedule_mangle_last:
@ schedule last round key from xmm0
adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew
tst r3, r3
bne Lschedule_mangle_last_dec
@ encrypting
vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform
add r2, r2, #32 @ add $32, %rdx
vmov q2, q0
vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
vtbl.8 d1, {q2}, d3
Lschedule_mangle_last_dec:
sub r2, r2, #16 @ add $-16, %rdx
veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform @ output transform
vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key
@ cleanup
veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
ldmia sp!, {r3,pc} @ return
@@
@@ .aes_schedule_192_smear
@@
@@ Smear the short, low side in the 192-bit key schedule.
@@
@@ Inputs:
@@ q7: high side, b a x y
@@ q6: low side, d c 0 0
@@
@@ Outputs:
@@ q6: b+c+d b+c 0 0
@@ q0: b+c+d b+c b a
@@
#ifdef __thumb2__
.thumb_func _vpaes_schedule_192_smear
#endif
.align 4
_vpaes_schedule_192_smear:
vmov.i8 q1, #0
vdup.32 q0, d15[1]
vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
vmov q0, q6 @ vmovdqa %xmm6, %xmm0
vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
bx lr
@@
@@ .aes_schedule_round
@@
@@ Runs one main round of the key schedule on q0, q7
@@
@@ Specifically, runs subbytes on the high dword of q0
@@ then rotates it by one byte and xors into the low dword of
@@ q7.
@@
@@ Adds rcon from low byte of q8, then rotates q8 for
@@ next rcon.
@@
@@ Smears the dwords of q7 by xoring the low into the
@@ second low, result into third, result into highest.
@@
@@ Returns results in q7 = q0.
@@ Clobbers q1-q4, r11.
@@
#ifdef __thumb2__
.thumb_func _vpaes_schedule_round
#endif
.align 4
_vpaes_schedule_round:
@ extract rcon from xmm8
vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
@ rotate
vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0
@ fall through...
@ low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
@ We pin other values in _vpaes_key_preheat, so load them now.
adr r11, Lk_sb1
vld1.64 {q14,q15}, [r11]
@ smear xmm7
vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4
@ subbytes
vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
vtbl.8 d5, {q11}, d3
veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
vtbl.8 d7, {q10}, d1
veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
vtbl.8 d9, {q10}, d3
veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7
vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
vtbl.8 d7, {q10}, d7
veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
vtbl.8 d5, {q10}, d9
veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
vtbl.8 d9, {q15}, d7
vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
vtbl.8 d3, {q14}, d5
veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
@ add in smeared stuff
veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
bx lr
@@
@@ .aes_schedule_transform
@@
@@ Linear-transform q0 according to tables at [r11]
@@
@@ Requires that q9 = 0x0F0F... as in preheat
@@ Output in q0
@@ Clobbers q1, q2, q14, q15
@@
#ifdef __thumb2__
.thumb_func _vpaes_schedule_transform
#endif
.align 4
_vpaes_schedule_transform:
vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
@ vmovdqa 16(%r11), %xmm1 # hi
vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
vtbl.8 d5, {q14}, d3
vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
vtbl.8 d1, {q15}, d1
veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
bx lr
@@
@@ .aes_schedule_mangle
@@
@@ Mangles q0 from (basis-transformed) standard version
@@ to our version.
@@
@@ On encrypt,
@@ xor with 0x63
@@ multiply by circulant 0,1,1,1
@@ apply shiftrows transform
@@
@@ On decrypt,
@@ xor with 0x63
@@ multiply by "inverse mixcolumns" circulant E,B,D,9
@@ deskew
@@ apply shiftrows transform
@@
@@
@@ Writes out to [r2], and increments or decrements it
@@ Keeps track of round number mod 4 in r8
@@ Preserves q0
@@ Clobbers q1-q5
@@
#ifdef __thumb2__
.thumb_func _vpaes_schedule_mangle
#endif
.align 4
_vpaes_schedule_mangle:
tst r3, r3
vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16.
vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5
bne Lschedule_mangle_dec
@ encrypting
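@ (The circulant multiply mentioned above is realized below as three
@ successive Lk_mc_forward byte rotations of the 0x63-XORed key, XORed
@ together.)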
@ Write to q2 so we do not overlap table and destination below.
veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4
add r2, r2, #16 @ add $16, %rdx
vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
vtbl.8 d9, {q2}, d11
vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
vtbl.8 d3, {q4}, d11
vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
vtbl.8 d7, {q1}, d11
veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
b Lschedule_mangle_both
.align 4
Lschedule_mangle_dec:
@ inverse mix columns
adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11
vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi
vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo
vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2
@ vmovdqa 0x10(%r11), %xmm3
vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
vtbl.8 d5, {q14}, d9
vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
vtbl.8 d7, {q15}, d3
@ Load .Lk_dksb ahead of time.
vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2
@ vmovdqa 0x30(%r11), %xmm3
@ Write to q13 so we do not overlap table and destination.
veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
vtbl.8 d7, {q13}, d11
vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
vtbl.8 d5, {q14}, d9
veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
vtbl.8 d7, {q15}, d3
@ Load .Lk_dkse ahead of time.
vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2
@ vmovdqa 0x50(%r11), %xmm3
@ Write to q13 so we do not overlap table and destination.
veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
vtbl.8 d7, {q13}, d11
vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
vtbl.8 d5, {q14}, d9
veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
vtbl.8 d7, {q15}, d3
@ Load .Lk_dks9 ahead of time.
vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2
@ vmovdqa 0x70(%r11), %xmm4
@ Write to q13 so we do not overlap table and destination.
veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
vtbl.8 d5, {q14}, d9
vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
vtbl.8 d7, {q13}, d11
vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4
vtbl.8 d9, {q15}, d3
vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3
sub r2, r2, #16 @ add $-16, %rdx
Lschedule_mangle_both:
@ Write to q2 so table and destination do not overlap.
vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
vtbl.8 d5, {q3}, d3
add r8, r8, #64-16 @ add $-16, %r8
and r8, r8, #~(1<<6) @ and $0x30, %r8
vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
bx lr
.globl _vpaes_set_encrypt_key
.private_extern _vpaes_set_encrypt_key
#ifdef __thumb2__
.thumb_func _vpaes_set_encrypt_key
#endif
.align 4
_vpaes_set_encrypt_key:
stmdb sp!, {r7,r8,r9,r10,r11, lr}
vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
lsr r9, r1, #5 @ shr $5,%eax
add r9, r9, #5 @ $5,%eax
str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
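@ (So rounds is 9, 11, or 13 for 128-, 192-, or 256-bit keys; as noted in
@ the bsaes conversion functions below, this is one fewer than the usual
@ AES round count.)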
mov r3, #0 @ mov $0,%ecx
mov r8, #0x30 @ mov $0x30,%r8d
bl _vpaes_schedule_core
eor r0, r0, r0
vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
.globl _vpaes_set_decrypt_key
.private_extern _vpaes_set_decrypt_key
#ifdef __thumb2__
.thumb_func _vpaes_set_decrypt_key
#endif
.align 4
_vpaes_set_decrypt_key:
stmdb sp!, {r7,r8,r9,r10,r11, lr}
vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
lsr r9, r1, #5 @ shr $5,%eax
add r9, r9, #5 @ $5,%eax
str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
lsl r9, r9, #4 @ shl $4,%eax
add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx
add r2, r2, r9
mov r3, #1 @ mov $1,%ecx
lsr r8, r1, #1 @ shr $1,%r8d
and r8, r8, #32 @ and $32,%r8d
eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32
bl _vpaes_schedule_core
vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
@ Additional constants for converting to bsaes.
.align 4
_vpaes_convert_consts:
@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
@ transform in the AES S-box. 0x63 is incorporated into the low half of the
@ table. This was computed with the following script:
@
@   def u64s_to_u128(x, y):
@       return x | (y << 64)
@   def u128_to_u64s(w):
@       return w & ((1<<64)-1), w >> 64
@   def get_byte(w, i):
@       return (w >> (i*8)) & 0xff
@   def apply_table(table, b):
@       lo = b & 0xf
@       hi = b >> 4
@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
@   def opt(b):
@       table = [
@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
@       ]
@       return apply_table(table, b)
@   def rot_byte(b, n):
@       return 0xff & ((b << n) | (b >> (8-n)))
@   def skew(x):
@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
@               rot_byte(x, 4))
@   table = [0, 0]
@   for i in range(16):
@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
@       table[1] |= skew(opt(i<<4)) << (i*8)
@   print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
@   print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
Lk_opt_then_skew:
.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
.quad 0x1f30062936192f00, 0xb49bad829db284ab
@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
@ becomes 0x22334411 and then 0x11443322.
Lk_decrypt_transform:
.quad 0x0704050603000102, 0x0f0c0d0e0b08090a
@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
.globl _vpaes_encrypt_key_to_bsaes
.private_extern _vpaes_encrypt_key_to_bsaes
#ifdef __thumb2__
.thumb_func _vpaes_encrypt_key_to_bsaes
#endif
.align 4
_vpaes_encrypt_key_to_bsaes:
stmdb sp!, {r11, lr}
@ See _vpaes_schedule_core for the key schedule logic. In particular,
@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
@ contain the transformations not in the bsaes representation. This
@ function inverts those transforms.
@
@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
@ representation, which does not match the other aes_nohw_*
@ implementations. The ARM aes_nohw_* stores each 32-bit word
@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
@ cost of extra REV and VREV32 operations in little-endian ARM.
vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16.
add r3, r2, #0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
vld1.64 {q12}, [r2]
vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64
adr r11, Lk_opt @ Must be aligned to 8 mod 16.
vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied
@ vpaes stores one fewer round count than bsaes, but the number of keys
@ is the same.
ldr r2, [r1,#240]
add r2, r2, #1
str r2, [r0,#240]
@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
@ Invert this with .Lk_opt.
vld1.64 {q0}, [r1]!
bl _vpaes_schedule_transform
vrev32.8 q0, q0
vst1.64 {q0}, [r0]!
@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
Loop_enc_key_to_bsaes:
vld1.64 {q0}, [r1]!
@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
@ We use r3 rather than r8 to avoid a callee-saved register.
vld1.64 {q1}, [r3]
vtbl.8 d4, {q0}, d2
vtbl.8 d5, {q0}, d3
add r3, r3, #16
and r3, r3, #~(1<<6)
vmov q0, q2
@ Handle the last key differently.
subs r2, r2, #1
beq Loop_enc_key_to_bsaes_last
@ Multiply by the circulant. This is its own inverse.
vtbl.8 d2, {q0}, d24
vtbl.8 d3, {q0}, d25
vmov q0, q1
vtbl.8 d4, {q1}, d24
vtbl.8 d5, {q1}, d25
veor q0, q0, q2
vtbl.8 d2, {q2}, d24
vtbl.8 d3, {q2}, d25
veor q0, q0, q1
@ XOR and finish.
veor q0, q0, q10
bl _vpaes_schedule_transform
vrev32.8 q0, q0
vst1.64 {q0}, [r0]!
b Loop_enc_key_to_bsaes
Loop_enc_key_to_bsaes_last:
@ The final key does not have a basis transform (note
@ .Lschedule_mangle_last inverts the original transform). It only XORs
@ 0x63 and applies ShiftRows. The latter was already inverted in the
@ loop. Note that, because we act on the original representation, we use
@ q11, not q10.
veor q0, q0, q11
vrev32.8 q0, q0
vst1.64 {q0}, [r0]
@ Wipe registers which contained key material.
veor q0, q0, q0
veor q1, q1, q1
veor q2, q2, q2
ldmia sp!, {r11, pc} @ return
@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
.globl _vpaes_decrypt_key_to_bsaes
.private_extern _vpaes_decrypt_key_to_bsaes
#ifdef __thumb2__
.thumb_func _vpaes_decrypt_key_to_bsaes
#endif
.align 4
_vpaes_decrypt_key_to_bsaes:
stmdb sp!, {r11, lr}
@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
@ computes the decryption key schedule in reverse. Additionally,
@ aes-x86_64.pl shares some transformations, so we must only partially
@ invert vpaes's transformations. In general, vpaes computes in a
@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
@
@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
@ representation, which does not match the other aes_nohw_*
@ implementations. The ARM aes_nohw_* stores each 32-bit word
@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
@ cost of extra REV and VREV32 operations in little-endian ARM.
adr r2, Lk_decrypt_transform
adr r3, Lk_sr+0x30
adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform.
vld1.64 {q12}, [r2] @ Reuse q12 from encryption.
vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
@ vpaes stores one fewer round count than bsaes, but the number of keys
@ is the same.
ldr r2, [r1,#240]
add r2, r2, #1
str r2, [r0,#240]
@ Undo the basis change and reapply the S-box affine transform. See
@ .Lschedule_mangle_last.
vld1.64 {q0}, [r1]!
bl _vpaes_schedule_transform
vrev32.8 q0, q0
vst1.64 {q0}, [r0]!
@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
@ it simultaneously inverts MixColumns and the S-box affine transform.
@ See .Lk_dksd through .Lk_dks9.
Loop_dec_key_to_bsaes:
vld1.64 {q0}, [r1]!
@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note that
@ reading the keys forwards, while the decryption schedule was written
@ in reverse, cancels out, so r3 cycles in the same direction as the
@ schedule did (unlike the encryption conversion above). We use r3
@ rather than r8 to avoid a callee-saved register.
vld1.64 {q1}, [r3]
vtbl.8 d4, {q0}, d2
vtbl.8 d5, {q0}, d3
add r3, r3, #64-16
and r3, r3, #~(1<<6)
vmov q0, q2
@ Handle the last key differently.
subs r2, r2, #1
beq Loop_dec_key_to_bsaes_last
@ Undo the basis change and reapply the S-box affine transform.
bl _vpaes_schedule_transform
@ Rotate each word by 8 bits (cycle the rows) and then byte-swap. We
@ combine the two operations in .Lk_decrypt_transform.
@
@ TODO(davidben): Where does the rotation come from?
vtbl.8 d2, {q0}, d24
vtbl.8 d3, {q0}, d25
vst1.64 {q1}, [r0]!
b Loop_dec_key_to_bsaes
Loop_dec_key_to_bsaes_last:
@ The final key only inverts ShiftRows (already done in the loop). See
@ .Lschedule_am_decrypting. Its basis is not transformed.
vrev32.8 q0, q0
vst1.64 {q0}, [r0]!
@ Wipe registers which contained key material.
veor q0, q0, q0
veor q1, q1, q1
veor q2, q2, q2
ldmia sp!, {r11, pc} @ return
.globl _vpaes_ctr32_encrypt_blocks
.private_extern _vpaes_ctr32_encrypt_blocks
#ifdef __thumb2__
.thumb_func _vpaes_ctr32_encrypt_blocks
#endif
.align 4
_vpaes_ctr32_encrypt_blocks:
mov ip, sp
stmdb sp!, {r7,r8,r9,r10,r11, lr}
@ This function uses q4-q7 (d8-d15), which are callee-saved.
vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
cmp r2, #0
@ r8 is passed on the stack.
ldr r8, [ip]
beq Lctr32_done
@ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
mov r9, r3
mov r3, r2
mov r2, r9
@ Load the IV and counter portion.
ldr r7, [r8, #12]
vld1.8 {q7}, [r8]
bl _vpaes_preheat
rev r7, r7 @ The counter is big-endian.
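@ (Per the ctr32 naming, only the final 32-bit word of the counter block
@ is incremented; see the vmov.32 into d15[1] below. Carries do not
@ propagate into the rest of the IV.)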
Lctr32_loop:
vmov q0, q7
vld1.8 {q6}, [r0]! @ Load input ahead of time
bl _vpaes_encrypt_core
veor q0, q0, q6 @ XOR input and result
vst1.8 {q0}, [r1]!
subs r3, r3, #1
@ Update the counter.
add r7, r7, #1
rev r9, r7
vmov.32 d15[1], r9
bne Lctr32_loop
Lctr32_done:
vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
#endif // !OPENSSL_NO_ASM