  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(BORINGSSL_PREFIX)
  11. #include <boringssl_prefix_symbols_asm.h>
  12. #endif
  13. #include <openssl/arm_arch.h>
  14. .section __TEXT,__const
  15. .align 7 // totally strategic alignment
  16. _vpaes_consts:
  17. Lk_mc_forward: // mc_forward
  18. .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
  19. .quad 0x080B0A0904070605, 0x000302010C0F0E0D
  20. .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
  21. .quad 0x000302010C0F0E0D, 0x080B0A0904070605
  22. Lk_mc_backward: // mc_backward
  23. .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
  24. .quad 0x020100030E0D0C0F, 0x0A09080B06050407
  25. .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
  26. .quad 0x0A09080B06050407, 0x020100030E0D0C0F
  27. Lk_sr: // sr
  28. .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
  29. .quad 0x030E09040F0A0500, 0x0B06010C07020D08
  30. .quad 0x0F060D040B020900, 0x070E050C030A0108
  31. .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
  32. //
  33. // "Hot" constants
  34. //
  35. Lk_inv: // inv, inva
  36. .quad 0x0E05060F0D080180, 0x040703090A0B0C02
  37. .quad 0x01040A060F0B0780, 0x030D0E0C02050809
  38. Lk_ipt: // input transform (lo, hi)
  39. .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
  40. .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
  41. Lk_sbo: // sbou, sbot
  42. .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
  43. .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
  44. Lk_sb1: // sb1u, sb1t
  45. .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
  46. .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
  47. Lk_sb2: // sb2u, sb2t
  48. .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
  49. .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
  50. //
  51. // Decryption stuff
  52. //
  53. Lk_dipt: // decryption input transform
  54. .quad 0x0F505B040B545F00, 0x154A411E114E451A
  55. .quad 0x86E383E660056500, 0x12771772F491F194
  56. Lk_dsbo: // decryption sbox final output
  57. .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
  58. .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
  59. Lk_dsb9: // decryption sbox output *9*u, *9*t
  60. .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
  61. .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
  62. Lk_dsbd: // decryption sbox output *D*u, *D*t
  63. .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
  64. .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
  65. Lk_dsbb: // decryption sbox output *B*u, *B*t
  66. .quad 0xD022649296B44200, 0x602646F6B0F2D404
  67. .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
  68. Lk_dsbe: // decryption sbox output *E*u, *E*t
  69. .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
  70. .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
  71. //
  72. // Key schedule constants
  73. //
  74. Lk_dksd: // decryption key schedule: invskew x*D
  75. .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
  76. .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
  77. Lk_dksb: // decryption key schedule: invskew x*B
  78. .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
  79. .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
  80. Lk_dkse: // decryption key schedule: invskew x*E + 0x63
  81. .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
  82. .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
  83. Lk_dks9: // decryption key schedule: invskew x*9
  84. .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
  85. .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
  86. Lk_rcon: // rcon
  87. .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
  88. Lk_opt: // output transform
  89. .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
  90. .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
  91. Lk_deskew: // deskew tables: inverts the sbox's "skew"
  92. .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
  93. .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
  94. .byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
  95. .align 2
  96. .align 6
  97. .text
  98. ##
  99. ## _aes_preheat
  100. ##
  101. ## Fills register %r10 -> .aes_consts (so you can -fPIC)
  102. ## and %xmm9-%xmm15 as specified below.
  103. ##
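## (AArch64 note: here the constants pointer is x10, the 0x0F mask is built
## in v17, and the tables land in v18-v27: Lk_inv in v18-v19, Lk_ipt in
## v20-v21, Lk_sbo in v22-v23, Lk_sb1 in v24-v25 and Lk_sb2 in v26-v27.
## The %r10/%xmm names in these comments are carried over from the
## x86-64 original.)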
  104. .align 4
  105. _vpaes_encrypt_preheat:
  106. adrp x10, Lk_inv@PAGE
  107. add x10, x10, Lk_inv@PAGEOFF
  108. movi v17.16b, #0x0f
  109. ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
  110. ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
  111. ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
  112. ret
  113. ##
  114. ## _aes_encrypt_core
  115. ##
  116. ## AES-encrypt %xmm0.
  117. ##
  118. ## Inputs:
  119. ## %xmm0 = input
  120. ## %xmm9-%xmm15 as in _vpaes_preheat
  121. ## (%rdx) = scheduled keys
  122. ##
  123. ## Output in %xmm0
  124. ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
  125. ## Preserves %xmm6 - %xmm8 so you get some local vectors
  126. ##
  127. ##
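## (AArch64 note: the block to encrypt is expected in v7, the key schedule
## pointer in x2 with the round count at [x2,#240], and the result is left
## in v0; v16 stages the current round key, and v1-v5, w8 and x9-x11 are
## clobbered.)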
  128. .align 4
  129. _vpaes_encrypt_core:
  130. mov x9, x2
  131. ldr w8, [x2,#240] // pull rounds
  132. adrp x11, Lk_mc_forward@PAGE+16
  133. add x11, x11, Lk_mc_forward@PAGEOFF+16
  134. // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
  135. ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
  136. and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  137. ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  138. tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
  139. // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
  140. tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
  141. eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
  142. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  143. b Lenc_entry
  144. .align 4
  145. Lenc_loop:
  146. // middle of middle round
  147. add x10, x11, #0x40
  148. tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  149. ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
  150. tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  151. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  152. tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  153. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  154. tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  155. ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
  156. tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  157. eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  158. tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  159. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  160. tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  161. eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  162. and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
  163. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  164. sub w8, w8, #1 // nr--
  165. Lenc_entry:
  166. // top of round
  167. and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
  168. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  169. tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  170. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  171. tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  172. tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  173. eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  174. eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  175. tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  176. tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  177. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  178. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  179. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
  180. cbnz w8, Lenc_loop
  181. // middle of last round
  182. add x10, x11, #0x80
  183. // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
  184. // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  185. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  186. ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
  187. tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  188. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  189. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  190. tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
  191. ret
  192. .globl _vpaes_encrypt
  193. .private_extern _vpaes_encrypt
  194. .align 4
  195. _vpaes_encrypt:
  196. AARCH64_SIGN_LINK_REGISTER
  197. stp x29,x30,[sp,#-16]!
  198. add x29,sp,#0
  199. ld1 {v7.16b}, [x0]
  200. bl _vpaes_encrypt_preheat
  201. bl _vpaes_encrypt_core
  202. st1 {v0.16b}, [x1]
  203. ldp x29,x30,[sp],#16
  204. AARCH64_VALIDATE_LINK_REGISTER
  205. ret
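// v14-v15 input, v0-v1 output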
  206. .align 4
  207. _vpaes_encrypt_2x:
  208. mov x9, x2
  209. ldr w8, [x2,#240] // pull rounds
  210. adrp x11, Lk_mc_forward@PAGE+16
  211. add x11, x11, Lk_mc_forward@PAGEOFF+16
  212. // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
  213. ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
  214. and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  215. ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  216. and v9.16b, v15.16b, v17.16b
  217. ushr v8.16b, v15.16b, #4
  218. tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
  219. tbl v9.16b, {v20.16b}, v9.16b
  220. // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
  221. tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
  222. tbl v10.16b, {v21.16b}, v8.16b
  223. eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
  224. eor v8.16b, v9.16b, v16.16b
  225. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  226. eor v8.16b, v8.16b, v10.16b
  227. b Lenc_2x_entry
  228. .align 4
  229. Lenc_2x_loop:
  230. // middle of middle round
  231. add x10, x11, #0x40
  232. tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
  233. tbl v12.16b, {v25.16b}, v10.16b
  234. ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
  235. tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
  236. tbl v8.16b, {v24.16b}, v11.16b
  237. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  238. eor v12.16b, v12.16b, v16.16b
  239. tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
  240. tbl v13.16b, {v27.16b}, v10.16b
  241. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  242. eor v8.16b, v8.16b, v12.16b
  243. tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
  244. tbl v10.16b, {v26.16b}, v11.16b
  245. ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
  246. tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
  247. tbl v11.16b, {v8.16b}, v1.16b
  248. eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
  249. eor v10.16b, v10.16b, v13.16b
  250. tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
  251. tbl v8.16b, {v8.16b}, v4.16b
  252. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
  253. eor v11.16b, v11.16b, v10.16b
  254. tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
  255. tbl v12.16b, {v11.16b},v1.16b
  256. eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
  257. eor v8.16b, v8.16b, v11.16b
  258. and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
  259. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
  260. eor v8.16b, v8.16b, v12.16b
  261. sub w8, w8, #1 // nr--
  262. Lenc_2x_entry:
  263. // top of round
  264. and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
  265. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  266. and v9.16b, v8.16b, v17.16b
  267. ushr v8.16b, v8.16b, #4
  268. tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
  269. tbl v13.16b, {v19.16b},v9.16b
  270. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  271. eor v9.16b, v9.16b, v8.16b
  272. tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  273. tbl v11.16b, {v18.16b},v8.16b
  274. tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  275. tbl v12.16b, {v18.16b},v9.16b
  276. eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  277. eor v11.16b, v11.16b, v13.16b
  278. eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  279. eor v12.16b, v12.16b, v13.16b
  280. tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  281. tbl v10.16b, {v18.16b},v11.16b
  282. tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  283. tbl v11.16b, {v18.16b},v12.16b
  284. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  285. eor v10.16b, v10.16b, v9.16b
  286. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  287. eor v11.16b, v11.16b, v8.16b
  288. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
  289. cbnz w8, Lenc_2x_loop
  290. // middle of last round
  291. add x10, x11, #0x80
  292. // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
  293. // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
  294. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  295. tbl v12.16b, {v22.16b}, v10.16b
  296. ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
  297. tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
  298. tbl v8.16b, {v23.16b}, v11.16b
  299. eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
  300. eor v12.16b, v12.16b, v16.16b
  301. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
  302. eor v8.16b, v8.16b, v12.16b
  303. tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
  304. tbl v1.16b, {v8.16b},v1.16b
  305. ret
  306. .align 4
  307. _vpaes_decrypt_preheat:
  308. adrp x10, Lk_inv@PAGE
  309. add x10, x10, Lk_inv@PAGEOFF
  310. movi v17.16b, #0x0f
  311. adrp x11, Lk_dipt@PAGE
  312. add x11, x11, Lk_dipt@PAGEOFF
  313. ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
  314. ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
  315. ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
  316. ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
  317. ret
  318. ##
  319. ## Decryption core
  320. ##
  321. ## Same API as encryption core.
  322. ##
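## (AArch64 note: x11 is pointed at the Lk_sr[] entry selected by the round
## count and supplies the final output permutation.)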
  323. .align 4
  324. _vpaes_decrypt_core:
  325. mov x9, x2
  326. ldr w8, [x2,#240] // pull rounds
  327. // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
  328. lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
  329. eor x11, x11, #0x30 // xor $0x30, %r11
  330. adrp x10, Lk_sr@PAGE
  331. add x10, x10, Lk_sr@PAGEOFF
  332. and x11, x11, #0x30 // and $0x30, %r11
  333. add x11, x11, x10
  334. adrp x10, Lk_mc_forward@PAGE+48
  335. add x10, x10, Lk_mc_forward@PAGEOFF+48
  336. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
  337. and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  338. ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  339. tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
  340. ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
  341. // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
  342. tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
  343. eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
  344. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  345. b Ldec_entry
  346. .align 4
  347. Ldec_loop:
  348. //
  349. // Inverse mix columns
  350. //
  351. // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  352. // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  353. tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  354. tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  355. eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
  356. // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  357. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  358. // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  359. tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  360. tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  361. tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  362. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  363. // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  364. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  365. // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  366. tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  367. tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  368. tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  369. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  370. // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
  371. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  372. // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
  373. tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  374. tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  375. tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  376. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  377. ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
  378. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  379. sub w8, w8, #1 // sub $1,%rax # nr--
  380. Ldec_entry:
  381. // top of round
  382. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
  383. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  384. tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  385. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  386. tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  387. tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  388. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  389. eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  390. tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  391. tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  392. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  393. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  394. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
  395. cbnz w8, Ldec_loop
  396. // middle of last round
  397. // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  398. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  399. // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  400. ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
  401. tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  402. eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  403. eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  404. tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
  405. ret
  406. .globl _vpaes_decrypt
  407. .private_extern _vpaes_decrypt
  408. .align 4
  409. _vpaes_decrypt:
  410. AARCH64_SIGN_LINK_REGISTER
  411. stp x29,x30,[sp,#-16]!
  412. add x29,sp,#0
  413. ld1 {v7.16b}, [x0]
  414. bl _vpaes_decrypt_preheat
  415. bl _vpaes_decrypt_core
  416. st1 {v0.16b}, [x1]
  417. ldp x29,x30,[sp],#16
  418. AARCH64_VALIDATE_LINK_REGISTER
  419. ret
  420. // v14-v15 input, v0-v1 output
  421. .align 4
  422. _vpaes_decrypt_2x:
  423. mov x9, x2
  424. ldr w8, [x2,#240] // pull rounds
  425. // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
  426. lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
  427. eor x11, x11, #0x30 // xor $0x30, %r11
  428. adrp x10, Lk_sr@PAGE
  429. add x10, x10, Lk_sr@PAGEOFF
  430. and x11, x11, #0x30 // and $0x30, %r11
  431. add x11, x11, x10
  432. adrp x10, Lk_mc_forward@PAGE+48
  433. add x10, x10, Lk_mc_forward@PAGEOFF+48
  434. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
  435. and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  436. ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  437. and v9.16b, v15.16b, v17.16b
  438. ushr v8.16b, v15.16b, #4
  439. tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
  440. tbl v10.16b, {v20.16b},v9.16b
  441. ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
  442. // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
  443. tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
  444. tbl v8.16b, {v21.16b},v8.16b
  445. eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
  446. eor v10.16b, v10.16b, v16.16b
  447. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  448. eor v8.16b, v8.16b, v10.16b
  449. b Ldec_2x_entry
  450. .align 4
  451. Ldec_2x_loop:
  452. //
  453. // Inverse mix columns
  454. //
  455. // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
  456. // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
  457. tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
  458. tbl v12.16b, {v24.16b}, v10.16b
  459. tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
  460. tbl v9.16b, {v25.16b}, v11.16b
  461. eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
  462. eor v8.16b, v12.16b, v16.16b
  463. // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
  464. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  465. eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  466. // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
  467. tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
  468. tbl v12.16b, {v26.16b}, v10.16b
  469. tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  470. tbl v8.16b, {v8.16b},v5.16b
  471. tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
  472. tbl v9.16b, {v27.16b}, v11.16b
  473. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  474. eor v8.16b, v8.16b, v12.16b
  475. // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
  476. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  477. eor v8.16b, v8.16b, v9.16b
  478. // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
  479. tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
  480. tbl v12.16b, {v28.16b}, v10.16b
  481. tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  482. tbl v8.16b, {v8.16b},v5.16b
  483. tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
  484. tbl v9.16b, {v29.16b}, v11.16b
  485. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  486. eor v8.16b, v8.16b, v12.16b
  487. // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
  488. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  489. eor v8.16b, v8.16b, v9.16b
  490. // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
  491. tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
  492. tbl v12.16b, {v30.16b}, v10.16b
  493. tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
  494. tbl v8.16b, {v8.16b},v5.16b
  495. tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
  496. tbl v9.16b, {v31.16b}, v11.16b
  497. eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
  498. eor v8.16b, v8.16b, v12.16b
  499. ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
  500. eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
  501. eor v8.16b, v8.16b, v9.16b
  502. sub w8, w8, #1 // sub $1,%rax # nr--
  503. Ldec_2x_entry:
  504. // top of round
  505. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
  506. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  507. and v9.16b, v8.16b, v17.16b
  508. ushr v8.16b, v8.16b, #4
  509. tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  510. tbl v10.16b, {v19.16b},v9.16b
  511. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  512. eor v9.16b, v9.16b, v8.16b
  513. tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  514. tbl v11.16b, {v18.16b},v8.16b
  515. tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  516. tbl v12.16b, {v18.16b},v9.16b
  517. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  518. eor v11.16b, v11.16b, v10.16b
  519. eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  520. eor v12.16b, v12.16b, v10.16b
  521. tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
  522. tbl v10.16b, {v18.16b},v11.16b
  523. tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
  524. tbl v11.16b, {v18.16b},v12.16b
  525. eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
  526. eor v10.16b, v10.16b, v9.16b
  527. eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
  528. eor v11.16b, v11.16b, v8.16b
  529. ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
  530. cbnz w8, Ldec_2x_loop
  531. // middle of last round
  532. // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
  533. tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
  534. tbl v12.16b, {v22.16b}, v10.16b
  535. // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
  536. tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
  537. tbl v9.16b, {v23.16b}, v11.16b
  538. ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
  539. eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
  540. eor v12.16b, v12.16b, v16.16b
  541. eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
  542. eor v8.16b, v9.16b, v12.16b
  543. tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
  544. tbl v1.16b, {v8.16b},v2.16b
  545. ret
  546. ########################################################
  547. ## ##
  548. ## AES key schedule ##
  549. ## ##
  550. ########################################################
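##
## _vpaes_schedule_core is entered with:
##   x0 = user-supplied key, w1 = key size in bits, x2 = output schedule,
##   w3 = 0 when scheduling for encryption, nonzero for decryption,
##   x8 = initial byte offset into Lk_sr (tracks the round number mod 4),
## as set up by _vpaes_set_encrypt_key / _vpaes_set_decrypt_key below.
##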
  551. .align 4
  552. _vpaes_key_preheat:
  553. adrp x10, Lk_inv@PAGE
  554. add x10, x10, Lk_inv@PAGEOFF
  555. movi v16.16b, #0x5b // Lk_s63
  556. adrp x11, Lk_sb1@PAGE
  557. add x11, x11, Lk_sb1@PAGEOFF
  558. movi v17.16b, #0x0f // Lk_s0F
  559. ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
  560. adrp x10, Lk_dksd@PAGE
  561. add x10, x10, Lk_dksd@PAGEOFF
  562. ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
  563. adrp x11, Lk_mc_forward@PAGE
  564. add x11, x11, Lk_mc_forward@PAGEOFF
  565. ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
  566. ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
  567. ld1 {v8.2d}, [x10] // Lk_rcon
  568. ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
  569. ret
  570. .align 4
  571. _vpaes_schedule_core:
  572. AARCH64_SIGN_LINK_REGISTER
  573. stp x29, x30, [sp,#-16]!
  574. add x29,sp,#0
  575. bl _vpaes_key_preheat // load the tables
  576. ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
  577. // input transform
  578. mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
  579. bl _vpaes_schedule_transform
  580. mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
  581. adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10
  582. add x10, x10, Lk_sr@PAGEOFF
  583. add x8, x8, x10
  584. cbnz w3, Lschedule_am_decrypting
  585. // encrypting, output zeroth round key after transform
  586. st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
  587. b Lschedule_go
  588. Lschedule_am_decrypting:
  589. // decrypting, output zeroth round key after shiftrows
  590. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
  591. tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  592. st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
  593. eor x8, x8, #0x30 // xor $0x30, %r8
  594. Lschedule_go:
  595. cmp w1, #192 // cmp $192, %esi
  596. b.hi Lschedule_256
  597. b.eq Lschedule_192
598. // 128: fall through
  599. ##
  600. ## .schedule_128
  601. ##
  602. ## 128-bit specific part of key schedule.
  603. ##
  604. ## This schedule is really simple, because all its parts
  605. ## are accomplished by the subroutines.
  606. ##
  607. Lschedule_128:
  608. mov x0, #10 // mov $10, %esi
  609. Loop_schedule_128:
  610. sub x0, x0, #1 // dec %esi
  611. bl _vpaes_schedule_round
  612. cbz x0, Lschedule_mangle_last
  613. bl _vpaes_schedule_mangle // write output
  614. b Loop_schedule_128
  615. ##
  616. ## .aes_schedule_192
  617. ##
  618. ## 192-bit specific part of key schedule.
  619. ##
  620. ## The main body of this schedule is the same as the 128-bit
  621. ## schedule, but with more smearing. The long, high side is
  622. ## stored in %xmm7 as before, and the short, low side is in
  623. ## the high bits of %xmm6.
  624. ##
  625. ## This schedule is somewhat nastier, however, because each
  626. ## round produces 192 bits of key material, or 1.5 round keys.
  627. ## Therefore, on each cycle we do 2 rounds and produce 3 round
  628. ## keys.
  629. ##
  630. .align 4
  631. Lschedule_192:
  632. sub x0, x0, #8
  633. ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
  634. bl _vpaes_schedule_transform // input transform
  635. mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
  636. eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
  637. ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
  638. mov x0, #4 // mov $4, %esi
  639. Loop_schedule_192:
  640. sub x0, x0, #1 // dec %esi
  641. bl _vpaes_schedule_round
  642. ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
  643. bl _vpaes_schedule_mangle // save key n
  644. bl _vpaes_schedule_192_smear
  645. bl _vpaes_schedule_mangle // save key n+1
  646. bl _vpaes_schedule_round
  647. cbz x0, Lschedule_mangle_last
  648. bl _vpaes_schedule_mangle // save key n+2
  649. bl _vpaes_schedule_192_smear
  650. b Loop_schedule_192
  651. ##
  652. ## .aes_schedule_256
  653. ##
  654. ## 256-bit specific part of key schedule.
  655. ##
  656. ## The structure here is very similar to the 128-bit
  657. ## schedule, but with an additional "low side" in
  658. ## %xmm6. The low side's rounds are the same as the
  659. ## high side's, except no rcon and no rotation.
  660. ##
  661. .align 4
  662. Lschedule_256:
  663. ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
  664. bl _vpaes_schedule_transform // input transform
  665. mov x0, #7 // mov $7, %esi
  666. Loop_schedule_256:
  667. sub x0, x0, #1 // dec %esi
  668. bl _vpaes_schedule_mangle // output low result
  669. mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
  670. // high round
  671. bl _vpaes_schedule_round
  672. cbz x0, Lschedule_mangle_last
  673. bl _vpaes_schedule_mangle
  674. // low round. swap xmm7 and xmm6
  675. dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
  676. movi v4.16b, #0
  677. mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
  678. mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
  679. bl _vpaes_schedule_low_round
  680. mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
  681. b Loop_schedule_256
  682. ##
  683. ## .aes_schedule_mangle_last
  684. ##
  685. ## Mangler for last round of key schedule
  686. ## Mangles %xmm0
  687. ## when encrypting, outputs out(%xmm0) ^ 63
  688. ## when decrypting, outputs unskew(%xmm0)
  689. ##
  690. ## Always called right before return... jumps to cleanup and exits
  691. ##
  692. .align 4
  693. Lschedule_mangle_last:
  694. // schedule last round key from xmm0
  695. adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew
  696. add x11, x11, Lk_deskew@PAGEOFF
  697. cbnz w3, Lschedule_mangle_last_dec
  698. // encrypting
  699. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
  700. adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform
  701. add x11, x11, Lk_opt@PAGEOFF
  702. add x2, x2, #32 // add $32, %rdx
  703. tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
  704. Lschedule_mangle_last_dec:
  705. ld1 {v20.2d,v21.2d}, [x11] // reload constants
  706. sub x2, x2, #16 // add $-16, %rdx
  707. eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
  708. bl _vpaes_schedule_transform // output transform
  709. st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
  710. // cleanup
  711. eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
  712. eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
  713. eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
  714. eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
  715. eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
  716. eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
  717. eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
  718. eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
  719. ldp x29, x30, [sp],#16
  720. AARCH64_VALIDATE_LINK_REGISTER
  721. ret
  722. ##
  723. ## .aes_schedule_192_smear
  724. ##
  725. ## Smear the short, low side in the 192-bit key schedule.
  726. ##
  727. ## Inputs:
  728. ## %xmm7: high side, b a x y
  729. ## %xmm6: low side, d c 0 0
  730. ## %xmm13: 0
  731. ##
  732. ## Outputs:
  733. ## %xmm6: b+c+d b+c 0 0
  734. ## %xmm0: b+c+d b+c b a
  735. ##
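## (AArch64 note: the two vpshufd operations are emulated with movi/dup/ins
## below.)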
  736. .align 4
  737. _vpaes_schedule_192_smear:
  738. movi v1.16b, #0
  739. dup v0.4s, v7.s[3]
  740. ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
  741. ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
  742. eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
  743. eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
  744. eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
  745. mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
  746. ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
  747. ret
  748. ##
  749. ## .aes_schedule_round
  750. ##
  751. ## Runs one main round of the key schedule on %xmm0, %xmm7
  752. ##
  753. ## Specifically, runs subbytes on the high dword of %xmm0
  754. ## then rotates it by one byte and xors into the low dword of
  755. ## %xmm7.
  756. ##
  757. ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
  758. ## next rcon.
  759. ##
  760. ## Smears the dwords of %xmm7 by xoring the low into the
  761. ## second low, result into third, result into highest.
  762. ##
  763. ## Returns results in %xmm7 = %xmm0.
  764. ## Clobbers %xmm1-%xmm4, %r11.
  765. ##
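## (AArch64 note: the rcon lives in v8, the key state in v7 and the
## input/output block in v0; v16 holds Lk_s63 (0x5b) and v17 the 0x0F mask,
## both loaded by _vpaes_key_preheat.)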
  766. .align 4
  767. _vpaes_schedule_round:
  768. // extract rcon from xmm8
  769. movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
  770. ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
  771. ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
  772. eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
  773. // rotate
  774. dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
  775. ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
  776. // fall through...
  777. // low round: same as high round, but no rotation and no rcon.
  778. _vpaes_schedule_low_round:
  779. // smear xmm7
  780. ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
  781. eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
  782. ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
  783. // subbytes
  784. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
  785. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
  786. eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
  787. tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
  788. eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
  789. tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
  790. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
  791. tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
  792. eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
  793. tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
  794. eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
  795. tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
  796. eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
  797. eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
  798. tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
  799. tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
  800. eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
  801. // add in smeared stuff
  802. eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
  803. eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
  804. ret
  805. ##
  806. ## .aes_schedule_transform
  807. ##
  808. ## Linear-transform %xmm0 according to tables at (%r11)
  809. ##
  810. ## Requires that %xmm9 = 0x0F0F... as in preheat
  811. ## Output in %xmm0
  812. ## Clobbers %xmm1, %xmm2
  813. ##
  814. .align 4
  815. _vpaes_schedule_transform:
  816. and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
  817. ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
  818. // vmovdqa (%r11), %xmm2 # lo
  819. tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
  820. // vmovdqa 16(%r11), %xmm1 # hi
  821. tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
  822. eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
  823. ret
  824. ##
  825. ## .aes_schedule_mangle
  826. ##
  827. ## Mangle xmm0 from (basis-transformed) standard version
  828. ## to our version.
  829. ##
  830. ## On encrypt,
  831. ## xor with 0x63
  832. ## multiply by circulant 0,1,1,1
  833. ## apply shiftrows transform
  834. ##
  835. ## On decrypt,
  836. ## xor with 0x63
  837. ## multiply by "inverse mixcolumns" circulant E,B,D,9
  838. ## deskew
  839. ## apply shiftrows transform
  840. ##
  841. ##
  842. ## Writes out to (%rdx), and increments or decrements it
  843. ## Keeps track of round number mod 4 in %r8
  844. ## Preserves xmm0
  845. ## Clobbers xmm1-xmm5
  846. ##
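## (AArch64 note: the output pointer is x2, and the round position is a
## pointer into Lk_sr kept in x8; adding 48 and clearing bit 6 below steps
## it back by 16 bytes modulo 64, i.e. the x86 "add $-16, %r8".)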
  847. .align 4
  848. _vpaes_schedule_mangle:
  849. mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
  850. // vmovdqa .Lk_mc_forward(%rip),%xmm5
  851. cbnz w3, Lschedule_mangle_dec
  852. // encrypting
  853. eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
  854. add x2, x2, #16 // add $16, %rdx
  855. tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
  856. tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
  857. tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
  858. eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
  859. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
  860. eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
  861. b Lschedule_mangle_both
  862. .align 4
  863. Lschedule_mangle_dec:
  864. // inverse mix columns
  865. // lea .Lk_dksd(%rip),%r11
  866. ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
  867. and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
  868. // vmovdqa 0x00(%r11), %xmm2
  869. tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  870. // vmovdqa 0x10(%r11), %xmm3
  871. tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  872. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
  873. tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
  874. // vmovdqa 0x20(%r11), %xmm2
  875. tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  876. eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
  877. // vmovdqa 0x30(%r11), %xmm3
  878. tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  879. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
  880. tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
  881. // vmovdqa 0x40(%r11), %xmm2
  882. tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  883. eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
  884. // vmovdqa 0x50(%r11), %xmm3
  885. tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  886. eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
  887. // vmovdqa 0x60(%r11), %xmm2
  888. tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
  889. tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
  890. // vmovdqa 0x70(%r11), %xmm4
  891. tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
  892. ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
  893. eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
  894. eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
  895. sub x2, x2, #16 // add $-16, %rdx
  896. Lschedule_mangle_both:
  897. tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
  898. add x8, x8, #48 // add $-16, %r8
  899. and x8, x8, #~(1<<6) // and $0x30, %r8
  900. st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
  901. ret
  902. .globl _vpaes_set_encrypt_key
  903. .private_extern _vpaes_set_encrypt_key
  904. .align 4
  905. _vpaes_set_encrypt_key:
  906. AARCH64_SIGN_LINK_REGISTER
  907. stp x29,x30,[sp,#-16]!
  908. add x29,sp,#0
  909. stp d8,d9,[sp,#-16]! // ABI spec says so
  910. lsr w9, w1, #5 // shr $5,%eax
  911. add w9, w9, #5 // $5,%eax
  912. str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  913. mov w3, #0 // mov $0,%ecx
  914. mov x8, #0x30 // mov $0x30,%r8d
  915. bl _vpaes_schedule_core
  916. eor x0, x0, x0
  917. ldp d8,d9,[sp],#16
  918. ldp x29,x30,[sp],#16
  919. AARCH64_VALIDATE_LINK_REGISTER
  920. ret
  921. .globl _vpaes_set_decrypt_key
  922. .private_extern _vpaes_set_decrypt_key
  923. .align 4
  924. _vpaes_set_decrypt_key:
  925. AARCH64_SIGN_LINK_REGISTER
  926. stp x29,x30,[sp,#-16]!
  927. add x29,sp,#0
  928. stp d8,d9,[sp,#-16]! // ABI spec says so
  929. lsr w9, w1, #5 // shr $5,%eax
  930. add w9, w9, #5 // $5,%eax
  931. str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
  932. lsl w9, w9, #4 // shl $4,%eax
  933. add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
  934. add x2, x2, x9
  935. mov w3, #1 // mov $1,%ecx
  936. lsr w8, w1, #1 // shr $1,%r8d
  937. and x8, x8, #32 // and $32,%r8d
  938. eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
  939. bl _vpaes_schedule_core
  940. ldp d8,d9,[sp],#16
  941. ldp x29,x30,[sp],#16
  942. AARCH64_VALIDATE_LINK_REGISTER
  943. ret
  944. .globl _vpaes_cbc_encrypt
  945. .private_extern _vpaes_cbc_encrypt
  946. .align 4
  947. _vpaes_cbc_encrypt:
  948. AARCH64_SIGN_LINK_REGISTER
  949. cbz x2, Lcbc_abort
  950. cmp w5, #0 // check direction
  951. b.eq vpaes_cbc_decrypt
  952. stp x29,x30,[sp,#-16]!
  953. add x29,sp,#0
  954. mov x17, x2 // reassign
  955. mov x2, x3 // reassign
  956. ld1 {v0.16b}, [x4] // load ivec
  957. bl _vpaes_encrypt_preheat
  958. b Lcbc_enc_loop
  959. .align 4
  960. Lcbc_enc_loop:
  961. ld1 {v7.16b}, [x0],#16 // load input
  962. eor v7.16b, v7.16b, v0.16b // xor with ivec
  963. bl _vpaes_encrypt_core
  964. st1 {v0.16b}, [x1],#16 // save output
  965. subs x17, x17, #16
  966. b.hi Lcbc_enc_loop
  967. st1 {v0.16b}, [x4] // write ivec
  968. ldp x29,x30,[sp],#16
  969. Lcbc_abort:
  970. AARCH64_VALIDATE_LINK_REGISTER
  971. ret
  972. .align 4
  973. vpaes_cbc_decrypt:
  974. // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
  975. // only from vpaes_cbc_encrypt which has already signed the return address.
  976. stp x29,x30,[sp,#-16]!
  977. add x29,sp,#0
  978. stp d8,d9,[sp,#-16]! // ABI spec says so
  979. stp d10,d11,[sp,#-16]!
  980. stp d12,d13,[sp,#-16]!
  981. stp d14,d15,[sp,#-16]!
  982. mov x17, x2 // reassign
  983. mov x2, x3 // reassign
  984. ld1 {v6.16b}, [x4] // load ivec
  985. bl _vpaes_decrypt_preheat
  986. tst x17, #16
  987. b.eq Lcbc_dec_loop2x
  988. ld1 {v7.16b}, [x0], #16 // load input
  989. bl _vpaes_decrypt_core
  990. eor v0.16b, v0.16b, v6.16b // xor with ivec
  991. orr v6.16b, v7.16b, v7.16b // next ivec value
  992. st1 {v0.16b}, [x1], #16
  993. subs x17, x17, #16
  994. b.ls Lcbc_dec_done
  995. .align 4
  996. Lcbc_dec_loop2x:
  997. ld1 {v14.16b,v15.16b}, [x0], #32
  998. bl _vpaes_decrypt_2x
  999. eor v0.16b, v0.16b, v6.16b // xor with ivec
  1000. eor v1.16b, v1.16b, v14.16b
  1001. orr v6.16b, v15.16b, v15.16b
  1002. st1 {v0.16b,v1.16b}, [x1], #32
  1003. subs x17, x17, #32
  1004. b.hi Lcbc_dec_loop2x
  1005. Lcbc_dec_done:
  1006. st1 {v6.16b}, [x4]
  1007. ldp d14,d15,[sp],#16
  1008. ldp d12,d13,[sp],#16
  1009. ldp d10,d11,[sp],#16
  1010. ldp d8,d9,[sp],#16
  1011. ldp x29,x30,[sp],#16
  1012. AARCH64_VALIDATE_LINK_REGISTER
  1013. ret
  1014. .globl _vpaes_ctr32_encrypt_blocks
  1015. .private_extern _vpaes_ctr32_encrypt_blocks
  1016. .align 4
  1017. _vpaes_ctr32_encrypt_blocks:
  1018. AARCH64_SIGN_LINK_REGISTER
  1019. stp x29,x30,[sp,#-16]!
  1020. add x29,sp,#0
  1021. stp d8,d9,[sp,#-16]! // ABI spec says so
  1022. stp d10,d11,[sp,#-16]!
  1023. stp d12,d13,[sp,#-16]!
  1024. stp d14,d15,[sp,#-16]!
  1025. cbz x2, Lctr32_done
  1026. // Note, unlike the other functions, x2 here is measured in blocks,
  1027. // not bytes.
  1028. mov x17, x2
  1029. mov x2, x3
  1030. // Load the IV and counter portion.
  1031. ldr w6, [x4, #12]
  1032. ld1 {v7.16b}, [x4]
  1033. bl _vpaes_encrypt_preheat
  1034. tst x17, #1
  1035. rev w6, w6 // The counter is big-endian.
  1036. b.eq Lctr32_prep_loop
  1037. // Handle one block so the remaining block count is even for
  1038. // _vpaes_encrypt_2x.
  1039. ld1 {v6.16b}, [x0], #16 // Load input ahead of time
  1040. bl _vpaes_encrypt_core
  1041. eor v0.16b, v0.16b, v6.16b // XOR input and result
  1042. st1 {v0.16b}, [x1], #16
  1043. subs x17, x17, #1
  1044. // Update the counter.
  1045. add w6, w6, #1
  1046. rev w7, w6
  1047. mov v7.s[3], w7
  1048. b.ls Lctr32_done
  1049. Lctr32_prep_loop:
  1050. // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
  1051. // uses v14 and v15.
  1052. mov v15.16b, v7.16b
  1053. mov v14.16b, v7.16b
  1054. add w6, w6, #1
  1055. rev w7, w6
  1056. mov v15.s[3], w7
  1057. Lctr32_loop:
  1058. ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
  1059. bl _vpaes_encrypt_2x
  1060. eor v0.16b, v0.16b, v6.16b // XOR input and result
  1061. eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
  1062. st1 {v0.16b,v1.16b}, [x1], #32
  1063. subs x17, x17, #2
  1064. // Update the counter.
  1065. add w7, w6, #1
  1066. add w6, w6, #2
  1067. rev w7, w7
  1068. mov v14.s[3], w7
  1069. rev w7, w6
  1070. mov v15.s[3], w7
  1071. b.hi Lctr32_loop
  1072. Lctr32_done:
  1073. ldp d14,d15,[sp],#16
  1074. ldp d12,d13,[sp],#16
  1075. ldp d10,d11,[sp],#16
  1076. ldp d8,d9,[sp],#16
  1077. ldp x29,x30,[sp],#16
  1078. AARCH64_VALIDATE_LINK_REGISTER
  1079. ret
  1080. #endif // !OPENSSL_NO_ASM