// chacha-x86_64.S
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
  12. .text
  13. .extern OPENSSL_ia32cap_P
  14. .hidden OPENSSL_ia32cap_P
  15. .align 64
  16. .Lzero:
  17. .long 0,0,0,0
  18. .Lone:
  19. .long 1,0,0,0
  20. .Linc:
  21. .long 0,1,2,3
  22. .Lfour:
  23. .long 4,4,4,4
  24. .Lincy:
  25. .long 0,2,4,6,1,3,5,7
  26. .Leight:
  27. .long 8,8,8,8,8,8,8,8
  28. .Lrot16:
  29. .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
  30. .Lrot24:
  31. .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
  32. .Lsigma:
  33. .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
  34. .align 64
  35. .Lzeroz:
  36. .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
  37. .Lfourz:
  38. .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
  39. .Lincz:
  40. .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
  41. .Lsixteen:
  42. .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
  43. .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  44. .globl ChaCha20_ctr32
  45. .hidden ChaCha20_ctr32
  46. .type ChaCha20_ctr32,@function
  47. .align 64
  48. ChaCha20_ctr32:
  49. .cfi_startproc
  50. cmpq $0,%rdx
  51. je .Lno_data
  52. movq OPENSSL_ia32cap_P+4(%rip),%r10
  53. testl $512,%r10d
  54. jnz .LChaCha20_ssse3
  55. pushq %rbx
  56. .cfi_adjust_cfa_offset 8
  57. .cfi_offset rbx,-16
  58. pushq %rbp
  59. .cfi_adjust_cfa_offset 8
  60. .cfi_offset rbp,-24
  61. pushq %r12
  62. .cfi_adjust_cfa_offset 8
  63. .cfi_offset r12,-32
  64. pushq %r13
  65. .cfi_adjust_cfa_offset 8
  66. .cfi_offset r13,-40
  67. pushq %r14
  68. .cfi_adjust_cfa_offset 8
  69. .cfi_offset r14,-48
  70. pushq %r15
  71. .cfi_adjust_cfa_offset 8
  72. .cfi_offset r15,-56
  73. subq $64+24,%rsp
  74. .cfi_adjust_cfa_offset 88
  75. .Lctr32_body:
  76. movdqu (%rcx),%xmm1
  77. movdqu 16(%rcx),%xmm2
  78. movdqu (%r8),%xmm3
  79. movdqa .Lone(%rip),%xmm4
  80. movdqa %xmm1,16(%rsp)
  81. movdqa %xmm2,32(%rsp)
  82. movdqa %xmm3,48(%rsp)
  83. movq %rdx,%rbp
  84. jmp .Loop_outer
  85. .align 32
  86. .Loop_outer:
  87. movl $0x61707865,%eax
  88. movl $0x3320646e,%ebx
  89. movl $0x79622d32,%ecx
  90. movl $0x6b206574,%edx
  91. movl 16(%rsp),%r8d
  92. movl 20(%rsp),%r9d
  93. movl 24(%rsp),%r10d
  94. movl 28(%rsp),%r11d
  95. movd %xmm3,%r12d
  96. movl 52(%rsp),%r13d
  97. movl 56(%rsp),%r14d
  98. movl 60(%rsp),%r15d
  99. movq %rbp,64+0(%rsp)
  100. movl $10,%ebp
  101. movq %rsi,64+8(%rsp)
  102. .byte 102,72,15,126,214
  103. movq %rdi,64+16(%rsp)
  104. movq %rsi,%rdi
  105. shrq $32,%rdi
  106. jmp .Loop
  107. .align 32
  108. .Loop:
  109. addl %r8d,%eax
  110. xorl %eax,%r12d
  111. roll $16,%r12d
  112. addl %r9d,%ebx
  113. xorl %ebx,%r13d
  114. roll $16,%r13d
  115. addl %r12d,%esi
  116. xorl %esi,%r8d
  117. roll $12,%r8d
  118. addl %r13d,%edi
  119. xorl %edi,%r9d
  120. roll $12,%r9d
  121. addl %r8d,%eax
  122. xorl %eax,%r12d
  123. roll $8,%r12d
  124. addl %r9d,%ebx
  125. xorl %ebx,%r13d
  126. roll $8,%r13d
  127. addl %r12d,%esi
  128. xorl %esi,%r8d
  129. roll $7,%r8d
  130. addl %r13d,%edi
  131. xorl %edi,%r9d
  132. roll $7,%r9d
  133. movl %esi,32(%rsp)
  134. movl %edi,36(%rsp)
  135. movl 40(%rsp),%esi
  136. movl 44(%rsp),%edi
  137. addl %r10d,%ecx
  138. xorl %ecx,%r14d
  139. roll $16,%r14d
  140. addl %r11d,%edx
  141. xorl %edx,%r15d
  142. roll $16,%r15d
  143. addl %r14d,%esi
  144. xorl %esi,%r10d
  145. roll $12,%r10d
  146. addl %r15d,%edi
  147. xorl %edi,%r11d
  148. roll $12,%r11d
  149. addl %r10d,%ecx
  150. xorl %ecx,%r14d
  151. roll $8,%r14d
  152. addl %r11d,%edx
  153. xorl %edx,%r15d
  154. roll $8,%r15d
  155. addl %r14d,%esi
  156. xorl %esi,%r10d
  157. roll $7,%r10d
  158. addl %r15d,%edi
  159. xorl %edi,%r11d
  160. roll $7,%r11d
  161. addl %r9d,%eax
  162. xorl %eax,%r15d
  163. roll $16,%r15d
  164. addl %r10d,%ebx
  165. xorl %ebx,%r12d
  166. roll $16,%r12d
  167. addl %r15d,%esi
  168. xorl %esi,%r9d
  169. roll $12,%r9d
  170. addl %r12d,%edi
  171. xorl %edi,%r10d
  172. roll $12,%r10d
  173. addl %r9d,%eax
  174. xorl %eax,%r15d
  175. roll $8,%r15d
  176. addl %r10d,%ebx
  177. xorl %ebx,%r12d
  178. roll $8,%r12d
  179. addl %r15d,%esi
  180. xorl %esi,%r9d
  181. roll $7,%r9d
  182. addl %r12d,%edi
  183. xorl %edi,%r10d
  184. roll $7,%r10d
  185. movl %esi,40(%rsp)
  186. movl %edi,44(%rsp)
  187. movl 32(%rsp),%esi
  188. movl 36(%rsp),%edi
  189. addl %r11d,%ecx
  190. xorl %ecx,%r13d
  191. roll $16,%r13d
  192. addl %r8d,%edx
  193. xorl %edx,%r14d
  194. roll $16,%r14d
  195. addl %r13d,%esi
  196. xorl %esi,%r11d
  197. roll $12,%r11d
  198. addl %r14d,%edi
  199. xorl %edi,%r8d
  200. roll $12,%r8d
  201. addl %r11d,%ecx
  202. xorl %ecx,%r13d
  203. roll $8,%r13d
  204. addl %r8d,%edx
  205. xorl %edx,%r14d
  206. roll $8,%r14d
  207. addl %r13d,%esi
  208. xorl %esi,%r11d
  209. roll $7,%r11d
  210. addl %r14d,%edi
  211. xorl %edi,%r8d
  212. roll $7,%r8d
  213. decl %ebp
  214. jnz .Loop
  215. movl %edi,36(%rsp)
  216. movl %esi,32(%rsp)
  217. movq 64(%rsp),%rbp
  218. movdqa %xmm2,%xmm1
  219. movq 64+8(%rsp),%rsi
  220. paddd %xmm4,%xmm3
  221. movq 64+16(%rsp),%rdi
  222. addl $0x61707865,%eax
  223. addl $0x3320646e,%ebx
  224. addl $0x79622d32,%ecx
  225. addl $0x6b206574,%edx
  226. addl 16(%rsp),%r8d
  227. addl 20(%rsp),%r9d
  228. addl 24(%rsp),%r10d
  229. addl 28(%rsp),%r11d
  230. addl 48(%rsp),%r12d
  231. addl 52(%rsp),%r13d
  232. addl 56(%rsp),%r14d
  233. addl 60(%rsp),%r15d
  234. paddd 32(%rsp),%xmm1
  235. cmpq $64,%rbp
  236. jb .Ltail
  237. xorl 0(%rsi),%eax
  238. xorl 4(%rsi),%ebx
  239. xorl 8(%rsi),%ecx
  240. xorl 12(%rsi),%edx
  241. xorl 16(%rsi),%r8d
  242. xorl 20(%rsi),%r9d
  243. xorl 24(%rsi),%r10d
  244. xorl 28(%rsi),%r11d
  245. movdqu 32(%rsi),%xmm0
  246. xorl 48(%rsi),%r12d
  247. xorl 52(%rsi),%r13d
  248. xorl 56(%rsi),%r14d
  249. xorl 60(%rsi),%r15d
  250. leaq 64(%rsi),%rsi
  251. pxor %xmm1,%xmm0
  252. movdqa %xmm2,32(%rsp)
  253. movd %xmm3,48(%rsp)
  254. movl %eax,0(%rdi)
  255. movl %ebx,4(%rdi)
  256. movl %ecx,8(%rdi)
  257. movl %edx,12(%rdi)
  258. movl %r8d,16(%rdi)
  259. movl %r9d,20(%rdi)
  260. movl %r10d,24(%rdi)
  261. movl %r11d,28(%rdi)
  262. movdqu %xmm0,32(%rdi)
  263. movl %r12d,48(%rdi)
  264. movl %r13d,52(%rdi)
  265. movl %r14d,56(%rdi)
  266. movl %r15d,60(%rdi)
  267. leaq 64(%rdi),%rdi
  268. subq $64,%rbp
  269. jnz .Loop_outer
  270. jmp .Ldone
  271. .align 16
  272. .Ltail:
  273. movl %eax,0(%rsp)
  274. movl %ebx,4(%rsp)
  275. xorq %rbx,%rbx
  276. movl %ecx,8(%rsp)
  277. movl %edx,12(%rsp)
  278. movl %r8d,16(%rsp)
  279. movl %r9d,20(%rsp)
  280. movl %r10d,24(%rsp)
  281. movl %r11d,28(%rsp)
  282. movdqa %xmm1,32(%rsp)
  283. movl %r12d,48(%rsp)
  284. movl %r13d,52(%rsp)
  285. movl %r14d,56(%rsp)
  286. movl %r15d,60(%rsp)
  287. .Loop_tail:
  288. movzbl (%rsi,%rbx,1),%eax
  289. movzbl (%rsp,%rbx,1),%edx
  290. leaq 1(%rbx),%rbx
  291. xorl %edx,%eax
  292. movb %al,-1(%rdi,%rbx,1)
  293. decq %rbp
  294. jnz .Loop_tail
  295. .Ldone:
  296. leaq 64+24+48(%rsp),%rsi
  297. movq -48(%rsi),%r15
  298. .cfi_restore r15
  299. movq -40(%rsi),%r14
  300. .cfi_restore r14
  301. movq -32(%rsi),%r13
  302. .cfi_restore r13
  303. movq -24(%rsi),%r12
  304. .cfi_restore r12
  305. movq -16(%rsi),%rbp
  306. .cfi_restore rbp
  307. movq -8(%rsi),%rbx
  308. .cfi_restore rbx
  309. leaq (%rsi),%rsp
  310. .cfi_adjust_cfa_offset -136
  311. .Lno_data:
  312. .byte 0xf3,0xc3
  313. .cfi_endproc
  314. .size ChaCha20_ctr32,.-ChaCha20_ctr32
  315. .type ChaCha20_ssse3,@function
  316. .align 32
  317. ChaCha20_ssse3:
  318. .LChaCha20_ssse3:
  319. .cfi_startproc
  320. movq %rsp,%r9
  321. .cfi_def_cfa_register r9
  322. cmpq $128,%rdx
  323. ja .LChaCha20_4x
  324. .Ldo_sse3_after_all:
  325. subq $64+8,%rsp
  326. movdqa .Lsigma(%rip),%xmm0
  327. movdqu (%rcx),%xmm1
  328. movdqu 16(%rcx),%xmm2
  329. movdqu (%r8),%xmm3
  330. movdqa .Lrot16(%rip),%xmm6
  331. movdqa .Lrot24(%rip),%xmm7
  332. movdqa %xmm0,0(%rsp)
  333. movdqa %xmm1,16(%rsp)
  334. movdqa %xmm2,32(%rsp)
  335. movdqa %xmm3,48(%rsp)
  336. movq $10,%r8
  337. jmp .Loop_ssse3
  338. .align 32
  339. .Loop_outer_ssse3:
  340. movdqa .Lone(%rip),%xmm3
  341. movdqa 0(%rsp),%xmm0
  342. movdqa 16(%rsp),%xmm1
  343. movdqa 32(%rsp),%xmm2
  344. paddd 48(%rsp),%xmm3
  345. movq $10,%r8
  346. movdqa %xmm3,48(%rsp)
  347. jmp .Loop_ssse3
  348. .align 32
  349. .Loop_ssse3:
  350. paddd %xmm1,%xmm0
  351. pxor %xmm0,%xmm3
  352. .byte 102,15,56,0,222
  353. paddd %xmm3,%xmm2
  354. pxor %xmm2,%xmm1
  355. movdqa %xmm1,%xmm4
  356. psrld $20,%xmm1
  357. pslld $12,%xmm4
  358. por %xmm4,%xmm1
  359. paddd %xmm1,%xmm0
  360. pxor %xmm0,%xmm3
  361. .byte 102,15,56,0,223
  362. paddd %xmm3,%xmm2
  363. pxor %xmm2,%xmm1
  364. movdqa %xmm1,%xmm4
  365. psrld $25,%xmm1
  366. pslld $7,%xmm4
  367. por %xmm4,%xmm1
  368. pshufd $78,%xmm2,%xmm2
  369. pshufd $57,%xmm1,%xmm1
  370. pshufd $147,%xmm3,%xmm3
  371. nop
  372. paddd %xmm1,%xmm0
  373. pxor %xmm0,%xmm3
  374. .byte 102,15,56,0,222
  375. paddd %xmm3,%xmm2
  376. pxor %xmm2,%xmm1
  377. movdqa %xmm1,%xmm4
  378. psrld $20,%xmm1
  379. pslld $12,%xmm4
  380. por %xmm4,%xmm1
  381. paddd %xmm1,%xmm0
  382. pxor %xmm0,%xmm3
  383. .byte 102,15,56,0,223
  384. paddd %xmm3,%xmm2
  385. pxor %xmm2,%xmm1
  386. movdqa %xmm1,%xmm4
  387. psrld $25,%xmm1
  388. pslld $7,%xmm4
  389. por %xmm4,%xmm1
  390. pshufd $78,%xmm2,%xmm2
  391. pshufd $147,%xmm1,%xmm1
  392. pshufd $57,%xmm3,%xmm3
  393. decq %r8
  394. jnz .Loop_ssse3
  395. paddd 0(%rsp),%xmm0
  396. paddd 16(%rsp),%xmm1
  397. paddd 32(%rsp),%xmm2
  398. paddd 48(%rsp),%xmm3
  399. cmpq $64,%rdx
  400. jb .Ltail_ssse3
  401. movdqu 0(%rsi),%xmm4
  402. movdqu 16(%rsi),%xmm5
  403. pxor %xmm4,%xmm0
  404. movdqu 32(%rsi),%xmm4
  405. pxor %xmm5,%xmm1
  406. movdqu 48(%rsi),%xmm5
  407. leaq 64(%rsi),%rsi
  408. pxor %xmm4,%xmm2
  409. pxor %xmm5,%xmm3
  410. movdqu %xmm0,0(%rdi)
  411. movdqu %xmm1,16(%rdi)
  412. movdqu %xmm2,32(%rdi)
  413. movdqu %xmm3,48(%rdi)
  414. leaq 64(%rdi),%rdi
  415. subq $64,%rdx
  416. jnz .Loop_outer_ssse3
  417. jmp .Ldone_ssse3
  418. .align 16
  419. .Ltail_ssse3:
  420. movdqa %xmm0,0(%rsp)
  421. movdqa %xmm1,16(%rsp)
  422. movdqa %xmm2,32(%rsp)
  423. movdqa %xmm3,48(%rsp)
  424. xorq %r8,%r8
  425. .Loop_tail_ssse3:
  426. movzbl (%rsi,%r8,1),%eax
  427. movzbl (%rsp,%r8,1),%ecx
  428. leaq 1(%r8),%r8
  429. xorl %ecx,%eax
  430. movb %al,-1(%rdi,%r8,1)
  431. decq %rdx
  432. jnz .Loop_tail_ssse3
  433. .Ldone_ssse3:
  434. leaq (%r9),%rsp
  435. .cfi_def_cfa_register rsp
  436. .Lssse3_epilogue:
  437. .byte 0xf3,0xc3
  438. .cfi_endproc
  439. .size ChaCha20_ssse3,.-ChaCha20_ssse3
  440. .type ChaCha20_4x,@function
  441. .align 32
  442. ChaCha20_4x:
  443. .LChaCha20_4x:
  444. .cfi_startproc
  445. movq %rsp,%r9
  446. .cfi_def_cfa_register r9
  447. movq %r10,%r11
  448. shrq $32,%r10
  449. testq $32,%r10
  450. jnz .LChaCha20_8x
  451. cmpq $192,%rdx
  452. ja .Lproceed4x
  453. andq $71303168,%r11
  454. cmpq $4194304,%r11
  455. je .Ldo_sse3_after_all
  456. .Lproceed4x:
  457. subq $0x140+8,%rsp
  458. movdqa .Lsigma(%rip),%xmm11
  459. movdqu (%rcx),%xmm15
  460. movdqu 16(%rcx),%xmm7
  461. movdqu (%r8),%xmm3
  462. leaq 256(%rsp),%rcx
  463. leaq .Lrot16(%rip),%r10
  464. leaq .Lrot24(%rip),%r11
  465. pshufd $0x00,%xmm11,%xmm8
  466. pshufd $0x55,%xmm11,%xmm9
  467. movdqa %xmm8,64(%rsp)
  468. pshufd $0xaa,%xmm11,%xmm10
  469. movdqa %xmm9,80(%rsp)
  470. pshufd $0xff,%xmm11,%xmm11
  471. movdqa %xmm10,96(%rsp)
  472. movdqa %xmm11,112(%rsp)
  473. pshufd $0x00,%xmm15,%xmm12
  474. pshufd $0x55,%xmm15,%xmm13
  475. movdqa %xmm12,128-256(%rcx)
  476. pshufd $0xaa,%xmm15,%xmm14
  477. movdqa %xmm13,144-256(%rcx)
  478. pshufd $0xff,%xmm15,%xmm15
  479. movdqa %xmm14,160-256(%rcx)
  480. movdqa %xmm15,176-256(%rcx)
  481. pshufd $0x00,%xmm7,%xmm4
  482. pshufd $0x55,%xmm7,%xmm5
  483. movdqa %xmm4,192-256(%rcx)
  484. pshufd $0xaa,%xmm7,%xmm6
  485. movdqa %xmm5,208-256(%rcx)
  486. pshufd $0xff,%xmm7,%xmm7
  487. movdqa %xmm6,224-256(%rcx)
  488. movdqa %xmm7,240-256(%rcx)
  489. pshufd $0x00,%xmm3,%xmm0
  490. pshufd $0x55,%xmm3,%xmm1
  491. paddd .Linc(%rip),%xmm0
  492. pshufd $0xaa,%xmm3,%xmm2
  493. movdqa %xmm1,272-256(%rcx)
  494. pshufd $0xff,%xmm3,%xmm3
  495. movdqa %xmm2,288-256(%rcx)
  496. movdqa %xmm3,304-256(%rcx)
  497. jmp .Loop_enter4x
  498. .align 32
  499. .Loop_outer4x:
  500. movdqa 64(%rsp),%xmm8
  501. movdqa 80(%rsp),%xmm9
  502. movdqa 96(%rsp),%xmm10
  503. movdqa 112(%rsp),%xmm11
  504. movdqa 128-256(%rcx),%xmm12
  505. movdqa 144-256(%rcx),%xmm13
  506. movdqa 160-256(%rcx),%xmm14
  507. movdqa 176-256(%rcx),%xmm15
  508. movdqa 192-256(%rcx),%xmm4
  509. movdqa 208-256(%rcx),%xmm5
  510. movdqa 224-256(%rcx),%xmm6
  511. movdqa 240-256(%rcx),%xmm7
  512. movdqa 256-256(%rcx),%xmm0
  513. movdqa 272-256(%rcx),%xmm1
  514. movdqa 288-256(%rcx),%xmm2
  515. movdqa 304-256(%rcx),%xmm3
  516. paddd .Lfour(%rip),%xmm0
  517. .Loop_enter4x:
  518. movdqa %xmm6,32(%rsp)
  519. movdqa %xmm7,48(%rsp)
  520. movdqa (%r10),%xmm7
  521. movl $10,%eax
  522. movdqa %xmm0,256-256(%rcx)
  523. jmp .Loop4x
  524. .align 32
  525. .Loop4x:
  526. paddd %xmm12,%xmm8
  527. paddd %xmm13,%xmm9
  528. pxor %xmm8,%xmm0
  529. pxor %xmm9,%xmm1
  530. .byte 102,15,56,0,199
  531. .byte 102,15,56,0,207
  532. paddd %xmm0,%xmm4
  533. paddd %xmm1,%xmm5
  534. pxor %xmm4,%xmm12
  535. pxor %xmm5,%xmm13
  536. movdqa %xmm12,%xmm6
  537. pslld $12,%xmm12
  538. psrld $20,%xmm6
  539. movdqa %xmm13,%xmm7
  540. pslld $12,%xmm13
  541. por %xmm6,%xmm12
  542. psrld $20,%xmm7
  543. movdqa (%r11),%xmm6
  544. por %xmm7,%xmm13
  545. paddd %xmm12,%xmm8
  546. paddd %xmm13,%xmm9
  547. pxor %xmm8,%xmm0
  548. pxor %xmm9,%xmm1
  549. .byte 102,15,56,0,198
  550. .byte 102,15,56,0,206
  551. paddd %xmm0,%xmm4
  552. paddd %xmm1,%xmm5
  553. pxor %xmm4,%xmm12
  554. pxor %xmm5,%xmm13
  555. movdqa %xmm12,%xmm7
  556. pslld $7,%xmm12
  557. psrld $25,%xmm7
  558. movdqa %xmm13,%xmm6
  559. pslld $7,%xmm13
  560. por %xmm7,%xmm12
  561. psrld $25,%xmm6
  562. movdqa (%r10),%xmm7
  563. por %xmm6,%xmm13
  564. movdqa %xmm4,0(%rsp)
  565. movdqa %xmm5,16(%rsp)
  566. movdqa 32(%rsp),%xmm4
  567. movdqa 48(%rsp),%xmm5
  568. paddd %xmm14,%xmm10
  569. paddd %xmm15,%xmm11
  570. pxor %xmm10,%xmm2
  571. pxor %xmm11,%xmm3
  572. .byte 102,15,56,0,215
  573. .byte 102,15,56,0,223
  574. paddd %xmm2,%xmm4
  575. paddd %xmm3,%xmm5
  576. pxor %xmm4,%xmm14
  577. pxor %xmm5,%xmm15
  578. movdqa %xmm14,%xmm6
  579. pslld $12,%xmm14
  580. psrld $20,%xmm6
  581. movdqa %xmm15,%xmm7
  582. pslld $12,%xmm15
  583. por %xmm6,%xmm14
  584. psrld $20,%xmm7
  585. movdqa (%r11),%xmm6
  586. por %xmm7,%xmm15
  587. paddd %xmm14,%xmm10
  588. paddd %xmm15,%xmm11
  589. pxor %xmm10,%xmm2
  590. pxor %xmm11,%xmm3
  591. .byte 102,15,56,0,214
  592. .byte 102,15,56,0,222
  593. paddd %xmm2,%xmm4
  594. paddd %xmm3,%xmm5
  595. pxor %xmm4,%xmm14
  596. pxor %xmm5,%xmm15
  597. movdqa %xmm14,%xmm7
  598. pslld $7,%xmm14
  599. psrld $25,%xmm7
  600. movdqa %xmm15,%xmm6
  601. pslld $7,%xmm15
  602. por %xmm7,%xmm14
  603. psrld $25,%xmm6
  604. movdqa (%r10),%xmm7
  605. por %xmm6,%xmm15
  606. paddd %xmm13,%xmm8
  607. paddd %xmm14,%xmm9
  608. pxor %xmm8,%xmm3
  609. pxor %xmm9,%xmm0
  610. .byte 102,15,56,0,223
  611. .byte 102,15,56,0,199
  612. paddd %xmm3,%xmm4
  613. paddd %xmm0,%xmm5
  614. pxor %xmm4,%xmm13
  615. pxor %xmm5,%xmm14
  616. movdqa %xmm13,%xmm6
  617. pslld $12,%xmm13
  618. psrld $20,%xmm6
  619. movdqa %xmm14,%xmm7
  620. pslld $12,%xmm14
  621. por %xmm6,%xmm13
  622. psrld $20,%xmm7
  623. movdqa (%r11),%xmm6
  624. por %xmm7,%xmm14
  625. paddd %xmm13,%xmm8
  626. paddd %xmm14,%xmm9
  627. pxor %xmm8,%xmm3
  628. pxor %xmm9,%xmm0
  629. .byte 102,15,56,0,222
  630. .byte 102,15,56,0,198
  631. paddd %xmm3,%xmm4
  632. paddd %xmm0,%xmm5
  633. pxor %xmm4,%xmm13
  634. pxor %xmm5,%xmm14
  635. movdqa %xmm13,%xmm7
  636. pslld $7,%xmm13
  637. psrld $25,%xmm7
  638. movdqa %xmm14,%xmm6
  639. pslld $7,%xmm14
  640. por %xmm7,%xmm13
  641. psrld $25,%xmm6
  642. movdqa (%r10),%xmm7
  643. por %xmm6,%xmm14
  644. movdqa %xmm4,32(%rsp)
  645. movdqa %xmm5,48(%rsp)
  646. movdqa 0(%rsp),%xmm4
  647. movdqa 16(%rsp),%xmm5
  648. paddd %xmm15,%xmm10
  649. paddd %xmm12,%xmm11
  650. pxor %xmm10,%xmm1
  651. pxor %xmm11,%xmm2
  652. .byte 102,15,56,0,207
  653. .byte 102,15,56,0,215
  654. paddd %xmm1,%xmm4
  655. paddd %xmm2,%xmm5
  656. pxor %xmm4,%xmm15
  657. pxor %xmm5,%xmm12
  658. movdqa %xmm15,%xmm6
  659. pslld $12,%xmm15
  660. psrld $20,%xmm6
  661. movdqa %xmm12,%xmm7
  662. pslld $12,%xmm12
  663. por %xmm6,%xmm15
  664. psrld $20,%xmm7
  665. movdqa (%r11),%xmm6
  666. por %xmm7,%xmm12
  667. paddd %xmm15,%xmm10
  668. paddd %xmm12,%xmm11
  669. pxor %xmm10,%xmm1
  670. pxor %xmm11,%xmm2
  671. .byte 102,15,56,0,206
  672. .byte 102,15,56,0,214
  673. paddd %xmm1,%xmm4
  674. paddd %xmm2,%xmm5
  675. pxor %xmm4,%xmm15
  676. pxor %xmm5,%xmm12
  677. movdqa %xmm15,%xmm7
  678. pslld $7,%xmm15
  679. psrld $25,%xmm7
  680. movdqa %xmm12,%xmm6
  681. pslld $7,%xmm12
  682. por %xmm7,%xmm15
  683. psrld $25,%xmm6
  684. movdqa (%r10),%xmm7
  685. por %xmm6,%xmm12
  686. decl %eax
  687. jnz .Loop4x
  688. paddd 64(%rsp),%xmm8
  689. paddd 80(%rsp),%xmm9
  690. paddd 96(%rsp),%xmm10
  691. paddd 112(%rsp),%xmm11
  692. movdqa %xmm8,%xmm6
  693. punpckldq %xmm9,%xmm8
  694. movdqa %xmm10,%xmm7
  695. punpckldq %xmm11,%xmm10
  696. punpckhdq %xmm9,%xmm6
  697. punpckhdq %xmm11,%xmm7
  698. movdqa %xmm8,%xmm9
  699. punpcklqdq %xmm10,%xmm8
  700. movdqa %xmm6,%xmm11
  701. punpcklqdq %xmm7,%xmm6
  702. punpckhqdq %xmm10,%xmm9
  703. punpckhqdq %xmm7,%xmm11
  704. paddd 128-256(%rcx),%xmm12
  705. paddd 144-256(%rcx),%xmm13
  706. paddd 160-256(%rcx),%xmm14
  707. paddd 176-256(%rcx),%xmm15
  708. movdqa %xmm8,0(%rsp)
  709. movdqa %xmm9,16(%rsp)
  710. movdqa 32(%rsp),%xmm8
  711. movdqa 48(%rsp),%xmm9
  712. movdqa %xmm12,%xmm10
  713. punpckldq %xmm13,%xmm12
  714. movdqa %xmm14,%xmm7
  715. punpckldq %xmm15,%xmm14
  716. punpckhdq %xmm13,%xmm10
  717. punpckhdq %xmm15,%xmm7
  718. movdqa %xmm12,%xmm13
  719. punpcklqdq %xmm14,%xmm12
  720. movdqa %xmm10,%xmm15
  721. punpcklqdq %xmm7,%xmm10
  722. punpckhqdq %xmm14,%xmm13
  723. punpckhqdq %xmm7,%xmm15
  724. paddd 192-256(%rcx),%xmm4
  725. paddd 208-256(%rcx),%xmm5
  726. paddd 224-256(%rcx),%xmm8
  727. paddd 240-256(%rcx),%xmm9
  728. movdqa %xmm6,32(%rsp)
  729. movdqa %xmm11,48(%rsp)
  730. movdqa %xmm4,%xmm14
  731. punpckldq %xmm5,%xmm4
  732. movdqa %xmm8,%xmm7
  733. punpckldq %xmm9,%xmm8
  734. punpckhdq %xmm5,%xmm14
  735. punpckhdq %xmm9,%xmm7
  736. movdqa %xmm4,%xmm5
  737. punpcklqdq %xmm8,%xmm4
  738. movdqa %xmm14,%xmm9
  739. punpcklqdq %xmm7,%xmm14
  740. punpckhqdq %xmm8,%xmm5
  741. punpckhqdq %xmm7,%xmm9
  742. paddd 256-256(%rcx),%xmm0
  743. paddd 272-256(%rcx),%xmm1
  744. paddd 288-256(%rcx),%xmm2
  745. paddd 304-256(%rcx),%xmm3
  746. movdqa %xmm0,%xmm8
  747. punpckldq %xmm1,%xmm0
  748. movdqa %xmm2,%xmm7
  749. punpckldq %xmm3,%xmm2
  750. punpckhdq %xmm1,%xmm8
  751. punpckhdq %xmm3,%xmm7
  752. movdqa %xmm0,%xmm1
  753. punpcklqdq %xmm2,%xmm0
  754. movdqa %xmm8,%xmm3
  755. punpcklqdq %xmm7,%xmm8
  756. punpckhqdq %xmm2,%xmm1
  757. punpckhqdq %xmm7,%xmm3
  758. cmpq $256,%rdx
  759. jb .Ltail4x
  760. movdqu 0(%rsi),%xmm6
  761. movdqu 16(%rsi),%xmm11
  762. movdqu 32(%rsi),%xmm2
  763. movdqu 48(%rsi),%xmm7
  764. pxor 0(%rsp),%xmm6
  765. pxor %xmm12,%xmm11
  766. pxor %xmm4,%xmm2
  767. pxor %xmm0,%xmm7
  768. movdqu %xmm6,0(%rdi)
  769. movdqu 64(%rsi),%xmm6
  770. movdqu %xmm11,16(%rdi)
  771. movdqu 80(%rsi),%xmm11
  772. movdqu %xmm2,32(%rdi)
  773. movdqu 96(%rsi),%xmm2
  774. movdqu %xmm7,48(%rdi)
  775. movdqu 112(%rsi),%xmm7
  776. leaq 128(%rsi),%rsi
  777. pxor 16(%rsp),%xmm6
  778. pxor %xmm13,%xmm11
  779. pxor %xmm5,%xmm2
  780. pxor %xmm1,%xmm7
  781. movdqu %xmm6,64(%rdi)
  782. movdqu 0(%rsi),%xmm6
  783. movdqu %xmm11,80(%rdi)
  784. movdqu 16(%rsi),%xmm11
  785. movdqu %xmm2,96(%rdi)
  786. movdqu 32(%rsi),%xmm2
  787. movdqu %xmm7,112(%rdi)
  788. leaq 128(%rdi),%rdi
  789. movdqu 48(%rsi),%xmm7
  790. pxor 32(%rsp),%xmm6
  791. pxor %xmm10,%xmm11
  792. pxor %xmm14,%xmm2
  793. pxor %xmm8,%xmm7
  794. movdqu %xmm6,0(%rdi)
  795. movdqu 64(%rsi),%xmm6
  796. movdqu %xmm11,16(%rdi)
  797. movdqu 80(%rsi),%xmm11
  798. movdqu %xmm2,32(%rdi)
  799. movdqu 96(%rsi),%xmm2
  800. movdqu %xmm7,48(%rdi)
  801. movdqu 112(%rsi),%xmm7
  802. leaq 128(%rsi),%rsi
  803. pxor 48(%rsp),%xmm6
  804. pxor %xmm15,%xmm11
  805. pxor %xmm9,%xmm2
  806. pxor %xmm3,%xmm7
  807. movdqu %xmm6,64(%rdi)
  808. movdqu %xmm11,80(%rdi)
  809. movdqu %xmm2,96(%rdi)
  810. movdqu %xmm7,112(%rdi)
  811. leaq 128(%rdi),%rdi
  812. subq $256,%rdx
  813. jnz .Loop_outer4x
  814. jmp .Ldone4x
  815. .Ltail4x:
  816. cmpq $192,%rdx
  817. jae .L192_or_more4x
  818. cmpq $128,%rdx
  819. jae .L128_or_more4x
  820. cmpq $64,%rdx
  821. jae .L64_or_more4x
  822. xorq %r10,%r10
  823. movdqa %xmm12,16(%rsp)
  824. movdqa %xmm4,32(%rsp)
  825. movdqa %xmm0,48(%rsp)
  826. jmp .Loop_tail4x
  827. .align 32
  828. .L64_or_more4x:
  829. movdqu 0(%rsi),%xmm6
  830. movdqu 16(%rsi),%xmm11
  831. movdqu 32(%rsi),%xmm2
  832. movdqu 48(%rsi),%xmm7
  833. pxor 0(%rsp),%xmm6
  834. pxor %xmm12,%xmm11
  835. pxor %xmm4,%xmm2
  836. pxor %xmm0,%xmm7
  837. movdqu %xmm6,0(%rdi)
  838. movdqu %xmm11,16(%rdi)
  839. movdqu %xmm2,32(%rdi)
  840. movdqu %xmm7,48(%rdi)
  841. je .Ldone4x
  842. movdqa 16(%rsp),%xmm6
  843. leaq 64(%rsi),%rsi
  844. xorq %r10,%r10
  845. movdqa %xmm6,0(%rsp)
  846. movdqa %xmm13,16(%rsp)
  847. leaq 64(%rdi),%rdi
  848. movdqa %xmm5,32(%rsp)
  849. subq $64,%rdx
  850. movdqa %xmm1,48(%rsp)
  851. jmp .Loop_tail4x
  852. .align 32
  853. .L128_or_more4x:
  854. movdqu 0(%rsi),%xmm6
  855. movdqu 16(%rsi),%xmm11
  856. movdqu 32(%rsi),%xmm2
  857. movdqu 48(%rsi),%xmm7
  858. pxor 0(%rsp),%xmm6
  859. pxor %xmm12,%xmm11
  860. pxor %xmm4,%xmm2
  861. pxor %xmm0,%xmm7
  862. movdqu %xmm6,0(%rdi)
  863. movdqu 64(%rsi),%xmm6
  864. movdqu %xmm11,16(%rdi)
  865. movdqu 80(%rsi),%xmm11
  866. movdqu %xmm2,32(%rdi)
  867. movdqu 96(%rsi),%xmm2
  868. movdqu %xmm7,48(%rdi)
  869. movdqu 112(%rsi),%xmm7
  870. pxor 16(%rsp),%xmm6
  871. pxor %xmm13,%xmm11
  872. pxor %xmm5,%xmm2
  873. pxor %xmm1,%xmm7
  874. movdqu %xmm6,64(%rdi)
  875. movdqu %xmm11,80(%rdi)
  876. movdqu %xmm2,96(%rdi)
  877. movdqu %xmm7,112(%rdi)
  878. je .Ldone4x
  879. movdqa 32(%rsp),%xmm6
  880. leaq 128(%rsi),%rsi
  881. xorq %r10,%r10
  882. movdqa %xmm6,0(%rsp)
  883. movdqa %xmm10,16(%rsp)
  884. leaq 128(%rdi),%rdi
  885. movdqa %xmm14,32(%rsp)
  886. subq $128,%rdx
  887. movdqa %xmm8,48(%rsp)
  888. jmp .Loop_tail4x
  889. .align 32
  890. .L192_or_more4x:
  891. movdqu 0(%rsi),%xmm6
  892. movdqu 16(%rsi),%xmm11
  893. movdqu 32(%rsi),%xmm2
  894. movdqu 48(%rsi),%xmm7
  895. pxor 0(%rsp),%xmm6
  896. pxor %xmm12,%xmm11
  897. pxor %xmm4,%xmm2
  898. pxor %xmm0,%xmm7
  899. movdqu %xmm6,0(%rdi)
  900. movdqu 64(%rsi),%xmm6
  901. movdqu %xmm11,16(%rdi)
  902. movdqu 80(%rsi),%xmm11
  903. movdqu %xmm2,32(%rdi)
  904. movdqu 96(%rsi),%xmm2
  905. movdqu %xmm7,48(%rdi)
  906. movdqu 112(%rsi),%xmm7
  907. leaq 128(%rsi),%rsi
  908. pxor 16(%rsp),%xmm6
  909. pxor %xmm13,%xmm11
  910. pxor %xmm5,%xmm2
  911. pxor %xmm1,%xmm7
  912. movdqu %xmm6,64(%rdi)
  913. movdqu 0(%rsi),%xmm6
  914. movdqu %xmm11,80(%rdi)
  915. movdqu 16(%rsi),%xmm11
  916. movdqu %xmm2,96(%rdi)
  917. movdqu 32(%rsi),%xmm2
  918. movdqu %xmm7,112(%rdi)
  919. leaq 128(%rdi),%rdi
  920. movdqu 48(%rsi),%xmm7
  921. pxor 32(%rsp),%xmm6
  922. pxor %xmm10,%xmm11
  923. pxor %xmm14,%xmm2
  924. pxor %xmm8,%xmm7
  925. movdqu %xmm6,0(%rdi)
  926. movdqu %xmm11,16(%rdi)
  927. movdqu %xmm2,32(%rdi)
  928. movdqu %xmm7,48(%rdi)
  929. je .Ldone4x
  930. movdqa 48(%rsp),%xmm6
  931. leaq 64(%rsi),%rsi
  932. xorq %r10,%r10
  933. movdqa %xmm6,0(%rsp)
  934. movdqa %xmm15,16(%rsp)
  935. leaq 64(%rdi),%rdi
  936. movdqa %xmm9,32(%rsp)
  937. subq $192,%rdx
  938. movdqa %xmm3,48(%rsp)
  939. .Loop_tail4x:
  940. movzbl (%rsi,%r10,1),%eax
  941. movzbl (%rsp,%r10,1),%ecx
  942. leaq 1(%r10),%r10
  943. xorl %ecx,%eax
  944. movb %al,-1(%rdi,%r10,1)
  945. decq %rdx
  946. jnz .Loop_tail4x
  947. .Ldone4x:
  948. leaq (%r9),%rsp
  949. .cfi_def_cfa_register rsp
  950. .L4x_epilogue:
  951. .byte 0xf3,0xc3
  952. .cfi_endproc
  953. .size ChaCha20_4x,.-ChaCha20_4x
#-----------------------------------------------------------------------
# ChaCha20_8x — 8-way AVX2 ChaCha20 core (GAS/AT&T syntax, SysV AMD64).
#
# C-equivalent signature (roles evident from the register uses below):
#   void ChaCha20_8x(unsigned char *out,        /* %rdi: output stream  */
#                    const unsigned char *inp,  /* %rsi: input to XOR   */
#                    size_t len,                /* %rdx: byte count     */
#                    const void *key,           /* %rcx: 32-byte key    */
#                    const void *counter);      /* %r8 : 16-byte ctr/iv */
# NOTE(review): generated code (OpenSSL chacha perlasm style); exact
# contract should be confirmed against the generator script.
#
# Strategy: keep the 16 ChaCha state words "transposed" — each ymm
# register holds the SAME state word for 8 independent blocks — so one
# outer iteration produces 512 bytes of keystream.  Rotates by 16 and 8
# use vpshufb with the .Lrot16/.Lrot24 tables; rotates by 12 and 7 use
# shift/shift/or.  Two ymm registers are spilled to the stack each
# half-round because all 16 regs are live.
#
# Clobbers: rax, rcx, r10, r11, ymm0-15 (vzeroall on exit), flags.
# Frame: 0x280 bytes, 32-byte aligned; caller rsp saved in r9 (CFA reg).
#   0(%rsp)..127     : round-loop spill area / tail keystream staging
#   256(%rsp) (rcx)  : splatted state words 0-7  (sigma + key low)
#   512(%rsp) (rax)  : splatted state words 8-15 (key high + ctr/nonce)
#-----------------------------------------------------------------------
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
movq %rsp,%r9                   # preserve caller rsp; CFA tracked via r9
.cfi_def_cfa_register r9
subq $0x280+8,%rsp
andq $-32,%rsp                  # 32-byte align frame for vmovdqa stores
vzeroupper
# Load the four 16-byte state rows, each broadcast to both 128-bit lanes.
vbroadcasti128 .Lsigma(%rip),%ymm11    # "expand 32-byte k" constants
vbroadcasti128 (%rcx),%ymm3            # key bytes 0-15
vbroadcasti128 16(%rcx),%ymm15         # key bytes 16-31
vbroadcasti128 (%r8),%ymm7             # counter + nonce
leaq 256(%rsp),%rcx             # rcx -> splatted words 0-7 area
leaq 512(%rsp),%rax             # rax -> splatted words 8-15 area
leaq .Lrot16(%rip),%r10         # vpshufb table: rotate-left-16
leaq .Lrot24(%rip),%r11         # vpshufb table: rotate-left-8 (rot24)
# Splat each 32-bit state word across all 8 lanes and park it in the
# frame; the outer loop reloads this pristine copy every iteration.
vpshufd $0x00,%ymm11,%ymm8
vpshufd $0x55,%ymm11,%ymm9
vmovdqa %ymm8,128-256(%rcx)     # word 0
vpshufd $0xaa,%ymm11,%ymm10
vmovdqa %ymm9,160-256(%rcx)     # word 1
vpshufd $0xff,%ymm11,%ymm11
vmovdqa %ymm10,192-256(%rcx)    # word 2
vmovdqa %ymm11,224-256(%rcx)    # word 3
vpshufd $0x00,%ymm3,%ymm0
vpshufd $0x55,%ymm3,%ymm1
vmovdqa %ymm0,256-256(%rcx)     # word 4
vpshufd $0xaa,%ymm3,%ymm2
vmovdqa %ymm1,288-256(%rcx)     # word 5
vpshufd $0xff,%ymm3,%ymm3
vmovdqa %ymm2,320-256(%rcx)     # word 6
vmovdqa %ymm3,352-256(%rcx)     # word 7
vpshufd $0x00,%ymm15,%ymm12
vpshufd $0x55,%ymm15,%ymm13
vmovdqa %ymm12,384-512(%rax)    # word 8
vpshufd $0xaa,%ymm15,%ymm14
vmovdqa %ymm13,416-512(%rax)    # word 9
vpshufd $0xff,%ymm15,%ymm15
vmovdqa %ymm14,448-512(%rax)    # word 10
vmovdqa %ymm15,480-512(%rax)    # word 11
vpshufd $0x00,%ymm7,%ymm4
vpshufd $0x55,%ymm7,%ymm5
vpaddd .Lincy(%rip),%ymm4,%ymm4 # word 12: per-lane counter offsets
vpshufd $0xaa,%ymm7,%ymm6
vmovdqa %ymm5,544-512(%rax)     # word 13
vpshufd $0xff,%ymm7,%ymm7
vmovdqa %ymm6,576-512(%rax)     # word 14
vmovdqa %ymm7,608-512(%rax)     # word 15
jmp .Loop_enter8x               # word 12 is stored at .Loop_enter8x
.align 32
.Loop_outer8x:
# Reload the pristine splatted state: ymm8-11 = words 0-3, ymm0-3 =
# words 4-7, ymm12-15 = words 8-11, ymm4-7 = words 12-15.
vmovdqa 128-256(%rcx),%ymm8
vmovdqa 160-256(%rcx),%ymm9
vmovdqa 192-256(%rcx),%ymm10
vmovdqa 224-256(%rcx),%ymm11
vmovdqa 256-256(%rcx),%ymm0
vmovdqa 288-256(%rcx),%ymm1
vmovdqa 320-256(%rcx),%ymm2
vmovdqa 352-256(%rcx),%ymm3
vmovdqa 384-512(%rax),%ymm12
vmovdqa 416-512(%rax),%ymm13
vmovdqa 448-512(%rax),%ymm14
vmovdqa 480-512(%rax),%ymm15
vmovdqa 512-512(%rax),%ymm4
vmovdqa 544-512(%rax),%ymm5
vmovdqa 576-512(%rax),%ymm6
vmovdqa 608-512(%rax),%ymm7
vpaddd .Leight(%rip),%ymm4,%ymm4       # advance all 8 block counters by 8
.Loop_enter8x:
# Spill words 10/11 to make room for rotate temporaries; ymm15 becomes
# the rot16 shuffle mask.  The updated counter row is saved so the next
# .Loop_outer8x reload (and the final add-back) see it.
vmovdqa %ymm14,64(%rsp)
vmovdqa %ymm15,96(%rsp)
vbroadcasti128 (%r10),%ymm15
vmovdqa %ymm4,512-512(%rax)
movl $10,%eax                   # 10 double rounds = ChaCha20
jmp .Loop8x
.align 32
.Loop8x:
# --- column rounds, columns 0 and 1: (w0,w4,w8,w12),(w1,w5,w9,w13) ---
# quarter-round: a+=b; d^=a; d<<<=16; c+=d; b^=c; b<<<=12;
#                a+=b; d^=a; d<<<=8;  c+=d; b^=c; b<<<=7
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm15,%ymm4,%ymm4      # d <<<= 16 via byte shuffle
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $12,%ymm0,%ymm14
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0         # b <<<= 12
vbroadcasti128 (%r11),%ymm14    # ymm14 = rot24 (i.e. <<<8) mask
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $12,%ymm1,%ymm15
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm14,%ymm4,%ymm4      # d <<<= 8
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $7,%ymm0,%ymm15
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0         # b <<<= 7
vbroadcasti128 (%r10),%ymm15    # restore rot16 mask
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $7,%ymm1,%ymm14
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
# Swap c-rows through the stack: park w8/w9, pick up spilled w10/w11.
vmovdqa %ymm12,0(%rsp)
vmovdqa %ymm13,32(%rsp)
vmovdqa 64(%rsp),%ymm12
vmovdqa 96(%rsp),%ymm13
# --- column rounds, columns 2 and 3: (w2,w6,w10,w14),(w3,w7,w11,w15) ---
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $12,%ymm2,%ymm14
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $12,%ymm3,%ymm15
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $7,%ymm2,%ymm15
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $7,%ymm3,%ymm14
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
# --- diagonal rounds, first pair: (w0,w5,w10,w15),(w1,w6,w11,w12) ---
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $12,%ymm1,%ymm14
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $12,%ymm2,%ymm15
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $7,%ymm1,%ymm15
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $7,%ymm2,%ymm14
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
# Swap c-rows back: park w10/w11, reload w8/w9 for the second diagonals.
vmovdqa %ymm12,64(%rsp)
vmovdqa %ymm13,96(%rsp)
vmovdqa 0(%rsp),%ymm12
vmovdqa 32(%rsp),%ymm13
# --- diagonal rounds, second pair: (w2,w7,w8,w13),(w3,w4,w9,w14) ---
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $12,%ymm3,%ymm14
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $12,%ymm0,%ymm15
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $7,%ymm3,%ymm15
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $7,%ymm0,%ymm14
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
decl %eax
jnz .Loop8x
# Rounds done.  Add the original state back (the ChaCha feed-forward),
# then transpose from word-sliced layout to 8 contiguous 64-byte blocks:
# vpunpck{l,h}dq / vpunpck{l,h}qdq build 128-bit rows, vperm2i128 picks
# the low/high lanes so each ymm ends up holding 32 consecutive
# keystream bytes of one block.
leaq 512(%rsp),%rax
vpaddd 128-256(%rcx),%ymm8,%ymm8
vpaddd 160-256(%rcx),%ymm9,%ymm9
vpaddd 192-256(%rcx),%ymm10,%ymm10
vpaddd 224-256(%rcx),%ymm11,%ymm11
vpunpckldq %ymm9,%ymm8,%ymm14
vpunpckldq %ymm11,%ymm10,%ymm15
vpunpckhdq %ymm9,%ymm8,%ymm8
vpunpckhdq %ymm11,%ymm10,%ymm10
vpunpcklqdq %ymm15,%ymm14,%ymm9
vpunpckhqdq %ymm15,%ymm14,%ymm14
vpunpcklqdq %ymm10,%ymm8,%ymm11
vpunpckhqdq %ymm10,%ymm8,%ymm8
vpaddd 256-256(%rcx),%ymm0,%ymm0
vpaddd 288-256(%rcx),%ymm1,%ymm1
vpaddd 320-256(%rcx),%ymm2,%ymm2
vpaddd 352-256(%rcx),%ymm3,%ymm3
vpunpckldq %ymm1,%ymm0,%ymm10
vpunpckldq %ymm3,%ymm2,%ymm15
vpunpckhdq %ymm1,%ymm0,%ymm0
vpunpckhdq %ymm3,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm10,%ymm1
vpunpckhqdq %ymm15,%ymm10,%ymm10
vpunpcklqdq %ymm2,%ymm0,%ymm3
vpunpckhqdq %ymm2,%ymm0,%ymm0
vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
# Stage the first two transposed halves; recover spilled words 10/11
# and repeat the add-back + transpose for state words 8-15.
vmovdqa %ymm15,0(%rsp)
vmovdqa %ymm9,32(%rsp)
vmovdqa 64(%rsp),%ymm15
vmovdqa 96(%rsp),%ymm9
vpaddd 384-512(%rax),%ymm12,%ymm12
vpaddd 416-512(%rax),%ymm13,%ymm13
vpaddd 448-512(%rax),%ymm15,%ymm15
vpaddd 480-512(%rax),%ymm9,%ymm9
vpunpckldq %ymm13,%ymm12,%ymm2
vpunpckldq %ymm9,%ymm15,%ymm8
vpunpckhdq %ymm13,%ymm12,%ymm12
vpunpckhdq %ymm9,%ymm15,%ymm15
vpunpcklqdq %ymm8,%ymm2,%ymm13
vpunpckhqdq %ymm8,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm12,%ymm9
vpunpckhqdq %ymm15,%ymm12,%ymm12
vpaddd 512-512(%rax),%ymm4,%ymm4
vpaddd 544-512(%rax),%ymm5,%ymm5
vpaddd 576-512(%rax),%ymm6,%ymm6
vpaddd 608-512(%rax),%ymm7,%ymm7
vpunpckldq %ymm5,%ymm4,%ymm15
vpunpckldq %ymm7,%ymm6,%ymm8
vpunpckhdq %ymm5,%ymm4,%ymm4
vpunpckhdq %ymm7,%ymm6,%ymm6
vpunpcklqdq %ymm8,%ymm15,%ymm5
vpunpckhqdq %ymm8,%ymm15,%ymm15
vpunpcklqdq %ymm6,%ymm4,%ymm7
vpunpckhqdq %ymm6,%ymm4,%ymm4
vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
vmovdqa 0(%rsp),%ymm6
vmovdqa 32(%rsp),%ymm12
# Keystream order is now: ymm6,ymm8,ymm1,ymm5, ymm12,ymm13,ymm10,ymm15,
# ymm14,ymm2,ymm3,ymm7, ymm11,ymm9,ymm0,ymm4 (32 bytes each = 512 B).
cmpq $512,%rdx
jb .Ltail8x
# Fast path: a full 512-byte chunk remains — XOR input with keystream
# and store, 128 bytes at a time.
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
leaq 128(%rsi),%rsi
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
leaq 128(%rdi),%rdi
vpxor 0(%rsi),%ymm12,%ymm12
vpxor 32(%rsi),%ymm13,%ymm13
vpxor 64(%rsi),%ymm10,%ymm10
vpxor 96(%rsi),%ymm15,%ymm15
leaq 128(%rsi),%rsi
vmovdqu %ymm12,0(%rdi)
vmovdqu %ymm13,32(%rdi)
vmovdqu %ymm10,64(%rdi)
vmovdqu %ymm15,96(%rdi)
leaq 128(%rdi),%rdi
vpxor 0(%rsi),%ymm14,%ymm14
vpxor 32(%rsi),%ymm2,%ymm2
vpxor 64(%rsi),%ymm3,%ymm3
vpxor 96(%rsi),%ymm7,%ymm7
leaq 128(%rsi),%rsi
vmovdqu %ymm14,0(%rdi)
vmovdqu %ymm2,32(%rdi)
vmovdqu %ymm3,64(%rdi)
vmovdqu %ymm7,96(%rdi)
leaq 128(%rdi),%rdi
vpxor 0(%rsi),%ymm11,%ymm11
vpxor 32(%rsi),%ymm9,%ymm9
vpxor 64(%rsi),%ymm0,%ymm0
vpxor 96(%rsi),%ymm4,%ymm4
leaq 128(%rsi),%rsi
vmovdqu %ymm11,0(%rdi)
vmovdqu %ymm9,32(%rdi)
vmovdqu %ymm0,64(%rdi)
vmovdqu %ymm4,96(%rdi)
leaq 128(%rdi),%rdi
subq $512,%rdx
jnz .Loop_outer8x
jmp .Ldone8x
# Tail: fewer than 512 bytes left.  Dispatch on the largest 64-byte
# multiple we can still process whole; each branch XORs/stores that
# much, and if bytes remain it stashes the next 64 bytes of keystream
# at (%rsp)/32(%rsp) and falls into the byte-granular loop.
.Ltail8x:
cmpq $448,%rdx
jae .L448_or_more8x
cmpq $384,%rdx
jae .L384_or_more8x
cmpq $320,%rdx
jae .L320_or_more8x
cmpq $256,%rdx
jae .L256_or_more8x
cmpq $192,%rdx
jae .L192_or_more8x
cmpq $128,%rdx
jae .L128_or_more8x
cmpq $64,%rdx
jae .L64_or_more8x
xorq %r10,%r10                  # < 64 bytes: byte loop over first block
vmovdqa %ymm6,0(%rsp)
vmovdqa %ymm8,32(%rsp)
jmp .Loop_tail8x
.align 32
.L64_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
je .Ldone8x                     # ZF from cmpq $64: exactly 64 bytes
leaq 64(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm1,0(%rsp)
leaq 64(%rdi),%rdi
subq $64,%rdx
vmovdqa %ymm5,32(%rsp)
jmp .Loop_tail8x
.align 32
.L128_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
je .Ldone8x
leaq 128(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm12,0(%rsp)
leaq 128(%rdi),%rdi
subq $128,%rdx
vmovdqa %ymm13,32(%rsp)
jmp .Loop_tail8x
.align 32
.L192_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
je .Ldone8x
leaq 192(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm10,0(%rsp)
leaq 192(%rdi),%rdi
subq $192,%rdx
vmovdqa %ymm15,32(%rsp)
jmp .Loop_tail8x
.align 32
.L256_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
je .Ldone8x
leaq 256(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm14,0(%rsp)
leaq 256(%rdi),%rdi
subq $256,%rdx
vmovdqa %ymm2,32(%rsp)
jmp .Loop_tail8x
.align 32
.L320_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
je .Ldone8x
leaq 320(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm3,0(%rsp)
leaq 320(%rdi),%rdi
subq $320,%rdx
vmovdqa %ymm7,32(%rsp)
jmp .Loop_tail8x
.align 32
.L384_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
je .Ldone8x
leaq 384(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm11,0(%rsp)
leaq 384(%rdi),%rdi
subq $384,%rdx
vmovdqa %ymm9,32(%rsp)
jmp .Loop_tail8x
.align 32
.L448_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vpxor 384(%rsi),%ymm11,%ymm11
vpxor 416(%rsi),%ymm9,%ymm9
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
vmovdqu %ymm11,384(%rdi)
vmovdqu %ymm9,416(%rdi)
je .Ldone8x
leaq 448(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm0,0(%rsp)
leaq 448(%rdi),%rdi
subq $448,%rdx
vmovdqa %ymm4,32(%rsp)
# Byte loop: r10 indexes the staged keystream at (%rsp); rdx counts
# remaining bytes (< 64 on entry from every branch above).
.Loop_tail8x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)        # r10 already advanced, hence -1
decq %rdx
jnz .Loop_tail8x
.Ldone8x:
vzeroall                        # clears key-derived data from ymm regs
leaq (%r9),%rsp                 # restore caller rsp
.cfi_def_cfa_register rsp
.L8x_epilogue:
.byte 0xf3,0xc3                 # rep ret
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
  1511. #endif
  1512. .section .note.GNU-stack,"",@progbits