1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633 |
- // This file is generated from a similarly-named Perl script in the BoringSSL
- // source tree. Do not edit by hand.
- #if defined(__has_feature)
- #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
- #define OPENSSL_NO_ASM
- #endif
- #endif
- #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
- #if defined(BORINGSSL_PREFIX)
- #include <boringssl_prefix_symbols_asm.h>
- #endif
- .text
- .extern OPENSSL_ia32cap_P
- .hidden OPENSSL_ia32cap_P
- .align 64
- .Lzero:
- .long 0,0,0,0
- .Lone:
- .long 1,0,0,0
- .Linc:
- .long 0,1,2,3
- .Lfour:
- .long 4,4,4,4
- .Lincy:
- .long 0,2,4,6,1,3,5,7
- .Leight:
- .long 8,8,8,8,8,8,8,8
- .Lrot16:
- .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
- .Lrot24:
- .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
- .Lsigma:
- .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
- .align 64
- .Lzeroz:
- .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
- .Lfourz:
- .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
- .Lincz:
- .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
- .Lsixteen:
- .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
- .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
- .globl ChaCha20_ctr32
- .hidden ChaCha20_ctr32
- .type ChaCha20_ctr32,@function
- .align 64
- ChaCha20_ctr32:
- .cfi_startproc
- cmpq $0,%rdx
- je .Lno_data
- movq OPENSSL_ia32cap_P+4(%rip),%r10
- testl $512,%r10d
- jnz .LChaCha20_ssse3
- pushq %rbx
- .cfi_adjust_cfa_offset 8
- .cfi_offset rbx,-16
- pushq %rbp
- .cfi_adjust_cfa_offset 8
- .cfi_offset rbp,-24
- pushq %r12
- .cfi_adjust_cfa_offset 8
- .cfi_offset r12,-32
- pushq %r13
- .cfi_adjust_cfa_offset 8
- .cfi_offset r13,-40
- pushq %r14
- .cfi_adjust_cfa_offset 8
- .cfi_offset r14,-48
- pushq %r15
- .cfi_adjust_cfa_offset 8
- .cfi_offset r15,-56
- subq $64+24,%rsp
- .cfi_adjust_cfa_offset 88
- .Lctr32_body:
- movdqu (%rcx),%xmm1
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa .Lone(%rip),%xmm4
- movdqa %xmm1,16(%rsp)
- movdqa %xmm2,32(%rsp)
- movdqa %xmm3,48(%rsp)
- movq %rdx,%rbp
- jmp .Loop_outer
- .align 32
- .Loop_outer:
- movl $0x61707865,%eax
- movl $0x3320646e,%ebx
- movl $0x79622d32,%ecx
- movl $0x6b206574,%edx
- movl 16(%rsp),%r8d
- movl 20(%rsp),%r9d
- movl 24(%rsp),%r10d
- movl 28(%rsp),%r11d
- movd %xmm3,%r12d
- movl 52(%rsp),%r13d
- movl 56(%rsp),%r14d
- movl 60(%rsp),%r15d
- movq %rbp,64+0(%rsp)
- movl $10,%ebp
- movq %rsi,64+8(%rsp)
- .byte 102,72,15,126,214
- movq %rdi,64+16(%rsp)
- movq %rsi,%rdi
- shrq $32,%rdi
- jmp .Loop
- .align 32
- .Loop:
- addl %r8d,%eax
- xorl %eax,%r12d
- roll $16,%r12d
- addl %r9d,%ebx
- xorl %ebx,%r13d
- roll $16,%r13d
- addl %r12d,%esi
- xorl %esi,%r8d
- roll $12,%r8d
- addl %r13d,%edi
- xorl %edi,%r9d
- roll $12,%r9d
- addl %r8d,%eax
- xorl %eax,%r12d
- roll $8,%r12d
- addl %r9d,%ebx
- xorl %ebx,%r13d
- roll $8,%r13d
- addl %r12d,%esi
- xorl %esi,%r8d
- roll $7,%r8d
- addl %r13d,%edi
- xorl %edi,%r9d
- roll $7,%r9d
- movl %esi,32(%rsp)
- movl %edi,36(%rsp)
- movl 40(%rsp),%esi
- movl 44(%rsp),%edi
- addl %r10d,%ecx
- xorl %ecx,%r14d
- roll $16,%r14d
- addl %r11d,%edx
- xorl %edx,%r15d
- roll $16,%r15d
- addl %r14d,%esi
- xorl %esi,%r10d
- roll $12,%r10d
- addl %r15d,%edi
- xorl %edi,%r11d
- roll $12,%r11d
- addl %r10d,%ecx
- xorl %ecx,%r14d
- roll $8,%r14d
- addl %r11d,%edx
- xorl %edx,%r15d
- roll $8,%r15d
- addl %r14d,%esi
- xorl %esi,%r10d
- roll $7,%r10d
- addl %r15d,%edi
- xorl %edi,%r11d
- roll $7,%r11d
- addl %r9d,%eax
- xorl %eax,%r15d
- roll $16,%r15d
- addl %r10d,%ebx
- xorl %ebx,%r12d
- roll $16,%r12d
- addl %r15d,%esi
- xorl %esi,%r9d
- roll $12,%r9d
- addl %r12d,%edi
- xorl %edi,%r10d
- roll $12,%r10d
- addl %r9d,%eax
- xorl %eax,%r15d
- roll $8,%r15d
- addl %r10d,%ebx
- xorl %ebx,%r12d
- roll $8,%r12d
- addl %r15d,%esi
- xorl %esi,%r9d
- roll $7,%r9d
- addl %r12d,%edi
- xorl %edi,%r10d
- roll $7,%r10d
- movl %esi,40(%rsp)
- movl %edi,44(%rsp)
- movl 32(%rsp),%esi
- movl 36(%rsp),%edi
- addl %r11d,%ecx
- xorl %ecx,%r13d
- roll $16,%r13d
- addl %r8d,%edx
- xorl %edx,%r14d
- roll $16,%r14d
- addl %r13d,%esi
- xorl %esi,%r11d
- roll $12,%r11d
- addl %r14d,%edi
- xorl %edi,%r8d
- roll $12,%r8d
- addl %r11d,%ecx
- xorl %ecx,%r13d
- roll $8,%r13d
- addl %r8d,%edx
- xorl %edx,%r14d
- roll $8,%r14d
- addl %r13d,%esi
- xorl %esi,%r11d
- roll $7,%r11d
- addl %r14d,%edi
- xorl %edi,%r8d
- roll $7,%r8d
- decl %ebp
- jnz .Loop
- movl %edi,36(%rsp)
- movl %esi,32(%rsp)
- movq 64(%rsp),%rbp
- movdqa %xmm2,%xmm1
- movq 64+8(%rsp),%rsi
- paddd %xmm4,%xmm3
- movq 64+16(%rsp),%rdi
- addl $0x61707865,%eax
- addl $0x3320646e,%ebx
- addl $0x79622d32,%ecx
- addl $0x6b206574,%edx
- addl 16(%rsp),%r8d
- addl 20(%rsp),%r9d
- addl 24(%rsp),%r10d
- addl 28(%rsp),%r11d
- addl 48(%rsp),%r12d
- addl 52(%rsp),%r13d
- addl 56(%rsp),%r14d
- addl 60(%rsp),%r15d
- paddd 32(%rsp),%xmm1
- cmpq $64,%rbp
- jb .Ltail
- xorl 0(%rsi),%eax
- xorl 4(%rsi),%ebx
- xorl 8(%rsi),%ecx
- xorl 12(%rsi),%edx
- xorl 16(%rsi),%r8d
- xorl 20(%rsi),%r9d
- xorl 24(%rsi),%r10d
- xorl 28(%rsi),%r11d
- movdqu 32(%rsi),%xmm0
- xorl 48(%rsi),%r12d
- xorl 52(%rsi),%r13d
- xorl 56(%rsi),%r14d
- xorl 60(%rsi),%r15d
- leaq 64(%rsi),%rsi
- pxor %xmm1,%xmm0
- movdqa %xmm2,32(%rsp)
- movd %xmm3,48(%rsp)
- movl %eax,0(%rdi)
- movl %ebx,4(%rdi)
- movl %ecx,8(%rdi)
- movl %edx,12(%rdi)
- movl %r8d,16(%rdi)
- movl %r9d,20(%rdi)
- movl %r10d,24(%rdi)
- movl %r11d,28(%rdi)
- movdqu %xmm0,32(%rdi)
- movl %r12d,48(%rdi)
- movl %r13d,52(%rdi)
- movl %r14d,56(%rdi)
- movl %r15d,60(%rdi)
- leaq 64(%rdi),%rdi
- subq $64,%rbp
- jnz .Loop_outer
- jmp .Ldone
- .align 16
- .Ltail:
- movl %eax,0(%rsp)
- movl %ebx,4(%rsp)
- xorq %rbx,%rbx
- movl %ecx,8(%rsp)
- movl %edx,12(%rsp)
- movl %r8d,16(%rsp)
- movl %r9d,20(%rsp)
- movl %r10d,24(%rsp)
- movl %r11d,28(%rsp)
- movdqa %xmm1,32(%rsp)
- movl %r12d,48(%rsp)
- movl %r13d,52(%rsp)
- movl %r14d,56(%rsp)
- movl %r15d,60(%rsp)
- .Loop_tail:
- movzbl (%rsi,%rbx,1),%eax
- movzbl (%rsp,%rbx,1),%edx
- leaq 1(%rbx),%rbx
- xorl %edx,%eax
- movb %al,-1(%rdi,%rbx,1)
- decq %rbp
- jnz .Loop_tail
- .Ldone:
- leaq 64+24+48(%rsp),%rsi
- movq -48(%rsi),%r15
- .cfi_restore r15
- movq -40(%rsi),%r14
- .cfi_restore r14
- movq -32(%rsi),%r13
- .cfi_restore r13
- movq -24(%rsi),%r12
- .cfi_restore r12
- movq -16(%rsi),%rbp
- .cfi_restore rbp
- movq -8(%rsi),%rbx
- .cfi_restore rbx
- leaq (%rsi),%rsp
- .cfi_adjust_cfa_offset -136
- .Lno_data:
- .byte 0xf3,0xc3
- .cfi_endproc
- .size ChaCha20_ctr32,.-ChaCha20_ctr32
- .type ChaCha20_ssse3,@function
- .align 32
- ChaCha20_ssse3:
- .LChaCha20_ssse3:
- .cfi_startproc
- movq %rsp,%r9
- .cfi_def_cfa_register r9
- cmpq $128,%rdx
- ja .LChaCha20_4x
- .Ldo_sse3_after_all:
- subq $64+8,%rsp
- movdqa .Lsigma(%rip),%xmm0
- movdqu (%rcx),%xmm1
- movdqu 16(%rcx),%xmm2
- movdqu (%r8),%xmm3
- movdqa .Lrot16(%rip),%xmm6
- movdqa .Lrot24(%rip),%xmm7
- movdqa %xmm0,0(%rsp)
- movdqa %xmm1,16(%rsp)
- movdqa %xmm2,32(%rsp)
- movdqa %xmm3,48(%rsp)
- movq $10,%r8
- jmp .Loop_ssse3
- .align 32
- .Loop_outer_ssse3:
- movdqa .Lone(%rip),%xmm3
- movdqa 0(%rsp),%xmm0
- movdqa 16(%rsp),%xmm1
- movdqa 32(%rsp),%xmm2
- paddd 48(%rsp),%xmm3
- movq $10,%r8
- movdqa %xmm3,48(%rsp)
- jmp .Loop_ssse3
- .align 32
- .Loop_ssse3:
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- .byte 102,15,56,0,222
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- .byte 102,15,56,0,223
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $78,%xmm2,%xmm2
- pshufd $57,%xmm1,%xmm1
- pshufd $147,%xmm3,%xmm3
- nop
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- .byte 102,15,56,0,222
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $20,%xmm1
- pslld $12,%xmm4
- por %xmm4,%xmm1
- paddd %xmm1,%xmm0
- pxor %xmm0,%xmm3
- .byte 102,15,56,0,223
- paddd %xmm3,%xmm2
- pxor %xmm2,%xmm1
- movdqa %xmm1,%xmm4
- psrld $25,%xmm1
- pslld $7,%xmm4
- por %xmm4,%xmm1
- pshufd $78,%xmm2,%xmm2
- pshufd $147,%xmm1,%xmm1
- pshufd $57,%xmm3,%xmm3
- decq %r8
- jnz .Loop_ssse3
- paddd 0(%rsp),%xmm0
- paddd 16(%rsp),%xmm1
- paddd 32(%rsp),%xmm2
- paddd 48(%rsp),%xmm3
- cmpq $64,%rdx
- jb .Ltail_ssse3
- movdqu 0(%rsi),%xmm4
- movdqu 16(%rsi),%xmm5
- pxor %xmm4,%xmm0
- movdqu 32(%rsi),%xmm4
- pxor %xmm5,%xmm1
- movdqu 48(%rsi),%xmm5
- leaq 64(%rsi),%rsi
- pxor %xmm4,%xmm2
- pxor %xmm5,%xmm3
- movdqu %xmm0,0(%rdi)
- movdqu %xmm1,16(%rdi)
- movdqu %xmm2,32(%rdi)
- movdqu %xmm3,48(%rdi)
- leaq 64(%rdi),%rdi
- subq $64,%rdx
- jnz .Loop_outer_ssse3
- jmp .Ldone_ssse3
- .align 16
- .Ltail_ssse3:
- movdqa %xmm0,0(%rsp)
- movdqa %xmm1,16(%rsp)
- movdqa %xmm2,32(%rsp)
- movdqa %xmm3,48(%rsp)
- xorq %r8,%r8
- .Loop_tail_ssse3:
- movzbl (%rsi,%r8,1),%eax
- movzbl (%rsp,%r8,1),%ecx
- leaq 1(%r8),%r8
- xorl %ecx,%eax
- movb %al,-1(%rdi,%r8,1)
- decq %rdx
- jnz .Loop_tail_ssse3
- .Ldone_ssse3:
- leaq (%r9),%rsp
- .cfi_def_cfa_register rsp
- .Lssse3_epilogue:
- .byte 0xf3,0xc3
- .cfi_endproc
- .size ChaCha20_ssse3,.-ChaCha20_ssse3
- .type ChaCha20_4x,@function
- .align 32
- ChaCha20_4x:
- .LChaCha20_4x:
- .cfi_startproc
- movq %rsp,%r9
- .cfi_def_cfa_register r9
- movq %r10,%r11
- shrq $32,%r10
- testq $32,%r10
- jnz .LChaCha20_8x
- cmpq $192,%rdx
- ja .Lproceed4x
- andq $71303168,%r11
- cmpq $4194304,%r11
- je .Ldo_sse3_after_all
- .Lproceed4x:
- subq $0x140+8,%rsp
- movdqa .Lsigma(%rip),%xmm11
- movdqu (%rcx),%xmm15
- movdqu 16(%rcx),%xmm7
- movdqu (%r8),%xmm3
- leaq 256(%rsp),%rcx
- leaq .Lrot16(%rip),%r10
- leaq .Lrot24(%rip),%r11
- pshufd $0x00,%xmm11,%xmm8
- pshufd $0x55,%xmm11,%xmm9
- movdqa %xmm8,64(%rsp)
- pshufd $0xaa,%xmm11,%xmm10
- movdqa %xmm9,80(%rsp)
- pshufd $0xff,%xmm11,%xmm11
- movdqa %xmm10,96(%rsp)
- movdqa %xmm11,112(%rsp)
- pshufd $0x00,%xmm15,%xmm12
- pshufd $0x55,%xmm15,%xmm13
- movdqa %xmm12,128-256(%rcx)
- pshufd $0xaa,%xmm15,%xmm14
- movdqa %xmm13,144-256(%rcx)
- pshufd $0xff,%xmm15,%xmm15
- movdqa %xmm14,160-256(%rcx)
- movdqa %xmm15,176-256(%rcx)
- pshufd $0x00,%xmm7,%xmm4
- pshufd $0x55,%xmm7,%xmm5
- movdqa %xmm4,192-256(%rcx)
- pshufd $0xaa,%xmm7,%xmm6
- movdqa %xmm5,208-256(%rcx)
- pshufd $0xff,%xmm7,%xmm7
- movdqa %xmm6,224-256(%rcx)
- movdqa %xmm7,240-256(%rcx)
- pshufd $0x00,%xmm3,%xmm0
- pshufd $0x55,%xmm3,%xmm1
- paddd .Linc(%rip),%xmm0
- pshufd $0xaa,%xmm3,%xmm2
- movdqa %xmm1,272-256(%rcx)
- pshufd $0xff,%xmm3,%xmm3
- movdqa %xmm2,288-256(%rcx)
- movdqa %xmm3,304-256(%rcx)
- jmp .Loop_enter4x
- .align 32
- .Loop_outer4x:
- movdqa 64(%rsp),%xmm8
- movdqa 80(%rsp),%xmm9
- movdqa 96(%rsp),%xmm10
- movdqa 112(%rsp),%xmm11
- movdqa 128-256(%rcx),%xmm12
- movdqa 144-256(%rcx),%xmm13
- movdqa 160-256(%rcx),%xmm14
- movdqa 176-256(%rcx),%xmm15
- movdqa 192-256(%rcx),%xmm4
- movdqa 208-256(%rcx),%xmm5
- movdqa 224-256(%rcx),%xmm6
- movdqa 240-256(%rcx),%xmm7
- movdqa 256-256(%rcx),%xmm0
- movdqa 272-256(%rcx),%xmm1
- movdqa 288-256(%rcx),%xmm2
- movdqa 304-256(%rcx),%xmm3
- paddd .Lfour(%rip),%xmm0
- .Loop_enter4x:
- movdqa %xmm6,32(%rsp)
- movdqa %xmm7,48(%rsp)
- movdqa (%r10),%xmm7
- movl $10,%eax
- movdqa %xmm0,256-256(%rcx)
- jmp .Loop4x
- .align 32
- .Loop4x:
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
- .byte 102,15,56,0,199
- .byte 102,15,56,0,207
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm6
- pslld $12,%xmm12
- psrld $20,%xmm6
- movdqa %xmm13,%xmm7
- pslld $12,%xmm13
- por %xmm6,%xmm12
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm13
- paddd %xmm12,%xmm8
- paddd %xmm13,%xmm9
- pxor %xmm8,%xmm0
- pxor %xmm9,%xmm1
- .byte 102,15,56,0,198
- .byte 102,15,56,0,206
- paddd %xmm0,%xmm4
- paddd %xmm1,%xmm5
- pxor %xmm4,%xmm12
- pxor %xmm5,%xmm13
- movdqa %xmm12,%xmm7
- pslld $7,%xmm12
- psrld $25,%xmm7
- movdqa %xmm13,%xmm6
- pslld $7,%xmm13
- por %xmm7,%xmm12
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm13
- movdqa %xmm4,0(%rsp)
- movdqa %xmm5,16(%rsp)
- movdqa 32(%rsp),%xmm4
- movdqa 48(%rsp),%xmm5
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
- .byte 102,15,56,0,215
- .byte 102,15,56,0,223
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm6
- pslld $12,%xmm14
- psrld $20,%xmm6
- movdqa %xmm15,%xmm7
- pslld $12,%xmm15
- por %xmm6,%xmm14
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm15
- paddd %xmm14,%xmm10
- paddd %xmm15,%xmm11
- pxor %xmm10,%xmm2
- pxor %xmm11,%xmm3
- .byte 102,15,56,0,214
- .byte 102,15,56,0,222
- paddd %xmm2,%xmm4
- paddd %xmm3,%xmm5
- pxor %xmm4,%xmm14
- pxor %xmm5,%xmm15
- movdqa %xmm14,%xmm7
- pslld $7,%xmm14
- psrld $25,%xmm7
- movdqa %xmm15,%xmm6
- pslld $7,%xmm15
- por %xmm7,%xmm14
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm15
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
- .byte 102,15,56,0,223
- .byte 102,15,56,0,199
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm6
- pslld $12,%xmm13
- psrld $20,%xmm6
- movdqa %xmm14,%xmm7
- pslld $12,%xmm14
- por %xmm6,%xmm13
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm14
- paddd %xmm13,%xmm8
- paddd %xmm14,%xmm9
- pxor %xmm8,%xmm3
- pxor %xmm9,%xmm0
- .byte 102,15,56,0,222
- .byte 102,15,56,0,198
- paddd %xmm3,%xmm4
- paddd %xmm0,%xmm5
- pxor %xmm4,%xmm13
- pxor %xmm5,%xmm14
- movdqa %xmm13,%xmm7
- pslld $7,%xmm13
- psrld $25,%xmm7
- movdqa %xmm14,%xmm6
- pslld $7,%xmm14
- por %xmm7,%xmm13
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm14
- movdqa %xmm4,32(%rsp)
- movdqa %xmm5,48(%rsp)
- movdqa 0(%rsp),%xmm4
- movdqa 16(%rsp),%xmm5
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
- .byte 102,15,56,0,207
- .byte 102,15,56,0,215
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm6
- pslld $12,%xmm15
- psrld $20,%xmm6
- movdqa %xmm12,%xmm7
- pslld $12,%xmm12
- por %xmm6,%xmm15
- psrld $20,%xmm7
- movdqa (%r11),%xmm6
- por %xmm7,%xmm12
- paddd %xmm15,%xmm10
- paddd %xmm12,%xmm11
- pxor %xmm10,%xmm1
- pxor %xmm11,%xmm2
- .byte 102,15,56,0,206
- .byte 102,15,56,0,214
- paddd %xmm1,%xmm4
- paddd %xmm2,%xmm5
- pxor %xmm4,%xmm15
- pxor %xmm5,%xmm12
- movdqa %xmm15,%xmm7
- pslld $7,%xmm15
- psrld $25,%xmm7
- movdqa %xmm12,%xmm6
- pslld $7,%xmm12
- por %xmm7,%xmm15
- psrld $25,%xmm6
- movdqa (%r10),%xmm7
- por %xmm6,%xmm12
- decl %eax
- jnz .Loop4x
- paddd 64(%rsp),%xmm8
- paddd 80(%rsp),%xmm9
- paddd 96(%rsp),%xmm10
- paddd 112(%rsp),%xmm11
- movdqa %xmm8,%xmm6
- punpckldq %xmm9,%xmm8
- movdqa %xmm10,%xmm7
- punpckldq %xmm11,%xmm10
- punpckhdq %xmm9,%xmm6
- punpckhdq %xmm11,%xmm7
- movdqa %xmm8,%xmm9
- punpcklqdq %xmm10,%xmm8
- movdqa %xmm6,%xmm11
- punpcklqdq %xmm7,%xmm6
- punpckhqdq %xmm10,%xmm9
- punpckhqdq %xmm7,%xmm11
- paddd 128-256(%rcx),%xmm12
- paddd 144-256(%rcx),%xmm13
- paddd 160-256(%rcx),%xmm14
- paddd 176-256(%rcx),%xmm15
- movdqa %xmm8,0(%rsp)
- movdqa %xmm9,16(%rsp)
- movdqa 32(%rsp),%xmm8
- movdqa 48(%rsp),%xmm9
- movdqa %xmm12,%xmm10
- punpckldq %xmm13,%xmm12
- movdqa %xmm14,%xmm7
- punpckldq %xmm15,%xmm14
- punpckhdq %xmm13,%xmm10
- punpckhdq %xmm15,%xmm7
- movdqa %xmm12,%xmm13
- punpcklqdq %xmm14,%xmm12
- movdqa %xmm10,%xmm15
- punpcklqdq %xmm7,%xmm10
- punpckhqdq %xmm14,%xmm13
- punpckhqdq %xmm7,%xmm15
- paddd 192-256(%rcx),%xmm4
- paddd 208-256(%rcx),%xmm5
- paddd 224-256(%rcx),%xmm8
- paddd 240-256(%rcx),%xmm9
- movdqa %xmm6,32(%rsp)
- movdqa %xmm11,48(%rsp)
- movdqa %xmm4,%xmm14
- punpckldq %xmm5,%xmm4
- movdqa %xmm8,%xmm7
- punpckldq %xmm9,%xmm8
- punpckhdq %xmm5,%xmm14
- punpckhdq %xmm9,%xmm7
- movdqa %xmm4,%xmm5
- punpcklqdq %xmm8,%xmm4
- movdqa %xmm14,%xmm9
- punpcklqdq %xmm7,%xmm14
- punpckhqdq %xmm8,%xmm5
- punpckhqdq %xmm7,%xmm9
- paddd 256-256(%rcx),%xmm0
- paddd 272-256(%rcx),%xmm1
- paddd 288-256(%rcx),%xmm2
- paddd 304-256(%rcx),%xmm3
- movdqa %xmm0,%xmm8
- punpckldq %xmm1,%xmm0
- movdqa %xmm2,%xmm7
- punpckldq %xmm3,%xmm2
- punpckhdq %xmm1,%xmm8
- punpckhdq %xmm3,%xmm7
- movdqa %xmm0,%xmm1
- punpcklqdq %xmm2,%xmm0
- movdqa %xmm8,%xmm3
- punpcklqdq %xmm7,%xmm8
- punpckhqdq %xmm2,%xmm1
- punpckhqdq %xmm7,%xmm3
- cmpq $256,%rdx
- jb .Ltail4x
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- leaq 128(%rsi),%rsi
- pxor 16(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
- movdqu %xmm6,64(%rdi)
- movdqu 0(%rsi),%xmm6
- movdqu %xmm11,80(%rdi)
- movdqu 16(%rsi),%xmm11
- movdqu %xmm2,96(%rdi)
- movdqu 32(%rsi),%xmm2
- movdqu %xmm7,112(%rdi)
- leaq 128(%rdi),%rdi
- movdqu 48(%rsi),%xmm7
- pxor 32(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- leaq 128(%rsi),%rsi
- pxor 48(%rsp),%xmm6
- pxor %xmm15,%xmm11
- pxor %xmm9,%xmm2
- pxor %xmm3,%xmm7
- movdqu %xmm6,64(%rdi)
- movdqu %xmm11,80(%rdi)
- movdqu %xmm2,96(%rdi)
- movdqu %xmm7,112(%rdi)
- leaq 128(%rdi),%rdi
- subq $256,%rdx
- jnz .Loop_outer4x
- jmp .Ldone4x
- .Ltail4x:
- cmpq $192,%rdx
- jae .L192_or_more4x
- cmpq $128,%rdx
- jae .L128_or_more4x
- cmpq $64,%rdx
- jae .L64_or_more4x
- xorq %r10,%r10
- movdqa %xmm12,16(%rsp)
- movdqa %xmm4,32(%rsp)
- movdqa %xmm0,48(%rsp)
- jmp .Loop_tail4x
- .align 32
- .L64_or_more4x:
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu %xmm11,16(%rdi)
- movdqu %xmm2,32(%rdi)
- movdqu %xmm7,48(%rdi)
- je .Ldone4x
- movdqa 16(%rsp),%xmm6
- leaq 64(%rsi),%rsi
- xorq %r10,%r10
- movdqa %xmm6,0(%rsp)
- movdqa %xmm13,16(%rsp)
- leaq 64(%rdi),%rdi
- movdqa %xmm5,32(%rsp)
- subq $64,%rdx
- movdqa %xmm1,48(%rsp)
- jmp .Loop_tail4x
- .align 32
- .L128_or_more4x:
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- pxor 16(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
- movdqu %xmm6,64(%rdi)
- movdqu %xmm11,80(%rdi)
- movdqu %xmm2,96(%rdi)
- movdqu %xmm7,112(%rdi)
- je .Ldone4x
- movdqa 32(%rsp),%xmm6
- leaq 128(%rsi),%rsi
- xorq %r10,%r10
- movdqa %xmm6,0(%rsp)
- movdqa %xmm10,16(%rsp)
- leaq 128(%rdi),%rdi
- movdqa %xmm14,32(%rsp)
- subq $128,%rdx
- movdqa %xmm8,48(%rsp)
- jmp .Loop_tail4x
- .align 32
- .L192_or_more4x:
- movdqu 0(%rsi),%xmm6
- movdqu 16(%rsi),%xmm11
- movdqu 32(%rsi),%xmm2
- movdqu 48(%rsi),%xmm7
- pxor 0(%rsp),%xmm6
- pxor %xmm12,%xmm11
- pxor %xmm4,%xmm2
- pxor %xmm0,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu 64(%rsi),%xmm6
- movdqu %xmm11,16(%rdi)
- movdqu 80(%rsi),%xmm11
- movdqu %xmm2,32(%rdi)
- movdqu 96(%rsi),%xmm2
- movdqu %xmm7,48(%rdi)
- movdqu 112(%rsi),%xmm7
- leaq 128(%rsi),%rsi
- pxor 16(%rsp),%xmm6
- pxor %xmm13,%xmm11
- pxor %xmm5,%xmm2
- pxor %xmm1,%xmm7
- movdqu %xmm6,64(%rdi)
- movdqu 0(%rsi),%xmm6
- movdqu %xmm11,80(%rdi)
- movdqu 16(%rsi),%xmm11
- movdqu %xmm2,96(%rdi)
- movdqu 32(%rsi),%xmm2
- movdqu %xmm7,112(%rdi)
- leaq 128(%rdi),%rdi
- movdqu 48(%rsi),%xmm7
- pxor 32(%rsp),%xmm6
- pxor %xmm10,%xmm11
- pxor %xmm14,%xmm2
- pxor %xmm8,%xmm7
- movdqu %xmm6,0(%rdi)
- movdqu %xmm11,16(%rdi)
- movdqu %xmm2,32(%rdi)
- movdqu %xmm7,48(%rdi)
- je .Ldone4x
- movdqa 48(%rsp),%xmm6
- leaq 64(%rsi),%rsi
- xorq %r10,%r10
- movdqa %xmm6,0(%rsp)
- movdqa %xmm15,16(%rsp)
- leaq 64(%rdi),%rdi
- movdqa %xmm9,32(%rsp)
- subq $192,%rdx
- movdqa %xmm3,48(%rsp)
- .Loop_tail4x:
- movzbl (%rsi,%r10,1),%eax
- movzbl (%rsp,%r10,1),%ecx
- leaq 1(%r10),%r10
- xorl %ecx,%eax
- movb %al,-1(%rdi,%r10,1)
- decq %rdx
- jnz .Loop_tail4x
- .Ldone4x:
- leaq (%r9),%rsp
- .cfi_def_cfa_register rsp
- .L4x_epilogue:
- .byte 0xf3,0xc3
- .cfi_endproc
- .size ChaCha20_4x,.-ChaCha20_4x
- .type ChaCha20_8x,@function
- .align 32
- ChaCha20_8x:
- .LChaCha20_8x:
- .cfi_startproc
- movq %rsp,%r9
- .cfi_def_cfa_register r9
- subq $0x280+8,%rsp
- andq $-32,%rsp
- vzeroupper
- vbroadcasti128 .Lsigma(%rip),%ymm11
- vbroadcasti128 (%rcx),%ymm3
- vbroadcasti128 16(%rcx),%ymm15
- vbroadcasti128 (%r8),%ymm7
- leaq 256(%rsp),%rcx
- leaq 512(%rsp),%rax
- leaq .Lrot16(%rip),%r10
- leaq .Lrot24(%rip),%r11
- vpshufd $0x00,%ymm11,%ymm8
- vpshufd $0x55,%ymm11,%ymm9
- vmovdqa %ymm8,128-256(%rcx)
- vpshufd $0xaa,%ymm11,%ymm10
- vmovdqa %ymm9,160-256(%rcx)
- vpshufd $0xff,%ymm11,%ymm11
- vmovdqa %ymm10,192-256(%rcx)
- vmovdqa %ymm11,224-256(%rcx)
- vpshufd $0x00,%ymm3,%ymm0
- vpshufd $0x55,%ymm3,%ymm1
- vmovdqa %ymm0,256-256(%rcx)
- vpshufd $0xaa,%ymm3,%ymm2
- vmovdqa %ymm1,288-256(%rcx)
- vpshufd $0xff,%ymm3,%ymm3
- vmovdqa %ymm2,320-256(%rcx)
- vmovdqa %ymm3,352-256(%rcx)
- vpshufd $0x00,%ymm15,%ymm12
- vpshufd $0x55,%ymm15,%ymm13
- vmovdqa %ymm12,384-512(%rax)
- vpshufd $0xaa,%ymm15,%ymm14
- vmovdqa %ymm13,416-512(%rax)
- vpshufd $0xff,%ymm15,%ymm15
- vmovdqa %ymm14,448-512(%rax)
- vmovdqa %ymm15,480-512(%rax)
- vpshufd $0x00,%ymm7,%ymm4
- vpshufd $0x55,%ymm7,%ymm5
- vpaddd .Lincy(%rip),%ymm4,%ymm4
- vpshufd $0xaa,%ymm7,%ymm6
- vmovdqa %ymm5,544-512(%rax)
- vpshufd $0xff,%ymm7,%ymm7
- vmovdqa %ymm6,576-512(%rax)
- vmovdqa %ymm7,608-512(%rax)
- jmp .Loop_enter8x
- .align 32
- .Loop_outer8x:
- vmovdqa 128-256(%rcx),%ymm8
- vmovdqa 160-256(%rcx),%ymm9
- vmovdqa 192-256(%rcx),%ymm10
- vmovdqa 224-256(%rcx),%ymm11
- vmovdqa 256-256(%rcx),%ymm0
- vmovdqa 288-256(%rcx),%ymm1
- vmovdqa 320-256(%rcx),%ymm2
- vmovdqa 352-256(%rcx),%ymm3
- vmovdqa 384-512(%rax),%ymm12
- vmovdqa 416-512(%rax),%ymm13
- vmovdqa 448-512(%rax),%ymm14
- vmovdqa 480-512(%rax),%ymm15
- vmovdqa 512-512(%rax),%ymm4
- vmovdqa 544-512(%rax),%ymm5
- vmovdqa 576-512(%rax),%ymm6
- vmovdqa 608-512(%rax),%ymm7
- vpaddd .Leight(%rip),%ymm4,%ymm4
- .Loop_enter8x:
- vmovdqa %ymm14,64(%rsp)
- vmovdqa %ymm15,96(%rsp)
- vbroadcasti128 (%r10),%ymm15
- vmovdqa %ymm4,512-512(%rax)
- movl $10,%eax
- jmp .Loop8x
- .align 32
- .Loop8x:
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $12,%ymm0,%ymm14
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $12,%ymm1,%ymm15
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vpaddd %ymm0,%ymm8,%ymm8
- vpxor %ymm4,%ymm8,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm1,%ymm9,%ymm9
- vpxor %ymm5,%ymm9,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm4,%ymm12,%ymm12
- vpxor %ymm0,%ymm12,%ymm0
- vpslld $7,%ymm0,%ymm15
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm5,%ymm13,%ymm13
- vpxor %ymm1,%ymm13,%ymm1
- vpslld $7,%ymm1,%ymm14
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vmovdqa %ymm12,0(%rsp)
- vmovdqa %ymm13,32(%rsp)
- vmovdqa 64(%rsp),%ymm12
- vmovdqa 96(%rsp),%ymm13
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $12,%ymm2,%ymm14
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $12,%ymm3,%ymm15
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vpaddd %ymm2,%ymm10,%ymm10
- vpxor %ymm6,%ymm10,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm3,%ymm11,%ymm11
- vpxor %ymm7,%ymm11,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm6,%ymm12,%ymm12
- vpxor %ymm2,%ymm12,%ymm2
- vpslld $7,%ymm2,%ymm15
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm7,%ymm13,%ymm13
- vpxor %ymm3,%ymm13,%ymm3
- vpslld $7,%ymm3,%ymm14
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm15,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm15,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $12,%ymm1,%ymm14
- vpsrld $20,%ymm1,%ymm1
- vpor %ymm1,%ymm14,%ymm1
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $12,%ymm2,%ymm15
- vpsrld $20,%ymm2,%ymm2
- vpor %ymm2,%ymm15,%ymm2
- vpaddd %ymm1,%ymm8,%ymm8
- vpxor %ymm7,%ymm8,%ymm7
- vpshufb %ymm14,%ymm7,%ymm7
- vpaddd %ymm2,%ymm9,%ymm9
- vpxor %ymm4,%ymm9,%ymm4
- vpshufb %ymm14,%ymm4,%ymm4
- vpaddd %ymm7,%ymm12,%ymm12
- vpxor %ymm1,%ymm12,%ymm1
- vpslld $7,%ymm1,%ymm15
- vpsrld $25,%ymm1,%ymm1
- vpor %ymm1,%ymm15,%ymm1
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm4,%ymm13,%ymm13
- vpxor %ymm2,%ymm13,%ymm2
- vpslld $7,%ymm2,%ymm14
- vpsrld $25,%ymm2,%ymm2
- vpor %ymm2,%ymm14,%ymm2
- vmovdqa %ymm12,64(%rsp)
- vmovdqa %ymm13,96(%rsp)
- vmovdqa 0(%rsp),%ymm12
- vmovdqa 32(%rsp),%ymm13
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm15,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm15,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $12,%ymm3,%ymm14
- vpsrld $20,%ymm3,%ymm3
- vpor %ymm3,%ymm14,%ymm3
- vbroadcasti128 (%r11),%ymm14
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $12,%ymm0,%ymm15
- vpsrld $20,%ymm0,%ymm0
- vpor %ymm0,%ymm15,%ymm0
- vpaddd %ymm3,%ymm10,%ymm10
- vpxor %ymm5,%ymm10,%ymm5
- vpshufb %ymm14,%ymm5,%ymm5
- vpaddd %ymm0,%ymm11,%ymm11
- vpxor %ymm6,%ymm11,%ymm6
- vpshufb %ymm14,%ymm6,%ymm6
- vpaddd %ymm5,%ymm12,%ymm12
- vpxor %ymm3,%ymm12,%ymm3
- vpslld $7,%ymm3,%ymm15
- vpsrld $25,%ymm3,%ymm3
- vpor %ymm3,%ymm15,%ymm3
- vbroadcasti128 (%r10),%ymm15
- vpaddd %ymm6,%ymm13,%ymm13
- vpxor %ymm0,%ymm13,%ymm0
- vpslld $7,%ymm0,%ymm14
- vpsrld $25,%ymm0,%ymm0
- vpor %ymm0,%ymm14,%ymm0
- decl %eax
- jnz .Loop8x
- leaq 512(%rsp),%rax
- vpaddd 128-256(%rcx),%ymm8,%ymm8
- vpaddd 160-256(%rcx),%ymm9,%ymm9
- vpaddd 192-256(%rcx),%ymm10,%ymm10
- vpaddd 224-256(%rcx),%ymm11,%ymm11
- vpunpckldq %ymm9,%ymm8,%ymm14
- vpunpckldq %ymm11,%ymm10,%ymm15
- vpunpckhdq %ymm9,%ymm8,%ymm8
- vpunpckhdq %ymm11,%ymm10,%ymm10
- vpunpcklqdq %ymm15,%ymm14,%ymm9
- vpunpckhqdq %ymm15,%ymm14,%ymm14
- vpunpcklqdq %ymm10,%ymm8,%ymm11
- vpunpckhqdq %ymm10,%ymm8,%ymm8
- vpaddd 256-256(%rcx),%ymm0,%ymm0
- vpaddd 288-256(%rcx),%ymm1,%ymm1
- vpaddd 320-256(%rcx),%ymm2,%ymm2
- vpaddd 352-256(%rcx),%ymm3,%ymm3
- vpunpckldq %ymm1,%ymm0,%ymm10
- vpunpckldq %ymm3,%ymm2,%ymm15
- vpunpckhdq %ymm1,%ymm0,%ymm0
- vpunpckhdq %ymm3,%ymm2,%ymm2
- vpunpcklqdq %ymm15,%ymm10,%ymm1
- vpunpckhqdq %ymm15,%ymm10,%ymm10
- vpunpcklqdq %ymm2,%ymm0,%ymm3
- vpunpckhqdq %ymm2,%ymm0,%ymm0
- vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
- vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
- vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
- vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
- vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
- vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
- vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
- vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
- vmovdqa %ymm15,0(%rsp)
- vmovdqa %ymm9,32(%rsp)
- vmovdqa 64(%rsp),%ymm15
- vmovdqa 96(%rsp),%ymm9
- vpaddd 384-512(%rax),%ymm12,%ymm12
- vpaddd 416-512(%rax),%ymm13,%ymm13
- vpaddd 448-512(%rax),%ymm15,%ymm15
- vpaddd 480-512(%rax),%ymm9,%ymm9
- vpunpckldq %ymm13,%ymm12,%ymm2
- vpunpckldq %ymm9,%ymm15,%ymm8
- vpunpckhdq %ymm13,%ymm12,%ymm12
- vpunpckhdq %ymm9,%ymm15,%ymm15
- vpunpcklqdq %ymm8,%ymm2,%ymm13
- vpunpckhqdq %ymm8,%ymm2,%ymm2
- vpunpcklqdq %ymm15,%ymm12,%ymm9
- vpunpckhqdq %ymm15,%ymm12,%ymm12
- vpaddd 512-512(%rax),%ymm4,%ymm4
- vpaddd 544-512(%rax),%ymm5,%ymm5
- vpaddd 576-512(%rax),%ymm6,%ymm6
- vpaddd 608-512(%rax),%ymm7,%ymm7
- vpunpckldq %ymm5,%ymm4,%ymm15
- vpunpckldq %ymm7,%ymm6,%ymm8
- vpunpckhdq %ymm5,%ymm4,%ymm4
- vpunpckhdq %ymm7,%ymm6,%ymm6
- vpunpcklqdq %ymm8,%ymm15,%ymm5
- vpunpckhqdq %ymm8,%ymm15,%ymm15
- vpunpcklqdq %ymm6,%ymm4,%ymm7
- vpunpckhqdq %ymm6,%ymm4,%ymm4
- vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
- vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
- vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
- vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
- vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
- vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
- vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
- vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
- vmovdqa 0(%rsp),%ymm6
- vmovdqa 32(%rsp),%ymm12
- cmpq $512,%rdx
- jb .Ltail8x
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- leaq 128(%rsi),%rsi
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- leaq 128(%rdi),%rdi
- vpxor 0(%rsi),%ymm12,%ymm12
- vpxor 32(%rsi),%ymm13,%ymm13
- vpxor 64(%rsi),%ymm10,%ymm10
- vpxor 96(%rsi),%ymm15,%ymm15
- leaq 128(%rsi),%rsi
- vmovdqu %ymm12,0(%rdi)
- vmovdqu %ymm13,32(%rdi)
- vmovdqu %ymm10,64(%rdi)
- vmovdqu %ymm15,96(%rdi)
- leaq 128(%rdi),%rdi
- vpxor 0(%rsi),%ymm14,%ymm14
- vpxor 32(%rsi),%ymm2,%ymm2
- vpxor 64(%rsi),%ymm3,%ymm3
- vpxor 96(%rsi),%ymm7,%ymm7
- leaq 128(%rsi),%rsi
- vmovdqu %ymm14,0(%rdi)
- vmovdqu %ymm2,32(%rdi)
- vmovdqu %ymm3,64(%rdi)
- vmovdqu %ymm7,96(%rdi)
- leaq 128(%rdi),%rdi
- vpxor 0(%rsi),%ymm11,%ymm11
- vpxor 32(%rsi),%ymm9,%ymm9
- vpxor 64(%rsi),%ymm0,%ymm0
- vpxor 96(%rsi),%ymm4,%ymm4
- leaq 128(%rsi),%rsi
- vmovdqu %ymm11,0(%rdi)
- vmovdqu %ymm9,32(%rdi)
- vmovdqu %ymm0,64(%rdi)
- vmovdqu %ymm4,96(%rdi)
- leaq 128(%rdi),%rdi
- subq $512,%rdx
- jnz .Loop_outer8x
- jmp .Ldone8x
- .Ltail8x:
- cmpq $448,%rdx
- jae .L448_or_more8x
- cmpq $384,%rdx
- jae .L384_or_more8x
- cmpq $320,%rdx
- jae .L320_or_more8x
- cmpq $256,%rdx
- jae .L256_or_more8x
- cmpq $192,%rdx
- jae .L192_or_more8x
- cmpq $128,%rdx
- jae .L128_or_more8x
- cmpq $64,%rdx
- jae .L64_or_more8x
- xorq %r10,%r10
- vmovdqa %ymm6,0(%rsp)
- vmovdqa %ymm8,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L64_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- je .Ldone8x
- leaq 64(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm1,0(%rsp)
- leaq 64(%rdi),%rdi
- subq $64,%rdx
- vmovdqa %ymm5,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L128_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- je .Ldone8x
- leaq 128(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm12,0(%rsp)
- leaq 128(%rdi),%rdi
- subq $128,%rdx
- vmovdqa %ymm13,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L192_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- je .Ldone8x
- leaq 192(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm10,0(%rsp)
- leaq 192(%rdi),%rdi
- subq $192,%rdx
- vmovdqa %ymm15,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L256_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- je .Ldone8x
- leaq 256(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm14,0(%rsp)
- leaq 256(%rdi),%rdi
- subq $256,%rdx
- vmovdqa %ymm2,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L320_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vpxor 256(%rsi),%ymm14,%ymm14
- vpxor 288(%rsi),%ymm2,%ymm2
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- vmovdqu %ymm14,256(%rdi)
- vmovdqu %ymm2,288(%rdi)
- je .Ldone8x
- leaq 320(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm3,0(%rsp)
- leaq 320(%rdi),%rdi
- subq $320,%rdx
- vmovdqa %ymm7,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L384_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vpxor 256(%rsi),%ymm14,%ymm14
- vpxor 288(%rsi),%ymm2,%ymm2
- vpxor 320(%rsi),%ymm3,%ymm3
- vpxor 352(%rsi),%ymm7,%ymm7
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- vmovdqu %ymm14,256(%rdi)
- vmovdqu %ymm2,288(%rdi)
- vmovdqu %ymm3,320(%rdi)
- vmovdqu %ymm7,352(%rdi)
- je .Ldone8x
- leaq 384(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm11,0(%rsp)
- leaq 384(%rdi),%rdi
- subq $384,%rdx
- vmovdqa %ymm9,32(%rsp)
- jmp .Loop_tail8x
- .align 32
- .L448_or_more8x:
- vpxor 0(%rsi),%ymm6,%ymm6
- vpxor 32(%rsi),%ymm8,%ymm8
- vpxor 64(%rsi),%ymm1,%ymm1
- vpxor 96(%rsi),%ymm5,%ymm5
- vpxor 128(%rsi),%ymm12,%ymm12
- vpxor 160(%rsi),%ymm13,%ymm13
- vpxor 192(%rsi),%ymm10,%ymm10
- vpxor 224(%rsi),%ymm15,%ymm15
- vpxor 256(%rsi),%ymm14,%ymm14
- vpxor 288(%rsi),%ymm2,%ymm2
- vpxor 320(%rsi),%ymm3,%ymm3
- vpxor 352(%rsi),%ymm7,%ymm7
- vpxor 384(%rsi),%ymm11,%ymm11
- vpxor 416(%rsi),%ymm9,%ymm9
- vmovdqu %ymm6,0(%rdi)
- vmovdqu %ymm8,32(%rdi)
- vmovdqu %ymm1,64(%rdi)
- vmovdqu %ymm5,96(%rdi)
- vmovdqu %ymm12,128(%rdi)
- vmovdqu %ymm13,160(%rdi)
- vmovdqu %ymm10,192(%rdi)
- vmovdqu %ymm15,224(%rdi)
- vmovdqu %ymm14,256(%rdi)
- vmovdqu %ymm2,288(%rdi)
- vmovdqu %ymm3,320(%rdi)
- vmovdqu %ymm7,352(%rdi)
- vmovdqu %ymm11,384(%rdi)
- vmovdqu %ymm9,416(%rdi)
- je .Ldone8x
- leaq 448(%rsi),%rsi
- xorq %r10,%r10
- vmovdqa %ymm0,0(%rsp)
- leaq 448(%rdi),%rdi
- subq $448,%rdx
- vmovdqa %ymm4,32(%rsp)
- .Loop_tail8x:
- movzbl (%rsi,%r10,1),%eax
- movzbl (%rsp,%r10,1),%ecx
- leaq 1(%r10),%r10
- xorl %ecx,%eax
- movb %al,-1(%rdi,%r10,1)
- decq %rdx
- jnz .Loop_tail8x
- .Ldone8x:
- vzeroall
- leaq (%r9),%rsp
- .cfi_def_cfa_register rsp
- .L8x_epilogue:
- .byte 0xf3,0xc3
- .cfi_endproc
- .size ChaCha20_8x,.-ChaCha20_8x
- #endif
- .section .note.GNU-stack,"",@progbits
|