chacha-x86_64.S

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text
.p2align 6
L$zero:
.long 0,0,0,0
L$one:
.long 1,0,0,0
L$inc:
.long 0,1,2,3
L$four:
.long 4,4,4,4
L$incy:
.long 0,2,4,6,1,3,5,7
L$eight:
.long 8,8,8,8,8,8,8,8
L$rot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
L$rot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
L$sigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.p2align 6
L$zeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
L$fourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
L$incz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
L$sixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
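// Constant pool above: L$sigma spells "expand 32-byte k" (the ChaCha
// constants 0x61707865,0x3320646e,0x79622d32,0x6b206574); L$rot16 and
// L$rot24 are pshufb masks performing the quarter-round's 16-bit and
// 8-bit left rotations; L$one/L$four/L$inc/L$incy/L$eight (and the *z
// variants) hold block-counter increments for the 1x, 4x and 8x paths.
// The trailing byte string is the CRYPTOGAMS attribution.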
.globl _ChaCha20_ctr32
.private_extern _ChaCha20_ctr32
.p2align 6
_ChaCha20_ctr32:
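// Scalar path. Per the SysV AMD64 ABI the arguments are rdi = out,
// rsi = in, rdx = in_len, rcx = key, r8 = counter; as a sketch, the
// C-level prototype inferred from the register usage below is:
//
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
//
// The load below fetches the cached CPUID leaf-1 ECX flags from
// OPENSSL_ia32cap_P; bit 9 ($512) means SSSE3 is available, in which
// case the vector implementations take over.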
cmpq $0,%rdx
je L$no_data
movq _OPENSSL_ia32cap_P+4(%rip),%r10
testl $512,%r10d
jnz L$ChaCha20_ssse3
pushq %rbx
pushq %rbp
pushq %r12
pushq %r13
pushq %r14
pushq %r15
subq $64+24,%rsp
L$ctr32_body:
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
movdqu (%r8),%xmm3
movdqa L$one(%rip),%xmm4
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
movq %rdx,%rbp
jmp L$oop_outer
.p2align 5
L$oop_outer:
movl $0x61707865,%eax
movl $0x3320646e,%ebx
movl $0x79622d32,%ecx
movl $0x6b206574,%edx
movl 16(%rsp),%r8d
movl 20(%rsp),%r9d
movl 24(%rsp),%r10d
movl 28(%rsp),%r11d
movd %xmm3,%r12d
movl 52(%rsp),%r13d
movl 56(%rsp),%r14d
movl 60(%rsp),%r15d
movq %rbp,64+0(%rsp)
movl $10,%ebp
movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214
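// The .byte sequence above encodes movq %xmm2,%rsi, spelled out for
// assemblers that mishandle the 64-bit movd/movq mnemonic.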
movq %rdi,64+16(%rsp)
movq %rsi,%rdi
shrq $32,%rdi
jmp L$oop
.p2align 5
L$oop:
addl %r8d,%eax
xorl %eax,%r12d
roll $16,%r12d
addl %r9d,%ebx
xorl %ebx,%r13d
roll $16,%r13d
addl %r12d,%esi
xorl %esi,%r8d
roll $12,%r8d
addl %r13d,%edi
xorl %edi,%r9d
roll $12,%r9d
addl %r8d,%eax
xorl %eax,%r12d
roll $8,%r12d
addl %r9d,%ebx
xorl %ebx,%r13d
roll $8,%r13d
addl %r12d,%esi
xorl %esi,%r8d
roll $7,%r8d
addl %r13d,%edi
xorl %edi,%r9d
roll $7,%r9d
movl %esi,32(%rsp)
movl %edi,36(%rsp)
movl 40(%rsp),%esi
movl 44(%rsp),%edi
addl %r10d,%ecx
xorl %ecx,%r14d
roll $16,%r14d
addl %r11d,%edx
xorl %edx,%r15d
roll $16,%r15d
addl %r14d,%esi
xorl %esi,%r10d
roll $12,%r10d
addl %r15d,%edi
xorl %edi,%r11d
roll $12,%r11d
addl %r10d,%ecx
xorl %ecx,%r14d
roll $8,%r14d
addl %r11d,%edx
xorl %edx,%r15d
roll $8,%r15d
addl %r14d,%esi
xorl %esi,%r10d
roll $7,%r10d
addl %r15d,%edi
xorl %edi,%r11d
roll $7,%r11d
addl %r9d,%eax
xorl %eax,%r15d
roll $16,%r15d
addl %r10d,%ebx
xorl %ebx,%r12d
roll $16,%r12d
addl %r15d,%esi
xorl %esi,%r9d
roll $12,%r9d
addl %r12d,%edi
xorl %edi,%r10d
roll $12,%r10d
addl %r9d,%eax
xorl %eax,%r15d
roll $8,%r15d
addl %r10d,%ebx
xorl %ebx,%r12d
roll $8,%r12d
addl %r15d,%esi
xorl %esi,%r9d
roll $7,%r9d
addl %r12d,%edi
xorl %edi,%r10d
roll $7,%r10d
movl %esi,40(%rsp)
movl %edi,44(%rsp)
movl 32(%rsp),%esi
movl 36(%rsp),%edi
addl %r11d,%ecx
xorl %ecx,%r13d
roll $16,%r13d
addl %r8d,%edx
xorl %edx,%r14d
roll $16,%r14d
addl %r13d,%esi
xorl %esi,%r11d
roll $12,%r11d
addl %r14d,%edi
xorl %edi,%r8d
roll $12,%r8d
addl %r11d,%ecx
xorl %ecx,%r13d
roll $8,%r13d
addl %r8d,%edx
xorl %edx,%r14d
roll $8,%r14d
addl %r13d,%esi
xorl %esi,%r11d
roll $7,%r11d
addl %r14d,%edi
xorl %edi,%r8d
roll $7,%r8d
decl %ebp
jnz L$oop
movl %edi,36(%rsp)
movl %esi,32(%rsp)
movq 64(%rsp),%rbp
movdqa %xmm2,%xmm1
movq 64+8(%rsp),%rsi
paddd %xmm4,%xmm3
movq 64+16(%rsp),%rdi
addl $0x61707865,%eax
addl $0x3320646e,%ebx
addl $0x79622d32,%ecx
addl $0x6b206574,%edx
addl 16(%rsp),%r8d
addl 20(%rsp),%r9d
addl 24(%rsp),%r10d
addl 28(%rsp),%r11d
addl 48(%rsp),%r12d
addl 52(%rsp),%r13d
addl 56(%rsp),%r14d
addl 60(%rsp),%r15d
paddd 32(%rsp),%xmm1
cmpq $64,%rbp
jb L$tail
xorl 0(%rsi),%eax
xorl 4(%rsi),%ebx
xorl 8(%rsi),%ecx
xorl 12(%rsi),%edx
xorl 16(%rsi),%r8d
xorl 20(%rsi),%r9d
xorl 24(%rsi),%r10d
xorl 28(%rsi),%r11d
movdqu 32(%rsi),%xmm0
xorl 48(%rsi),%r12d
xorl 52(%rsi),%r13d
xorl 56(%rsi),%r14d
xorl 60(%rsi),%r15d
leaq 64(%rsi),%rsi
pxor %xmm1,%xmm0
movdqa %xmm2,32(%rsp)
movd %xmm3,48(%rsp)
movl %eax,0(%rdi)
movl %ebx,4(%rdi)
movl %ecx,8(%rdi)
movl %edx,12(%rdi)
movl %r8d,16(%rdi)
movl %r9d,20(%rdi)
movl %r10d,24(%rdi)
movl %r11d,28(%rdi)
movdqu %xmm0,32(%rdi)
movl %r12d,48(%rdi)
movl %r13d,52(%rdi)
movl %r14d,56(%rdi)
movl %r15d,60(%rdi)
leaq 64(%rdi),%rdi
subq $64,%rbp
jnz L$oop_outer
jmp L$done
.p2align 4
L$tail:
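// Fewer than 64 bytes remain: spill the keystream block to the stack,
// then XOR it into the output one byte at a time in L$oop_tail.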
movl %eax,0(%rsp)
movl %ebx,4(%rsp)
xorq %rbx,%rbx
movl %ecx,8(%rsp)
movl %edx,12(%rsp)
movl %r8d,16(%rsp)
movl %r9d,20(%rsp)
movl %r10d,24(%rsp)
movl %r11d,28(%rsp)
movdqa %xmm1,32(%rsp)
movl %r12d,48(%rsp)
movl %r13d,52(%rsp)
movl %r14d,56(%rsp)
movl %r15d,60(%rsp)
L$oop_tail:
movzbl (%rsi,%rbx,1),%eax
movzbl (%rsp,%rbx,1),%edx
leaq 1(%rbx),%rbx
xorl %edx,%eax
movb %al,-1(%rdi,%rbx,1)
decq %rbp
jnz L$oop_tail
L$done:
leaq 64+24+48(%rsp),%rsi
movq -48(%rsi),%r15
movq -40(%rsi),%r14
movq -32(%rsi),%r13
movq -24(%rsi),%r12
movq -16(%rsi),%rbp
movq -8(%rsi),%rbx
leaq (%rsi),%rsp
L$no_data:
.byte 0xf3,0xc3
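// 0xf3,0xc3 encodes "rep ret", the two-byte return historically
// recommended for AMD branch predictors; the same idiom ends each
// routine below.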
.p2align 5
ChaCha20_ssse3:
L$ChaCha20_ssse3:
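// SSSE3 path, one 64-byte block per iteration: the 4x4 state occupies
// xmm0..xmm3, one row per register. Inputs longer than 128 bytes divert
// to the 4-way path.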
movq %rsp,%r9
cmpq $128,%rdx
ja L$ChaCha20_4x
L$do_sse3_after_all:
subq $64+8,%rsp
movdqa L$sigma(%rip),%xmm0
movdqu (%rcx),%xmm1
movdqu 16(%rcx),%xmm2
movdqu (%r8),%xmm3
movdqa L$rot16(%rip),%xmm6
movdqa L$rot24(%rip),%xmm7
movdqa %xmm0,0(%rsp)
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
movq $10,%r8
jmp L$oop_ssse3
.p2align 5
L$oop_outer_ssse3:
movdqa L$one(%rip),%xmm3
movdqa 0(%rsp),%xmm0
movdqa 16(%rsp),%xmm1
movdqa 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
movq $10,%r8
movdqa %xmm3,48(%rsp)
jmp L$oop_ssse3
.p2align 5
L$oop_ssse3:
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,222
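// The .byte sequences 102,15,56,0,222 / 102,15,56,0,223 in this loop
// encode pshufb %xmm6,%xmm3 and pshufb %xmm7,%xmm3, i.e. the L$rot16
// (rotate left 16) and L$rot24 (rotate left 8) byte shuffles.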
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $20,%xmm1
pslld $12,%xmm4
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,223
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $25,%xmm1
pslld $7,%xmm4
por %xmm4,%xmm1
pshufd $78,%xmm2,%xmm2
pshufd $57,%xmm1,%xmm1
pshufd $147,%xmm3,%xmm3
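// pshufd $57/$78/$147 ($0x39/$0x4e/$0x93) rotate the 32-bit lanes by
// one, two and three positions, remapping columns to diagonals for the
// second half-round (and back again below).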
nop
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,222
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $20,%xmm1
pslld $12,%xmm4
por %xmm4,%xmm1
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
.byte 102,15,56,0,223
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm4
psrld $25,%xmm1
pslld $7,%xmm4
por %xmm4,%xmm1
pshufd $78,%xmm2,%xmm2
pshufd $147,%xmm1,%xmm1
pshufd $57,%xmm3,%xmm3
decq %r8
jnz L$oop_ssse3
paddd 0(%rsp),%xmm0
paddd 16(%rsp),%xmm1
paddd 32(%rsp),%xmm2
paddd 48(%rsp),%xmm3
cmpq $64,%rdx
jb L$tail_ssse3
movdqu 0(%rsi),%xmm4
movdqu 16(%rsi),%xmm5
pxor %xmm4,%xmm0
movdqu 32(%rsi),%xmm4
pxor %xmm5,%xmm1
movdqu 48(%rsi),%xmm5
leaq 64(%rsi),%rsi
pxor %xmm4,%xmm2
pxor %xmm5,%xmm3
movdqu %xmm0,0(%rdi)
movdqu %xmm1,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm3,48(%rdi)
leaq 64(%rdi),%rdi
subq $64,%rdx
jnz L$oop_outer_ssse3
jmp L$done_ssse3
.p2align 4
L$tail_ssse3:
movdqa %xmm0,0(%rsp)
movdqa %xmm1,16(%rsp)
movdqa %xmm2,32(%rsp)
movdqa %xmm3,48(%rsp)
xorq %r8,%r8
L$oop_tail_ssse3:
movzbl (%rsi,%r8,1),%eax
movzbl (%rsp,%r8,1),%ecx
leaq 1(%r8),%r8
xorl %ecx,%eax
movb %al,-1(%rdi,%r8,1)
decq %rdx
jnz L$oop_tail_ssse3
L$done_ssse3:
leaq (%r9),%rsp
L$ssse3_epilogue:
.byte 0xf3,0xc3
.p2align 5
ChaCha20_4x:
L$ChaCha20_4x:
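// 4-way SSSE3 path: four blocks are processed word-sliced, each xmm
// register holding one state word from all four blocks. The AVX2 test
// (leaf-7 EBX bit 5, cached in the upper half of r10) diverts to the
// 8-way path; the MOVBE-without-XSAVE check below appears to single out
// Atom-class cores, where the single-block path wins for short inputs.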
movq %rsp,%r9
movq %r10,%r11
shrq $32,%r10
testq $32,%r10
jnz L$ChaCha20_8x
cmpq $192,%rdx
ja L$proceed4x
andq $71303168,%r11
cmpq $4194304,%r11
je L$do_sse3_after_all
L$proceed4x:
subq $0x140+8,%rsp
movdqa L$sigma(%rip),%xmm11
movdqu (%rcx),%xmm15
movdqu 16(%rcx),%xmm7
movdqu (%r8),%xmm3
leaq 256(%rsp),%rcx
leaq L$rot16(%rip),%r10
leaq L$rot24(%rip),%r11
pshufd $0x00,%xmm11,%xmm8
pshufd $0x55,%xmm11,%xmm9
movdqa %xmm8,64(%rsp)
pshufd $0xaa,%xmm11,%xmm10
movdqa %xmm9,80(%rsp)
pshufd $0xff,%xmm11,%xmm11
movdqa %xmm10,96(%rsp)
movdqa %xmm11,112(%rsp)
pshufd $0x00,%xmm15,%xmm12
pshufd $0x55,%xmm15,%xmm13
movdqa %xmm12,128-256(%rcx)
pshufd $0xaa,%xmm15,%xmm14
movdqa %xmm13,144-256(%rcx)
pshufd $0xff,%xmm15,%xmm15
movdqa %xmm14,160-256(%rcx)
movdqa %xmm15,176-256(%rcx)
pshufd $0x00,%xmm7,%xmm4
pshufd $0x55,%xmm7,%xmm5
movdqa %xmm4,192-256(%rcx)
pshufd $0xaa,%xmm7,%xmm6
movdqa %xmm5,208-256(%rcx)
pshufd $0xff,%xmm7,%xmm7
movdqa %xmm6,224-256(%rcx)
movdqa %xmm7,240-256(%rcx)
pshufd $0x00,%xmm3,%xmm0
pshufd $0x55,%xmm3,%xmm1
paddd L$inc(%rip),%xmm0
pshufd $0xaa,%xmm3,%xmm2
movdqa %xmm1,272-256(%rcx)
pshufd $0xff,%xmm3,%xmm3
movdqa %xmm2,288-256(%rcx)
movdqa %xmm3,304-256(%rcx)
jmp L$oop_enter4x
.p2align 5
L$oop_outer4x:
movdqa 64(%rsp),%xmm8
movdqa 80(%rsp),%xmm9
movdqa 96(%rsp),%xmm10
movdqa 112(%rsp),%xmm11
movdqa 128-256(%rcx),%xmm12
movdqa 144-256(%rcx),%xmm13
movdqa 160-256(%rcx),%xmm14
movdqa 176-256(%rcx),%xmm15
movdqa 192-256(%rcx),%xmm4
movdqa 208-256(%rcx),%xmm5
movdqa 224-256(%rcx),%xmm6
movdqa 240-256(%rcx),%xmm7
movdqa 256-256(%rcx),%xmm0
movdqa 272-256(%rcx),%xmm1
movdqa 288-256(%rcx),%xmm2
movdqa 304-256(%rcx),%xmm3
paddd L$four(%rip),%xmm0
L$oop_enter4x:
movdqa %xmm6,32(%rsp)
movdqa %xmm7,48(%rsp)
movdqa (%r10),%xmm7
movl $10,%eax
movdqa %xmm0,256-256(%rcx)
jmp L$oop4x
.p2align 5
L$oop4x:
paddd %xmm12,%xmm8
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
pxor %xmm5,%xmm13
movdqa %xmm12,%xmm6
pslld $12,%xmm12
psrld $20,%xmm6
movdqa %xmm13,%xmm7
pslld $12,%xmm13
por %xmm6,%xmm12
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm13
paddd %xmm12,%xmm8
paddd %xmm13,%xmm9
pxor %xmm8,%xmm0
pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
paddd %xmm0,%xmm4
paddd %xmm1,%xmm5
pxor %xmm4,%xmm12
pxor %xmm5,%xmm13
movdqa %xmm12,%xmm7
pslld $7,%xmm12
psrld $25,%xmm7
movdqa %xmm13,%xmm6
pslld $7,%xmm13
por %xmm7,%xmm12
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm13
movdqa %xmm4,0(%rsp)
movdqa %xmm5,16(%rsp)
movdqa 32(%rsp),%xmm4
movdqa 48(%rsp),%xmm5
paddd %xmm14,%xmm10
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
pxor %xmm5,%xmm15
movdqa %xmm14,%xmm6
pslld $12,%xmm14
psrld $20,%xmm6
movdqa %xmm15,%xmm7
pslld $12,%xmm15
por %xmm6,%xmm14
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm15
paddd %xmm14,%xmm10
paddd %xmm15,%xmm11
pxor %xmm10,%xmm2
pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
paddd %xmm2,%xmm4
paddd %xmm3,%xmm5
pxor %xmm4,%xmm14
pxor %xmm5,%xmm15
movdqa %xmm14,%xmm7
pslld $7,%xmm14
psrld $25,%xmm7
movdqa %xmm15,%xmm6
pslld $7,%xmm15
por %xmm7,%xmm14
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm15
paddd %xmm13,%xmm8
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
pxor %xmm5,%xmm14
movdqa %xmm13,%xmm6
pslld $12,%xmm13
psrld $20,%xmm6
movdqa %xmm14,%xmm7
pslld $12,%xmm14
por %xmm6,%xmm13
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm14
paddd %xmm13,%xmm8
paddd %xmm14,%xmm9
pxor %xmm8,%xmm3
pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
paddd %xmm3,%xmm4
paddd %xmm0,%xmm5
pxor %xmm4,%xmm13
pxor %xmm5,%xmm14
movdqa %xmm13,%xmm7
pslld $7,%xmm13
psrld $25,%xmm7
movdqa %xmm14,%xmm6
pslld $7,%xmm14
por %xmm7,%xmm13
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm14
movdqa %xmm4,32(%rsp)
movdqa %xmm5,48(%rsp)
movdqa 0(%rsp),%xmm4
movdqa 16(%rsp),%xmm5
paddd %xmm15,%xmm10
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
pxor %xmm5,%xmm12
movdqa %xmm15,%xmm6
pslld $12,%xmm15
psrld $20,%xmm6
movdqa %xmm12,%xmm7
pslld $12,%xmm12
por %xmm6,%xmm15
psrld $20,%xmm7
movdqa (%r11),%xmm6
por %xmm7,%xmm12
paddd %xmm15,%xmm10
paddd %xmm12,%xmm11
pxor %xmm10,%xmm1
pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
paddd %xmm1,%xmm4
paddd %xmm2,%xmm5
pxor %xmm4,%xmm15
pxor %xmm5,%xmm12
movdqa %xmm15,%xmm7
pslld $7,%xmm15
psrld $25,%xmm7
movdqa %xmm12,%xmm6
pslld $7,%xmm12
por %xmm7,%xmm15
psrld $25,%xmm6
movdqa (%r10),%xmm7
por %xmm6,%xmm12
decl %eax
jnz L$oop4x
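// Rounds done: add back the saved input state, then transpose the
// word-sliced registers into per-block order with punpck{l,h}dq and
// punpck{l,h}qdq before XORing against the input stream.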
paddd 64(%rsp),%xmm8
paddd 80(%rsp),%xmm9
paddd 96(%rsp),%xmm10
paddd 112(%rsp),%xmm11
movdqa %xmm8,%xmm6
punpckldq %xmm9,%xmm8
movdqa %xmm10,%xmm7
punpckldq %xmm11,%xmm10
punpckhdq %xmm9,%xmm6
punpckhdq %xmm11,%xmm7
movdqa %xmm8,%xmm9
punpcklqdq %xmm10,%xmm8
movdqa %xmm6,%xmm11
punpcklqdq %xmm7,%xmm6
punpckhqdq %xmm10,%xmm9
punpckhqdq %xmm7,%xmm11
paddd 128-256(%rcx),%xmm12
paddd 144-256(%rcx),%xmm13
paddd 160-256(%rcx),%xmm14
paddd 176-256(%rcx),%xmm15
movdqa %xmm8,0(%rsp)
movdqa %xmm9,16(%rsp)
movdqa 32(%rsp),%xmm8
movdqa 48(%rsp),%xmm9
movdqa %xmm12,%xmm10
punpckldq %xmm13,%xmm12
movdqa %xmm14,%xmm7
punpckldq %xmm15,%xmm14
punpckhdq %xmm13,%xmm10
punpckhdq %xmm15,%xmm7
movdqa %xmm12,%xmm13
punpcklqdq %xmm14,%xmm12
movdqa %xmm10,%xmm15
punpcklqdq %xmm7,%xmm10
punpckhqdq %xmm14,%xmm13
punpckhqdq %xmm7,%xmm15
paddd 192-256(%rcx),%xmm4
paddd 208-256(%rcx),%xmm5
paddd 224-256(%rcx),%xmm8
paddd 240-256(%rcx),%xmm9
movdqa %xmm6,32(%rsp)
movdqa %xmm11,48(%rsp)
movdqa %xmm4,%xmm14
punpckldq %xmm5,%xmm4
movdqa %xmm8,%xmm7
punpckldq %xmm9,%xmm8
punpckhdq %xmm5,%xmm14
punpckhdq %xmm9,%xmm7
movdqa %xmm4,%xmm5
punpcklqdq %xmm8,%xmm4
movdqa %xmm14,%xmm9
punpcklqdq %xmm7,%xmm14
punpckhqdq %xmm8,%xmm5
punpckhqdq %xmm7,%xmm9
paddd 256-256(%rcx),%xmm0
paddd 272-256(%rcx),%xmm1
paddd 288-256(%rcx),%xmm2
paddd 304-256(%rcx),%xmm3
movdqa %xmm0,%xmm8
punpckldq %xmm1,%xmm0
movdqa %xmm2,%xmm7
punpckldq %xmm3,%xmm2
punpckhdq %xmm1,%xmm8
punpckhdq %xmm3,%xmm7
movdqa %xmm0,%xmm1
punpcklqdq %xmm2,%xmm0
movdqa %xmm8,%xmm3
punpcklqdq %xmm7,%xmm8
punpckhqdq %xmm2,%xmm1
punpckhqdq %xmm7,%xmm3
cmpq $256,%rdx
jb L$tail4x
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7
movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7
movdqu %xmm6,64(%rdi)
movdqu 0(%rsi),%xmm6
movdqu %xmm11,80(%rdi)
movdqu 16(%rsi),%xmm11
movdqu %xmm2,96(%rdi)
movdqu 32(%rsi),%xmm2
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
movdqu 48(%rsi),%xmm7
pxor 32(%rsp),%xmm6
pxor %xmm10,%xmm11
pxor %xmm14,%xmm2
pxor %xmm8,%xmm7
movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 48(%rsp),%xmm6
pxor %xmm15,%xmm11
pxor %xmm9,%xmm2
pxor %xmm3,%xmm7
movdqu %xmm6,64(%rdi)
movdqu %xmm11,80(%rdi)
movdqu %xmm2,96(%rdi)
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
subq $256,%rdx
jnz L$oop_outer4x
jmp L$done4x
L$tail4x:
cmpq $192,%rdx
jae L$192_or_more4x
cmpq $128,%rdx
jae L$128_or_more4x
cmpq $64,%rdx
jae L$64_or_more4x
xorq %r10,%r10
movdqa %xmm12,16(%rsp)
movdqa %xmm4,32(%rsp)
movdqa %xmm0,48(%rsp)
jmp L$oop_tail4x
.p2align 5
L$64_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7
movdqu %xmm6,0(%rdi)
movdqu %xmm11,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm7,48(%rdi)
je L$done4x
movdqa 16(%rsp),%xmm6
leaq 64(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm13,16(%rsp)
leaq 64(%rdi),%rdi
movdqa %xmm5,32(%rsp)
subq $64,%rdx
movdqa %xmm1,48(%rsp)
jmp L$oop_tail4x
.p2align 5
L$128_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7
movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7
movdqu %xmm6,64(%rdi)
movdqu %xmm11,80(%rdi)
movdqu %xmm2,96(%rdi)
movdqu %xmm7,112(%rdi)
je L$done4x
movdqa 32(%rsp),%xmm6
leaq 128(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm10,16(%rsp)
leaq 128(%rdi),%rdi
movdqa %xmm14,32(%rsp)
subq $128,%rdx
movdqa %xmm8,48(%rsp)
jmp L$oop_tail4x
.p2align 5
L$192_or_more4x:
movdqu 0(%rsi),%xmm6
movdqu 16(%rsi),%xmm11
movdqu 32(%rsi),%xmm2
movdqu 48(%rsi),%xmm7
pxor 0(%rsp),%xmm6
pxor %xmm12,%xmm11
pxor %xmm4,%xmm2
pxor %xmm0,%xmm7
movdqu %xmm6,0(%rdi)
movdqu 64(%rsi),%xmm6
movdqu %xmm11,16(%rdi)
movdqu 80(%rsi),%xmm11
movdqu %xmm2,32(%rdi)
movdqu 96(%rsi),%xmm2
movdqu %xmm7,48(%rdi)
movdqu 112(%rsi),%xmm7
leaq 128(%rsi),%rsi
pxor 16(%rsp),%xmm6
pxor %xmm13,%xmm11
pxor %xmm5,%xmm2
pxor %xmm1,%xmm7
movdqu %xmm6,64(%rdi)
movdqu 0(%rsi),%xmm6
movdqu %xmm11,80(%rdi)
movdqu 16(%rsi),%xmm11
movdqu %xmm2,96(%rdi)
movdqu 32(%rsi),%xmm2
movdqu %xmm7,112(%rdi)
leaq 128(%rdi),%rdi
movdqu 48(%rsi),%xmm7
pxor 32(%rsp),%xmm6
pxor %xmm10,%xmm11
pxor %xmm14,%xmm2
pxor %xmm8,%xmm7
movdqu %xmm6,0(%rdi)
movdqu %xmm11,16(%rdi)
movdqu %xmm2,32(%rdi)
movdqu %xmm7,48(%rdi)
je L$done4x
movdqa 48(%rsp),%xmm6
leaq 64(%rsi),%rsi
xorq %r10,%r10
movdqa %xmm6,0(%rsp)
movdqa %xmm15,16(%rsp)
leaq 64(%rdi),%rdi
movdqa %xmm9,32(%rsp)
subq $192,%rdx
movdqa %xmm3,48(%rsp)
L$oop_tail4x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)
decq %rdx
jnz L$oop_tail4x
L$done4x:
leaq (%r9),%rsp
L$4x_epilogue:
.byte 0xf3,0xc3
.p2align 5
ChaCha20_8x:
L$ChaCha20_8x:
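// AVX2 path, eight blocks at a time, word-sliced across ymm registers
// on a 32-byte-aligned stack. L$incy gives each lane its counter offset
// (0,2,4,6 in the low 128-bit halves, 1,3,5,7 in the high); the
// vperm2i128 transposes after the rounds undo this interleave.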
movq %rsp,%r9
subq $0x280+8,%rsp
andq $-32,%rsp
vzeroupper
vbroadcasti128 L$sigma(%rip),%ymm11
vbroadcasti128 (%rcx),%ymm3
vbroadcasti128 16(%rcx),%ymm15
vbroadcasti128 (%r8),%ymm7
leaq 256(%rsp),%rcx
leaq 512(%rsp),%rax
leaq L$rot16(%rip),%r10
leaq L$rot24(%rip),%r11
vpshufd $0x00,%ymm11,%ymm8
vpshufd $0x55,%ymm11,%ymm9
vmovdqa %ymm8,128-256(%rcx)
vpshufd $0xaa,%ymm11,%ymm10
vmovdqa %ymm9,160-256(%rcx)
vpshufd $0xff,%ymm11,%ymm11
vmovdqa %ymm10,192-256(%rcx)
vmovdqa %ymm11,224-256(%rcx)
vpshufd $0x00,%ymm3,%ymm0
vpshufd $0x55,%ymm3,%ymm1
vmovdqa %ymm0,256-256(%rcx)
vpshufd $0xaa,%ymm3,%ymm2
vmovdqa %ymm1,288-256(%rcx)
vpshufd $0xff,%ymm3,%ymm3
vmovdqa %ymm2,320-256(%rcx)
vmovdqa %ymm3,352-256(%rcx)
vpshufd $0x00,%ymm15,%ymm12
vpshufd $0x55,%ymm15,%ymm13
vmovdqa %ymm12,384-512(%rax)
vpshufd $0xaa,%ymm15,%ymm14
vmovdqa %ymm13,416-512(%rax)
vpshufd $0xff,%ymm15,%ymm15
vmovdqa %ymm14,448-512(%rax)
vmovdqa %ymm15,480-512(%rax)
vpshufd $0x00,%ymm7,%ymm4
vpshufd $0x55,%ymm7,%ymm5
vpaddd L$incy(%rip),%ymm4,%ymm4
vpshufd $0xaa,%ymm7,%ymm6
vmovdqa %ymm5,544-512(%rax)
vpshufd $0xff,%ymm7,%ymm7
vmovdqa %ymm6,576-512(%rax)
vmovdqa %ymm7,608-512(%rax)
jmp L$oop_enter8x
.p2align 5
L$oop_outer8x:
vmovdqa 128-256(%rcx),%ymm8
vmovdqa 160-256(%rcx),%ymm9
vmovdqa 192-256(%rcx),%ymm10
vmovdqa 224-256(%rcx),%ymm11
vmovdqa 256-256(%rcx),%ymm0
vmovdqa 288-256(%rcx),%ymm1
vmovdqa 320-256(%rcx),%ymm2
vmovdqa 352-256(%rcx),%ymm3
vmovdqa 384-512(%rax),%ymm12
vmovdqa 416-512(%rax),%ymm13
vmovdqa 448-512(%rax),%ymm14
vmovdqa 480-512(%rax),%ymm15
vmovdqa 512-512(%rax),%ymm4
vmovdqa 544-512(%rax),%ymm5
vmovdqa 576-512(%rax),%ymm6
vmovdqa 608-512(%rax),%ymm7
vpaddd L$eight(%rip),%ymm4,%ymm4
L$oop_enter8x:
vmovdqa %ymm14,64(%rsp)
vmovdqa %ymm15,96(%rsp)
vbroadcasti128 (%r10),%ymm15
vmovdqa %ymm4,512-512(%rax)
movl $10,%eax
jmp L$oop8x
.p2align 5
L$oop8x:
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $12,%ymm0,%ymm14
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $12,%ymm1,%ymm15
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vpaddd %ymm0,%ymm8,%ymm8
vpxor %ymm4,%ymm8,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm1,%ymm9,%ymm9
vpxor %ymm5,%ymm9,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm4,%ymm12,%ymm12
vpxor %ymm0,%ymm12,%ymm0
vpslld $7,%ymm0,%ymm15
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm5,%ymm13,%ymm13
vpxor %ymm1,%ymm13,%ymm1
vpslld $7,%ymm1,%ymm14
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vmovdqa %ymm12,0(%rsp)
vmovdqa %ymm13,32(%rsp)
vmovdqa 64(%rsp),%ymm12
vmovdqa 96(%rsp),%ymm13
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $12,%ymm2,%ymm14
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $12,%ymm3,%ymm15
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vpaddd %ymm2,%ymm10,%ymm10
vpxor %ymm6,%ymm10,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm3,%ymm11,%ymm11
vpxor %ymm7,%ymm11,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm6,%ymm12,%ymm12
vpxor %ymm2,%ymm12,%ymm2
vpslld $7,%ymm2,%ymm15
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm7,%ymm13,%ymm13
vpxor %ymm3,%ymm13,%ymm3
vpslld $7,%ymm3,%ymm14
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm15,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm15,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $12,%ymm1,%ymm14
vpsrld $20,%ymm1,%ymm1
vpor %ymm1,%ymm14,%ymm1
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $12,%ymm2,%ymm15
vpsrld $20,%ymm2,%ymm2
vpor %ymm2,%ymm15,%ymm2
vpaddd %ymm1,%ymm8,%ymm8
vpxor %ymm7,%ymm8,%ymm7
vpshufb %ymm14,%ymm7,%ymm7
vpaddd %ymm2,%ymm9,%ymm9
vpxor %ymm4,%ymm9,%ymm4
vpshufb %ymm14,%ymm4,%ymm4
vpaddd %ymm7,%ymm12,%ymm12
vpxor %ymm1,%ymm12,%ymm1
vpslld $7,%ymm1,%ymm15
vpsrld $25,%ymm1,%ymm1
vpor %ymm1,%ymm15,%ymm1
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm4,%ymm13,%ymm13
vpxor %ymm2,%ymm13,%ymm2
vpslld $7,%ymm2,%ymm14
vpsrld $25,%ymm2,%ymm2
vpor %ymm2,%ymm14,%ymm2
vmovdqa %ymm12,64(%rsp)
vmovdqa %ymm13,96(%rsp)
vmovdqa 0(%rsp),%ymm12
vmovdqa 32(%rsp),%ymm13
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm15,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm15,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $12,%ymm3,%ymm14
vpsrld $20,%ymm3,%ymm3
vpor %ymm3,%ymm14,%ymm3
vbroadcasti128 (%r11),%ymm14
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $12,%ymm0,%ymm15
vpsrld $20,%ymm0,%ymm0
vpor %ymm0,%ymm15,%ymm0
vpaddd %ymm3,%ymm10,%ymm10
vpxor %ymm5,%ymm10,%ymm5
vpshufb %ymm14,%ymm5,%ymm5
vpaddd %ymm0,%ymm11,%ymm11
vpxor %ymm6,%ymm11,%ymm6
vpshufb %ymm14,%ymm6,%ymm6
vpaddd %ymm5,%ymm12,%ymm12
vpxor %ymm3,%ymm12,%ymm3
vpslld $7,%ymm3,%ymm15
vpsrld $25,%ymm3,%ymm3
vpor %ymm3,%ymm15,%ymm3
vbroadcasti128 (%r10),%ymm15
vpaddd %ymm6,%ymm13,%ymm13
vpxor %ymm0,%ymm13,%ymm0
vpslld $7,%ymm0,%ymm14
vpsrld $25,%ymm0,%ymm0
vpor %ymm0,%ymm14,%ymm0
decl %eax
jnz L$oop8x
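// Add back the input state and transpose: vpunpck{l,h}{dq,qdq} reorder
// words within 128-bit halves, then vperm2i128 recombines halves so
// each ymm register holds 32 contiguous keystream bytes.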
leaq 512(%rsp),%rax
vpaddd 128-256(%rcx),%ymm8,%ymm8
vpaddd 160-256(%rcx),%ymm9,%ymm9
vpaddd 192-256(%rcx),%ymm10,%ymm10
vpaddd 224-256(%rcx),%ymm11,%ymm11
vpunpckldq %ymm9,%ymm8,%ymm14
vpunpckldq %ymm11,%ymm10,%ymm15
vpunpckhdq %ymm9,%ymm8,%ymm8
vpunpckhdq %ymm11,%ymm10,%ymm10
vpunpcklqdq %ymm15,%ymm14,%ymm9
vpunpckhqdq %ymm15,%ymm14,%ymm14
vpunpcklqdq %ymm10,%ymm8,%ymm11
vpunpckhqdq %ymm10,%ymm8,%ymm8
vpaddd 256-256(%rcx),%ymm0,%ymm0
vpaddd 288-256(%rcx),%ymm1,%ymm1
vpaddd 320-256(%rcx),%ymm2,%ymm2
vpaddd 352-256(%rcx),%ymm3,%ymm3
vpunpckldq %ymm1,%ymm0,%ymm10
vpunpckldq %ymm3,%ymm2,%ymm15
vpunpckhdq %ymm1,%ymm0,%ymm0
vpunpckhdq %ymm3,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm10,%ymm1
vpunpckhqdq %ymm15,%ymm10,%ymm10
vpunpcklqdq %ymm2,%ymm0,%ymm3
vpunpckhqdq %ymm2,%ymm0,%ymm0
vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
vmovdqa %ymm15,0(%rsp)
vmovdqa %ymm9,32(%rsp)
vmovdqa 64(%rsp),%ymm15
vmovdqa 96(%rsp),%ymm9
vpaddd 384-512(%rax),%ymm12,%ymm12
vpaddd 416-512(%rax),%ymm13,%ymm13
vpaddd 448-512(%rax),%ymm15,%ymm15
vpaddd 480-512(%rax),%ymm9,%ymm9
vpunpckldq %ymm13,%ymm12,%ymm2
vpunpckldq %ymm9,%ymm15,%ymm8
vpunpckhdq %ymm13,%ymm12,%ymm12
vpunpckhdq %ymm9,%ymm15,%ymm15
vpunpcklqdq %ymm8,%ymm2,%ymm13
vpunpckhqdq %ymm8,%ymm2,%ymm2
vpunpcklqdq %ymm15,%ymm12,%ymm9
vpunpckhqdq %ymm15,%ymm12,%ymm12
vpaddd 512-512(%rax),%ymm4,%ymm4
vpaddd 544-512(%rax),%ymm5,%ymm5
vpaddd 576-512(%rax),%ymm6,%ymm6
vpaddd 608-512(%rax),%ymm7,%ymm7
vpunpckldq %ymm5,%ymm4,%ymm15
vpunpckldq %ymm7,%ymm6,%ymm8
vpunpckhdq %ymm5,%ymm4,%ymm4
vpunpckhdq %ymm7,%ymm6,%ymm6
vpunpcklqdq %ymm8,%ymm15,%ymm5
vpunpckhqdq %ymm8,%ymm15,%ymm15
vpunpcklqdq %ymm6,%ymm4,%ymm7
vpunpckhqdq %ymm6,%ymm4,%ymm4
vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
vmovdqa 0(%rsp),%ymm6
vmovdqa 32(%rsp),%ymm12
cmpq $512,%rdx
jb L$tail8x
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
leaq 128(%rsi),%rsi
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
leaq 128(%rdi),%rdi
vpxor 0(%rsi),%ymm12,%ymm12
vpxor 32(%rsi),%ymm13,%ymm13
vpxor 64(%rsi),%ymm10,%ymm10
vpxor 96(%rsi),%ymm15,%ymm15
leaq 128(%rsi),%rsi
vmovdqu %ymm12,0(%rdi)
vmovdqu %ymm13,32(%rdi)
vmovdqu %ymm10,64(%rdi)
vmovdqu %ymm15,96(%rdi)
leaq 128(%rdi),%rdi
vpxor 0(%rsi),%ymm14,%ymm14
vpxor 32(%rsi),%ymm2,%ymm2
vpxor 64(%rsi),%ymm3,%ymm3
vpxor 96(%rsi),%ymm7,%ymm7
leaq 128(%rsi),%rsi
vmovdqu %ymm14,0(%rdi)
vmovdqu %ymm2,32(%rdi)
vmovdqu %ymm3,64(%rdi)
vmovdqu %ymm7,96(%rdi)
leaq 128(%rdi),%rdi
vpxor 0(%rsi),%ymm11,%ymm11
vpxor 32(%rsi),%ymm9,%ymm9
vpxor 64(%rsi),%ymm0,%ymm0
vpxor 96(%rsi),%ymm4,%ymm4
leaq 128(%rsi),%rsi
vmovdqu %ymm11,0(%rdi)
vmovdqu %ymm9,32(%rdi)
vmovdqu %ymm0,64(%rdi)
vmovdqu %ymm4,96(%rdi)
leaq 128(%rdi),%rdi
subq $512,%rdx
jnz L$oop_outer8x
jmp L$done8x
L$tail8x:
cmpq $448,%rdx
jae L$448_or_more8x
cmpq $384,%rdx
jae L$384_or_more8x
cmpq $320,%rdx
jae L$320_or_more8x
cmpq $256,%rdx
jae L$256_or_more8x
cmpq $192,%rdx
jae L$192_or_more8x
cmpq $128,%rdx
jae L$128_or_more8x
cmpq $64,%rdx
jae L$64_or_more8x
xorq %r10,%r10
vmovdqa %ymm6,0(%rsp)
vmovdqa %ymm8,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$64_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
je L$done8x
leaq 64(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm1,0(%rsp)
leaq 64(%rdi),%rdi
subq $64,%rdx
vmovdqa %ymm5,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$128_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
je L$done8x
leaq 128(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm12,0(%rsp)
leaq 128(%rdi),%rdi
subq $128,%rdx
vmovdqa %ymm13,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$192_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
je L$done8x
leaq 192(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm10,0(%rsp)
leaq 192(%rdi),%rdi
subq $192,%rdx
vmovdqa %ymm15,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$256_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
je L$done8x
leaq 256(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm14,0(%rsp)
leaq 256(%rdi),%rdi
subq $256,%rdx
vmovdqa %ymm2,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$320_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
je L$done8x
leaq 320(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm3,0(%rsp)
leaq 320(%rdi),%rdi
subq $320,%rdx
vmovdqa %ymm7,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$384_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
je L$done8x
leaq 384(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm11,0(%rsp)
leaq 384(%rdi),%rdi
subq $384,%rdx
vmovdqa %ymm9,32(%rsp)
jmp L$oop_tail8x
.p2align 5
L$448_or_more8x:
vpxor 0(%rsi),%ymm6,%ymm6
vpxor 32(%rsi),%ymm8,%ymm8
vpxor 64(%rsi),%ymm1,%ymm1
vpxor 96(%rsi),%ymm5,%ymm5
vpxor 128(%rsi),%ymm12,%ymm12
vpxor 160(%rsi),%ymm13,%ymm13
vpxor 192(%rsi),%ymm10,%ymm10
vpxor 224(%rsi),%ymm15,%ymm15
vpxor 256(%rsi),%ymm14,%ymm14
vpxor 288(%rsi),%ymm2,%ymm2
vpxor 320(%rsi),%ymm3,%ymm3
vpxor 352(%rsi),%ymm7,%ymm7
vpxor 384(%rsi),%ymm11,%ymm11
vpxor 416(%rsi),%ymm9,%ymm9
vmovdqu %ymm6,0(%rdi)
vmovdqu %ymm8,32(%rdi)
vmovdqu %ymm1,64(%rdi)
vmovdqu %ymm5,96(%rdi)
vmovdqu %ymm12,128(%rdi)
vmovdqu %ymm13,160(%rdi)
vmovdqu %ymm10,192(%rdi)
vmovdqu %ymm15,224(%rdi)
vmovdqu %ymm14,256(%rdi)
vmovdqu %ymm2,288(%rdi)
vmovdqu %ymm3,320(%rdi)
vmovdqu %ymm7,352(%rdi)
vmovdqu %ymm11,384(%rdi)
vmovdqu %ymm9,416(%rdi)
je L$done8x
leaq 448(%rsi),%rsi
xorq %r10,%r10
vmovdqa %ymm0,0(%rsp)
leaq 448(%rdi),%rdi
subq $448,%rdx
vmovdqa %ymm4,32(%rsp)
L$oop_tail8x:
movzbl (%rsi,%r10,1),%eax
movzbl (%rsp,%r10,1),%ecx
leaq 1(%r10),%r10
xorl %ecx,%eax
movb %al,-1(%rdi,%r10,1)
decq %rdx
jnz L$oop_tail8x
L$done8x:
vzeroall
leaq (%r9),%rsp
L$8x_epilogue:
.byte 0xf3,0xc3
#endif