chacha-x86_64.asm 39 KB


  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. %ifdef BORINGSSL_PREFIX
  8. %include "boringssl_prefix_symbols_nasm.inc"
  9. %endif
  10. section .text code align=64
  11. EXTERN OPENSSL_ia32cap_P
  ; Constant tables used by the ChaCha20 code paths in this file. The first
  ; group starts on a 64-byte boundary and every table in it begins at a
  ; multiple of 16 bytes from that boundary, which is what the aligned
  ; (movdqa) loads of $L$one, $L$rot16, $L$rot24 and $L$sigma rely on.
  12. ALIGN 64
  ; Four zero dwords. Not referenced in the part of the file reviewed here.
  13. $L$zero:
  14. DD 0,0,0,0
  ; Added with paddd to the 16-byte counter block by the scalar and SSSE3
  ; paths: increments only the lowest dword (state word 12) by one per block.
  15. $L$one:
  16. DD 1,0,0,0
  ; Per-lane block offsets added to the broadcast counter word when the
  ; 4-block path sets up its first batch.
  17. $L$inc:
  18. DD 0,1,2,3
  ; Added to the per-lane counters by the 4-block path before each
  ; subsequent batch of four blocks.
  19. $L$four:
  20. DD 4,4,4,4
  ; Per-lane block offsets added to the broadcast counter word by the
  ; 8-block (ymm) path. NOTE(review): the even/odd lane ordering presumably
  ; matches that path's output transpose -- confirm there.
  21. $L$incy:
  22. DD 0,2,4,6,1,3,5,7
  ; Added to the per-lane counters by the 8-block path before each
  ; subsequent batch.
  23. $L$eight:
  24. DD 8,8,8,8,8,8,8,8
  ; pshufb mask: within every dword, result bytes 0,1,2,3 come from source
  ; bytes 2,3,0,1, i.e. each 32-bit lane is rotated left by 16 bits.
  25. $L$rot16:
  26. DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd
  ; pshufb mask: within every dword, result bytes 0,1,2,3 come from source
  ; bytes 3,0,1,2, i.e. each 32-bit lane is rotated left by 8 bits
  ; (equivalently right by 24, hence the label name).
  27. $L$rot24:
  28. DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe
  ; ASCII "expand 32-byte k" followed by a NUL: the four constant words
  ; that form state words 0..3 (the scalar path uses the same values as
  ; immediates 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574).
  29. $L$sigma:
  30. DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
  31. DB 0
  ; Second group of tables, again 64-byte aligned, each 16 dwords wide.
  ; None of the four labels below is referenced in the part of the file
  ; reviewed here. NOTE(review): presumably intended for a 512-bit-wide
  ; code path -- confirm against the rest of the file / the generator.
  32. ALIGN 64
  33. $L$zeroz:
  34. DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
  35. $L$fourz:
  36. DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
  37. $L$incz:
  38. DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
  39. $L$sixteen:
  40. DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
  ; NUL-terminated ASCII identification string:
  ; "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>".
  ; It is data only; no code in the reviewed portion reads it.
  41. DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
  42. DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
  43. DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
  44. DB 108,46,111,114,103,62,0
  ;-----------------------------------------------------------------------
  ; ChaCha20_ctr32 -- exported entry point and scalar (integer-only) path.
  ; ABI: Microsoft x64. The prologue moves the Win64 argument registers
  ; into the System V positions that the body is written for:
  ;   rcx      -> rdi  output pointer (result bytes are stored here)
  ;   rdx      -> rsi  input pointer (bytes XORed with the keystream)
  ;   r8       -> rdx  length in bytes; 0 returns immediately
  ;   r9       -> rcx  pointer to 32 bytes loaded as state words 4..11
  ;   [rsp+40] -> r8   pointer to 16 bytes loaded as state words 12..15;
  ;                    word 12 is the counter incremented once per block
  ; NOTE(review): presumably the C prototype is
  ;   ChaCha20_ctr32(out, in, in_len, key, counter) -- confirm in the header.
  ;
  ; Dispatch: if bit 9 of the dword at OPENSSL_ia32cap_P+4 is set, control
  ; jumps to $L$ChaCha20_ssse3 with the remapped arguments and with r10
  ; holding the 8 capability bytes read from OPENSSL_ia32cap_P+4.
  ;
  ; Stack frame of the scalar path (after six pushes and sub rsp,64+24;
  ; rsp is then 16-byte aligned given the ABI's entry alignment, which the
  ; movdqa stores below require):
  ;   [rsp+ 0..15]  unused until $L$tail (words 0..3 are immediates)
  ;   [rsp+16..31]  input state words 4..7
  ;   [rsp+32..47]  input state words 8..11 at the start of each block;
  ;                 during the rounds, spill slots for the word pair that
  ;                 is not currently held in esi/edi
  ;   [rsp+48..63]  input state words 12..15
  ;   [rsp+64..87]  saved remaining length, input pointer, output pointer
  ;
  ; Register roles inside the round loop:
  ;   eax,ebx,ecx,edx = words 0..3      r8d..r11d  = words 4..7
  ;   esi,edi         = words 8,9 or 10,11 (swapped through the stack)
  ;   r12d..r15d      = words 12..15    ebp        = double rounds left
  ;   xmm2 = input words 8..11, xmm3 = counter block, xmm4 = $L$one
  ; rax is not set to any result value before returning.
  ;-----------------------------------------------------------------------
  45. global ChaCha20_ctr32
  46. ALIGN 64
  47. ChaCha20_ctr32:
  ; rdi/rsi are callee-saved on Win64; park them in the two slots just
  ; above the return address so the epilogue can restore them.
  48. mov QWORD[8+rsp],rdi ;WIN64 prologue
  49. mov QWORD[16+rsp],rsi
  ; rax = entry rsp. It is not read again in this function.
  ; NOTE(review): presumably consumed by the SEH unwind handler, as are the
  ; $L$SEH_* / $L$*_body labels -- confirm against the unwind tables.
  50. mov rax,rsp
  51. $L$SEH_begin_ChaCha20_ctr32:
  52. mov rdi,rcx
  53. mov rsi,rdx
  54. mov rdx,r8
  55. mov rcx,r9
  ; Fifth argument comes from the stack, above the 32-byte home area.
  56. mov r8,QWORD[40+rsp]
  ; Nothing to do for a zero-length request.
  57. cmp rdx,0
  58. je NEAR $L$no_data
  ; r10 = capability dwords 1 and 2. Bit 9 of dword 1 selects the SSSE3
  ; path; the upper half of r10 is examined later by the 4-block path.
  59. mov r10,QWORD[((OPENSSL_ia32cap_P+4))]
  60. test r10d,512
  61. jnz NEAR $L$ChaCha20_ssse3
  ; Scalar path: save every callee-saved GPR that is used as a state word.
  62. push rbx
  63. push rbp
  64. push r12
  65. push r13
  66. push r14
  67. push r15
  ; 64 bytes of state/keystream plus 24 bytes for len/in/out.
  68. sub rsp,64+24
  69. $L$ctr32_body:
  ; Load the 48 bytes of caller-supplied state (unaligned loads: the
  ; caller's pointers carry no alignment guarantee here).
  70. movdqu xmm1,XMMWORD[rcx]
  71. movdqu xmm2,XMMWORD[16+rcx]
  72. movdqu xmm3,XMMWORD[r8]
  73. movdqa xmm4,XMMWORD[$L$one]
  74. movdqa XMMWORD[16+rsp],xmm1
  75. movdqa XMMWORD[32+rsp],xmm2
  76. movdqa XMMWORD[48+rsp],xmm3
  ; rbp = bytes remaining (rdx is about to be reused as state word 3).
  77. mov rbp,rdx
  78. jmp NEAR $L$oop_outer
  79. ALIGN 32
  ; ---- One 64-byte block per iteration --------------------------------
  80. $L$oop_outer:
  ; Words 0..3: "expa" "nd 3" "2-by" "te k" as little-endian immediates.
  81. mov eax,0x61707865
  82. mov ebx,0x3320646e
  83. mov ecx,0x79622d32
  84. mov edx,0x6b206574
  85. mov r8d,DWORD[16+rsp]
  86. mov r9d,DWORD[20+rsp]
  87. mov r10d,DWORD[24+rsp]
  88. mov r11d,DWORD[28+rsp]
  ; Word 12 (the running counter) is taken from xmm3, which is advanced
  ; with paddd after every block; words 13..15 come from the stack copy.
  89. movd r12d,xmm3
  90. mov r13d,DWORD[52+rsp]
  91. mov r14d,DWORD[56+rsp]
  92. mov r15d,DWORD[60+rsp]
  ; Park len/in/out: rbp, rsi and rdi are needed as working registers.
  93. mov QWORD[((64+0))+rsp],rbp
  ; 10 iterations of (column round + diagonal round) = 20 rounds.
  94. mov ebp,10
  95. mov QWORD[((64+8))+rsp],rsi
  96. DB 102,72,15,126,214 ; movq rsi,xmm2: esi = word 8, upper half = word 9
  97. mov QWORD[((64+16))+rsp],rdi
  ; edi = word 9. Words 10,11 stay in [40+rsp],[44+rsp] for now.
  98. mov rdi,rsi
  99. shr rdi,32
  100. jmp NEAR $L$oop
  101. ALIGN 32
  ; ---- Double round. Quarter-rounds are processed two at a time with
  ; their steps interleaved; rotations are 16, 12, 8, 7 bits left. --------
  102. $L$oop:
  ; Column round, QR(0,4,8,12) and QR(1,5,9,13).
  103. add eax,r8d
  104. xor r12d,eax
  105. rol r12d,16
  106. add ebx,r9d
  107. xor r13d,ebx
  108. rol r13d,16
  109. add esi,r12d
  110. xor r8d,esi
  111. rol r8d,12
  112. add edi,r13d
  113. xor r9d,edi
  114. rol r9d,12
  115. add eax,r8d
  116. xor r12d,eax
  117. rol r12d,8
  118. add ebx,r9d
  119. xor r13d,ebx
  120. rol r13d,8
  121. add esi,r12d
  122. xor r8d,esi
  123. rol r8d,7
  124. add edi,r13d
  125. xor r9d,edi
  126. rol r9d,7
  ; Swap the c-row pair: spill words 8,9 and bring in words 10,11.
  127. mov DWORD[32+rsp],esi
  128. mov DWORD[36+rsp],edi
  129. mov esi,DWORD[40+rsp]
  130. mov edi,DWORD[44+rsp]
  ; Column round, QR(2,6,10,14) and QR(3,7,11,15).
  131. add ecx,r10d
  132. xor r14d,ecx
  133. rol r14d,16
  134. add edx,r11d
  135. xor r15d,edx
  136. rol r15d,16
  137. add esi,r14d
  138. xor r10d,esi
  139. rol r10d,12
  140. add edi,r15d
  141. xor r11d,edi
  142. rol r11d,12
  143. add ecx,r10d
  144. xor r14d,ecx
  145. rol r14d,8
  146. add edx,r11d
  147. xor r15d,edx
  148. rol r15d,8
  149. add esi,r14d
  150. xor r10d,esi
  151. rol r10d,7
  152. add edi,r15d
  153. xor r11d,edi
  154. rol r11d,7
  ; Diagonal round, QR(0,5,10,15) and QR(1,6,11,12); esi/edi still hold
  ; words 10,11 so no swap is needed first.
  155. add eax,r9d
  156. xor r15d,eax
  157. rol r15d,16
  158. add ebx,r10d
  159. xor r12d,ebx
  160. rol r12d,16
  161. add esi,r15d
  162. xor r9d,esi
  163. rol r9d,12
  164. add edi,r12d
  165. xor r10d,edi
  166. rol r10d,12
  167. add eax,r9d
  168. xor r15d,eax
  169. rol r15d,8
  170. add ebx,r10d
  171. xor r12d,ebx
  172. rol r12d,8
  173. add esi,r15d
  174. xor r9d,esi
  175. rol r9d,7
  176. add edi,r12d
  177. xor r10d,edi
  178. rol r10d,7
  ; Swap back: spill words 10,11 and reload words 8,9.
  179. mov DWORD[40+rsp],esi
  180. mov DWORD[44+rsp],edi
  181. mov esi,DWORD[32+rsp]
  182. mov edi,DWORD[36+rsp]
  ; Diagonal round, QR(2,7,8,13) and QR(3,4,9,14).
  183. add ecx,r11d
  184. xor r13d,ecx
  185. rol r13d,16
  186. add edx,r8d
  187. xor r14d,edx
  188. rol r14d,16
  189. add esi,r13d
  190. xor r11d,esi
  191. rol r11d,12
  192. add edi,r14d
  193. xor r8d,edi
  194. rol r8d,12
  195. add ecx,r11d
  196. xor r13d,ecx
  197. rol r13d,8
  198. add edx,r8d
  199. xor r14d,edx
  200. rol r14d,8
  201. add esi,r13d
  202. xor r11d,esi
  203. rol r11d,7
  204. add edi,r14d
  205. xor r8d,edi
  206. rol r8d,7
  ; esi/edi hold words 8,9 again here, matching the loop-entry state.
  207. dec ebp
  208. jnz NEAR $L$oop
  ; ---- Feed-forward: add the input state to the permuted state --------
  ; Complete the in-memory copy of working words 8..11 ...
  209. mov DWORD[36+rsp],edi
  210. mov DWORD[32+rsp],esi
  211. mov rbp,QWORD[64+rsp]
  ; ... and add input words 8..11 (still in xmm2) to it via xmm1 below.
  212. movdqa xmm1,xmm2
  213. mov rsi,QWORD[((64+8))+rsp]
  ; Advance the counter for the next block; the stack copy at [48+rsp]
  ; still holds this block's counter for the add below.
  214. paddd xmm3,xmm4
  215. mov rdi,QWORD[((64+16))+rsp]
  216. add eax,0x61707865
  217. add ebx,0x3320646e
  218. add ecx,0x79622d32
  219. add edx,0x6b206574
  220. add r8d,DWORD[16+rsp]
  221. add r9d,DWORD[20+rsp]
  222. add r10d,DWORD[24+rsp]
  223. add r11d,DWORD[28+rsp]
  224. add r12d,DWORD[48+rsp]
  225. add r13d,DWORD[52+rsp]
  226. add r14d,DWORD[56+rsp]
  227. add r15d,DWORD[60+rsp]
  228. paddd xmm1,XMMWORD[32+rsp]
  ; Fewer than 64 bytes left: finish byte by byte (unsigned compare).
  229. cmp rbp,64
  230. jb NEAR $L$tail
  ; ---- Full block: XOR 64 input bytes with the keystream ---------------
  231. xor eax,DWORD[rsi]
  232. xor ebx,DWORD[4+rsi]
  233. xor ecx,DWORD[8+rsi]
  234. xor edx,DWORD[12+rsi]
  235. xor r8d,DWORD[16+rsi]
  236. xor r9d,DWORD[20+rsi]
  237. xor r10d,DWORD[24+rsi]
  238. xor r11d,DWORD[28+rsi]
  239. movdqu xmm0,XMMWORD[32+rsi]
  240. xor r12d,DWORD[48+rsi]
  241. xor r13d,DWORD[52+rsi]
  242. xor r14d,DWORD[56+rsi]
  243. xor r15d,DWORD[60+rsi]
  244. lea rsi,[64+rsi]
  245. pxor xmm0,xmm1
  ; Re-arm the frame for the next block: restore input words 8..11 and
  ; store the already-incremented counter word.
  246. movdqa XMMWORD[32+rsp],xmm2
  247. movd DWORD[48+rsp],xmm3
  248. mov DWORD[rdi],eax
  249. mov DWORD[4+rdi],ebx
  250. mov DWORD[8+rdi],ecx
  251. mov DWORD[12+rdi],edx
  252. mov DWORD[16+rdi],r8d
  253. mov DWORD[20+rdi],r9d
  254. mov DWORD[24+rdi],r10d
  255. mov DWORD[28+rdi],r11d
  256. movdqu XMMWORD[32+rdi],xmm0
  257. mov DWORD[48+rdi],r12d
  258. mov DWORD[52+rdi],r13d
  259. mov DWORD[56+rdi],r14d
  260. mov DWORD[60+rdi],r15d
  261. lea rdi,[64+rdi]
  262. sub rbp,64
  263. jnz NEAR $L$oop_outer
  264. jmp NEAR $L$done
  265. ALIGN 16
  ; ---- Partial final block (1..63 bytes): write the whole keystream
  ; block to [rsp..rsp+63], overwriting the saved input state, which is
  ; no longer needed because this is the last block. ----------------------
  266. $L$tail:
  267. mov DWORD[rsp],eax
  268. mov DWORD[4+rsp],ebx
  ; rbx becomes the byte index (ebx was stored on the line above).
  269. xor rbx,rbx
  270. mov DWORD[8+rsp],ecx
  271. mov DWORD[12+rsp],edx
  272. mov DWORD[16+rsp],r8d
  273. mov DWORD[20+rsp],r9d
  274. mov DWORD[24+rsp],r10d
  275. mov DWORD[28+rsp],r11d
  276. movdqa XMMWORD[32+rsp],xmm1
  277. mov DWORD[48+rsp],r12d
  278. mov DWORD[52+rsp],r13d
  279. mov DWORD[56+rsp],r14d
  280. mov DWORD[60+rsp],r15d
  ; out[i] = in[i] ^ keystream[i] for the rbp remaining bytes.
  281. $L$oop_tail:
  282. movzx eax,BYTE[rbx*1+rsi]
  283. movzx edx,BYTE[rbx*1+rsp]
  284. lea rbx,[1+rbx]
  285. xor eax,edx
  286. mov BYTE[((-1))+rbx*1+rdi],al
  287. dec rbp
  288. jnz NEAR $L$oop_tail
  ; ---- Epilogue: rsi = rsp value at function entry (frame + 6 pushes);
  ; restore the pushed registers relative to it and unwind the stack. ----
  289. $L$done:
  290. lea rsi,[((64+24+48))+rsp]
  291. mov r15,QWORD[((-48))+rsi]
  292. mov r14,QWORD[((-40))+rsi]
  293. mov r13,QWORD[((-32))+rsi]
  294. mov r12,QWORD[((-24))+rsi]
  295. mov rbp,QWORD[((-16))+rsi]
  296. mov rbx,QWORD[((-8))+rsi]
  297. lea rsp,[rsi]
  ; Common exit, also reached directly for zero length: restore the
  ; caller's rdi/rsi from the slots filled by the prologue.
  298. $L$no_data:
  299. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  300. mov rsi,QWORD[16+rsp]
  301. DB 0F3h,0C3h ;repret
  302. $L$SEH_end_ChaCha20_ctr32:
  ;-----------------------------------------------------------------------
  ; ChaCha20_ssse3 -- one-block-at-a-time SSE/SSSE3 path.
  ;
  ; Two ways in:
  ;   * ChaCha20_ssse3: Win64 prologue that remaps rcx,rdx,r8,r9,[rsp+40]
  ;     to rdi,rsi,rdx,rcx,r8 (the label is not declared global here).
  ;   * $L$ChaCha20_ssse3: jumped to from ChaCha20_ctr32 with the arguments
  ;     already remapped and r10 holding the capability bits.
  ; NOTE(review): the prologue path does not load r10, yet the 4-block code
  ; reached via "ja" below tests it -- only the $L$ entry looks safe for
  ; inputs longer than 128 bytes; confirm the first label is never called.
  ;
  ; Working registers (rdi=out, rsi=in, rdx=bytes left, rcx/r8 = pointers
  ; to the 32-byte and 16-byte state inputs):
  ;   xmm0..xmm3 = state rows a,b,c,d (words 0..3, 4..7, 8..11, 12..15)
  ;   xmm4,xmm5  = temporaries        xmm6,xmm7 = $L$rot16 / $L$rot24 masks
  ;   r8         = double rounds left; later the tail byte index
  ;   r9         = rsp at entry to the body (frame base for the epilogue)
  ; Frame: [rsp+0..63] = saved input state (keystream in the tail);
  ;        [r9-40], [r9-24] = caller's xmm6, xmm7 (callee-saved on Win64).
  ;-----------------------------------------------------------------------
  303. ALIGN 32
  304. ChaCha20_ssse3:
  305. mov QWORD[8+rsp],rdi ;WIN64 prologue
  306. mov QWORD[16+rsp],rsi
  307. mov rax,rsp
  308. $L$SEH_begin_ChaCha20_ssse3:
  309. mov rdi,rcx
  310. mov rsi,rdx
  311. mov rdx,r8
  312. mov rcx,r9
  313. mov r8,QWORD[40+rsp]
  314. $L$ChaCha20_ssse3:
  315. mov r9,rsp
  ; More than 128 bytes: hand over to the multi-block code.
  316. cmp rdx,128
  317. ja NEAR $L$ChaCha20_4x
  ; Also the landing point when the 4-block code decides to fall back to
  ; this path; r9 already equals rsp on that route as well.
  318. $L$do_sse3_after_all:
  ; 64 bytes of state + 40 bytes holding xmm6/xmm7. With the ABI's entry
  ; alignment both rsp and r9-40 end up 16-byte aligned, as the aligned
  ; moves below require.
  319. sub rsp,64+40
  320. movaps XMMWORD[(-40)+r9],xmm6
  321. movaps XMMWORD[(-24)+r9],xmm7
  322. $L$ssse3_body:
  323. movdqa xmm0,XMMWORD[$L$sigma]
  324. movdqu xmm1,XMMWORD[rcx]
  325. movdqu xmm2,XMMWORD[16+rcx]
  326. movdqu xmm3,XMMWORD[r8]
  327. movdqa xmm6,XMMWORD[$L$rot16]
  328. movdqa xmm7,XMMWORD[$L$rot24]
  ; Keep a copy of the input state for the feed-forward addition.
  329. movdqa XMMWORD[rsp],xmm0
  330. movdqa XMMWORD[16+rsp],xmm1
  331. movdqa XMMWORD[32+rsp],xmm2
  332. movdqa XMMWORD[48+rsp],xmm3
  ; r8 (pointer already consumed) becomes the double-round counter.
  333. mov r8,10
  334. jmp NEAR $L$oop_ssse3
  335. ALIGN 32
  ; ---- Start of every block after the first: reload the saved state and
  ; bump the low dword of row d (the block counter) by one. --------------
  336. $L$oop_outer_ssse3:
  337. movdqa xmm3,XMMWORD[$L$one]
  338. movdqa xmm0,XMMWORD[rsp]
  339. movdqa xmm1,XMMWORD[16+rsp]
  340. movdqa xmm2,XMMWORD[32+rsp]
  341. paddd xmm3,XMMWORD[48+rsp]
  342. mov r8,10
  343. movdqa XMMWORD[48+rsp],xmm3
  344. jmp NEAR $L$oop_ssse3
  345. ALIGN 32
  ; ---- Double round; each step handles four quarter-rounds at once, one
  ; per dword lane. --------------------------------------------------------
  346. $L$oop_ssse3:
  ; Column round: a += b; d ^= a; d <<<= 16
  347. paddd xmm0,xmm1
  348. pxor xmm3,xmm0
  349. DB 102,15,56,0,222 ; pshufb xmm3,xmm6 (rotate lanes left 16)
  ; c += d; b ^= c; b <<<= 12 (shift pair + or)
  350. paddd xmm2,xmm3
  351. pxor xmm1,xmm2
  352. movdqa xmm4,xmm1
  353. psrld xmm1,20
  354. pslld xmm4,12
  355. por xmm1,xmm4
  ; a += b; d ^= a; d <<<= 8
  356. paddd xmm0,xmm1
  357. pxor xmm3,xmm0
  358. DB 102,15,56,0,223 ; pshufb xmm3,xmm7 (rotate lanes left 8)
  ; c += d; b ^= c; b <<<= 7
  359. paddd xmm2,xmm3
  360. pxor xmm1,xmm2
  361. movdqa xmm4,xmm1
  362. psrld xmm1,25
  363. pslld xmm4,7
  364. por xmm1,xmm4
  ; Rotate the dwords of rows c, b, d by two, one and three positions so
  ; that the diagonals of the state line up in the register columns.
  365. pshufd xmm2,xmm2,78
  366. pshufd xmm1,xmm1,57
  367. pshufd xmm3,xmm3,147
  368. nop
  ; Diagonal round: same four steps on the realigned rows.
  369. paddd xmm0,xmm1
  370. pxor xmm3,xmm0
  371. DB 102,15,56,0,222 ; pshufb xmm3,xmm6 (rotate lanes left 16)
  372. paddd xmm2,xmm3
  373. pxor xmm1,xmm2
  374. movdqa xmm4,xmm1
  375. psrld xmm1,20
  376. pslld xmm4,12
  377. por xmm1,xmm4
  378. paddd xmm0,xmm1
  379. pxor xmm3,xmm0
  380. DB 102,15,56,0,223 ; pshufb xmm3,xmm7 (rotate lanes left 8)
  381. paddd xmm2,xmm3
  382. pxor xmm1,xmm2
  383. movdqa xmm4,xmm1
  384. psrld xmm1,25
  385. pslld xmm4,7
  386. por xmm1,xmm4
  ; Undo the lane rotation (b and d use the inverse shuffles).
  387. pshufd xmm2,xmm2,78
  388. pshufd xmm1,xmm1,147
  389. pshufd xmm3,xmm3,57
  390. dec r8
  391. jnz NEAR $L$oop_ssse3
  ; Feed-forward: add the saved input state.
  392. paddd xmm0,XMMWORD[rsp]
  393. paddd xmm1,XMMWORD[16+rsp]
  394. paddd xmm2,XMMWORD[32+rsp]
  395. paddd xmm3,XMMWORD[48+rsp]
  ; Fewer than 64 bytes left: byte-wise tail (unsigned compare).
  396. cmp rdx,64
  397. jb NEAR $L$tail_ssse3
  ; Full block: XOR 64 input bytes and store them (unaligned accesses).
  398. movdqu xmm4,XMMWORD[rsi]
  399. movdqu xmm5,XMMWORD[16+rsi]
  400. pxor xmm0,xmm4
  401. movdqu xmm4,XMMWORD[32+rsi]
  402. pxor xmm1,xmm5
  403. movdqu xmm5,XMMWORD[48+rsi]
  404. lea rsi,[64+rsi]
  405. pxor xmm2,xmm4
  406. pxor xmm3,xmm5
  407. movdqu XMMWORD[rdi],xmm0
  408. movdqu XMMWORD[16+rdi],xmm1
  409. movdqu XMMWORD[32+rdi],xmm2
  410. movdqu XMMWORD[48+rdi],xmm3
  411. lea rdi,[64+rdi]
  412. sub rdx,64
  413. jnz NEAR $L$oop_outer_ssse3
  414. jmp NEAR $L$done_ssse3
  415. ALIGN 16
  ; ---- Partial final block: spill the keystream over the saved state
  ; (no further block will need it) and XOR one byte at a time. ----------
  416. $L$tail_ssse3:
  417. movdqa XMMWORD[rsp],xmm0
  418. movdqa XMMWORD[16+rsp],xmm1
  419. movdqa XMMWORD[32+rsp],xmm2
  420. movdqa XMMWORD[48+rsp],xmm3
  ; r8 = byte index.
  421. xor r8,r8
  422. $L$oop_tail_ssse3:
  423. movzx eax,BYTE[r8*1+rsi]
  424. movzx ecx,BYTE[r8*1+rsp]
  425. lea r8,[1+r8]
  426. xor eax,ecx
  427. mov BYTE[((-1))+r8*1+rdi],al
  428. dec rdx
  429. jnz NEAR $L$oop_tail_ssse3
  ; ---- Epilogue: restore xmm6/xmm7, drop the frame, restore rdi/rsi
  ; from the slots written by whichever prologue was executed. -----------
  430. $L$done_ssse3:
  431. movaps xmm6,XMMWORD[((-40))+r9]
  432. movaps xmm7,XMMWORD[((-24))+r9]
  433. lea rsp,[r9]
  434. $L$ssse3_epilogue:
  435. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  436. mov rsi,QWORD[16+rsp]
  437. DB 0F3h,0C3h ;repret
  438. $L$SEH_end_ChaCha20_ssse3:
  ;-----------------------------------------------------------------------
  ; ChaCha20_4x -- four blocks (256 bytes) per iteration using xmm
  ; registers; lane i of every register belongs to block i.
  ;
  ; Two ways in:
  ;   * ChaCha20_4x: Win64 prologue remapping rcx,rdx,r8,r9,[rsp+40] to
  ;     rdi,rsi,rdx,rcx,r8 (the label is not declared global here).
  ;   * $L$ChaCha20_4x: jumped to from the SSSE3 path with the arguments
  ;     already remapped and r10 = capability bits loaded by ChaCha20_ctr32.
  ; NOTE(review): the prologue path never loads r10 although the dispatch
  ; below reads it -- confirm that only the $L$ entry is ever used.
  ;
  ; Dispatch at entry:
  ;   bit 5 of the upper capability dword set      -> $L$ChaCha20_8x
  ;   length <= 192 and, in the lower dword, bit 22 set with bit 26 clear
  ;                                                 -> $L$do_sse3_after_all
  ;   otherwise                                     -> 4-block code below
  ;
  ; Frame (rsp 16-byte aligned given the ABI's entry alignment; rcx is
  ; re-pointed to rsp+256 so the offsets written as (N-256)+rcx address
  ; rsp+N):
  ;   [rsp+  0.. 63]  scratch: row-c spill slots in the round loop, then
  ;                   transposed words 0..3 of the four output blocks
  ;   [rsp+ 64..127]  words 0..3, each broadcast to four lanes
  ;   [rsp+128..255]  words 4..11, each broadcast to four lanes
  ;   [rsp+256..271]  word 12: per-lane block counters
  ;   [rsp+272..319]  words 13..15, each broadcast to four lanes
  ;   [r9-168..r9-9]  caller's xmm6..xmm15 (callee-saved on Win64)
  ;
  ; Round-loop registers:
  ;   xmm8..xmm11 = words 0..3     xmm12..xmm15 = words 4..7
  ;   xmm4,xmm5   = words 8,9 or 10,11 (other pair spilled to the stack)
  ;   xmm0..xmm3  = words 12..15
  ;   xmm6,xmm7   = shift temporaries, reloaded with the $L$rot24/$L$rot16
  ;                 masks through r11/r10 before each pshufb pair
  ;   eax = double rounds left; rdi/rsi/rdx = out/in/bytes left
  ;-----------------------------------------------------------------------
  439. ALIGN 32
  440. ChaCha20_4x:
  441. mov QWORD[8+rsp],rdi ;WIN64 prologue
  442. mov QWORD[16+rsp],rsi
  443. mov rax,rsp
  444. $L$SEH_begin_ChaCha20_4x:
  445. mov rdi,rcx
  446. mov rsi,rdx
  447. mov rdx,r8
  448. mov rcx,r9
  449. mov r8,QWORD[40+rsp]
  450. $L$ChaCha20_4x:
  451. mov r9,rsp
  ; r11 keeps the lower capability dword; r10 is shifted down to expose
  ; the upper one.
  452. mov r11,r10
  453. shr r10,32
  454. test r10,32
  455. jnz NEAR $L$ChaCha20_8x
  ; Above 192 bytes the 4-block code is always used.
  456. cmp rdx,192
  457. ja NEAR $L$proceed4x
  ; 71303168 = (1<<26)|(1<<22), 4194304 = 1<<22: fall back to the
  ; single-block SSSE3 code when bit 22 is set and bit 26 is clear.
  ; NOTE(review): presumably MOVBE without XSAVE, used to recognise a
  ; particular CPU family -- confirm against the OPENSSL_ia32cap_P layout.
  458. and r11,71303168
  459. cmp r11,4194304
  460. je NEAR $L$do_sse3_after_all
  461. $L$proceed4x:
  ; 0x140 bytes of state + 168 bytes for the ten saved xmm registers.
  462. sub rsp,0x140+168
  463. movaps XMMWORD[(-168)+r9],xmm6
  464. movaps XMMWORD[(-152)+r9],xmm7
  465. movaps XMMWORD[(-136)+r9],xmm8
  466. movaps XMMWORD[(-120)+r9],xmm9
  467. movaps XMMWORD[(-104)+r9],xmm10
  468. movaps XMMWORD[(-88)+r9],xmm11
  469. movaps XMMWORD[(-72)+r9],xmm12
  470. movaps XMMWORD[(-56)+r9],xmm13
  471. movaps XMMWORD[(-40)+r9],xmm14
  472. movaps XMMWORD[(-24)+r9],xmm15
  473. $L$4x_body:
  ; Load the four 16-byte rows of the input state.
  474. movdqa xmm11,XMMWORD[$L$sigma]
  475. movdqu xmm15,XMMWORD[rcx]
  476. movdqu xmm7,XMMWORD[16+rcx]
  477. movdqu xmm3,XMMWORD[r8]
  ; rcx (pointer consumed above) now addresses the middle of the frame;
  ; r10/r11 point at the two pshufb masks.
  478. lea rcx,[256+rsp]
  479. lea r10,[$L$rot16]
  480. lea r11,[$L$rot24]
  ; Broadcast each state word to all four lanes (pshufd 0x00/0x55/0xaa/0xff
  ; selects dword 0/1/2/3) and save the broadcast copies in the frame.
  481. pshufd xmm8,xmm11,0x00
  482. pshufd xmm9,xmm11,0x55
  483. movdqa XMMWORD[64+rsp],xmm8
  484. pshufd xmm10,xmm11,0xaa
  485. movdqa XMMWORD[80+rsp],xmm9
  486. pshufd xmm11,xmm11,0xff
  487. movdqa XMMWORD[96+rsp],xmm10
  488. movdqa XMMWORD[112+rsp],xmm11
  489. pshufd xmm12,xmm15,0x00
  490. pshufd xmm13,xmm15,0x55
  491. movdqa XMMWORD[(128-256)+rcx],xmm12
  492. pshufd xmm14,xmm15,0xaa
  493. movdqa XMMWORD[(144-256)+rcx],xmm13
  494. pshufd xmm15,xmm15,0xff
  495. movdqa XMMWORD[(160-256)+rcx],xmm14
  496. movdqa XMMWORD[(176-256)+rcx],xmm15
  497. pshufd xmm4,xmm7,0x00
  498. pshufd xmm5,xmm7,0x55
  499. movdqa XMMWORD[(192-256)+rcx],xmm4
  500. pshufd xmm6,xmm7,0xaa
  501. movdqa XMMWORD[(208-256)+rcx],xmm5
  502. pshufd xmm7,xmm7,0xff
  503. movdqa XMMWORD[(224-256)+rcx],xmm6
  504. movdqa XMMWORD[(240-256)+rcx],xmm7
  505. pshufd xmm0,xmm3,0x00
  506. pshufd xmm1,xmm3,0x55
  ; Lane i gets counter+i; the result is stored at $L$oop_enter4x.
  507. paddd xmm0,XMMWORD[$L$inc]
  508. pshufd xmm2,xmm3,0xaa
  509. movdqa XMMWORD[(272-256)+rcx],xmm1
  510. pshufd xmm3,xmm3,0xff
  511. movdqa XMMWORD[(288-256)+rcx],xmm2
  512. movdqa XMMWORD[(304-256)+rcx],xmm3
  513. jmp NEAR $L$oop_enter4x
  514. ALIGN 32
  ; ---- Every batch after the first: reload the broadcast input state
  ; and advance all four lane counters by 4. -----------------------------
  515. $L$oop_outer4x:
  516. movdqa xmm8,XMMWORD[64+rsp]
  517. movdqa xmm9,XMMWORD[80+rsp]
  518. movdqa xmm10,XMMWORD[96+rsp]
  519. movdqa xmm11,XMMWORD[112+rsp]
  520. movdqa xmm12,XMMWORD[((128-256))+rcx]
  521. movdqa xmm13,XMMWORD[((144-256))+rcx]
  522. movdqa xmm14,XMMWORD[((160-256))+rcx]
  523. movdqa xmm15,XMMWORD[((176-256))+rcx]
  524. movdqa xmm4,XMMWORD[((192-256))+rcx]
  525. movdqa xmm5,XMMWORD[((208-256))+rcx]
  526. movdqa xmm6,XMMWORD[((224-256))+rcx]
  527. movdqa xmm7,XMMWORD[((240-256))+rcx]
  528. movdqa xmm0,XMMWORD[((256-256))+rcx]
  529. movdqa xmm1,XMMWORD[((272-256))+rcx]
  530. movdqa xmm2,XMMWORD[((288-256))+rcx]
  531. movdqa xmm3,XMMWORD[((304-256))+rcx]
  532. paddd xmm0,XMMWORD[$L$four]
  533. $L$oop_enter4x:
  ; Words 10,11 wait in the spill slots; xmm6/xmm7 are thereby freed.
  534. movdqa XMMWORD[32+rsp],xmm6
  535. movdqa XMMWORD[48+rsp],xmm7
  ; xmm7 = rot16 mask for the first pshufb pair.
  536. movdqa xmm7,XMMWORD[r10]
  ; 10 iterations of (column round + diagonal round) = 20 rounds.
  537. mov eax,10
  ; Remember this batch's lane counters for the feed-forward addition.
  538. movdqa XMMWORD[(256-256)+rcx],xmm0
  539. jmp NEAR $L$oop4x
  540. ALIGN 32
  ; ---- Double round. Two quarter-rounds are interleaved at a time; each
  ; operates on all four blocks in parallel. Rotations by 16 and 8 use
  ; pshufb, rotations by 12 and 7 use shift pairs. -------------------------
  541. $L$oop4x:
  ; Column round, QR(0,4,8,12) and QR(1,5,9,13).
  542. paddd xmm8,xmm12
  543. paddd xmm9,xmm13
  544. pxor xmm0,xmm8
  545. pxor xmm1,xmm9
  546. DB 102,15,56,0,199 ; pshufb xmm0,xmm7
  547. DB 102,15,56,0,207 ; pshufb xmm1,xmm7
  548. paddd xmm4,xmm0
  549. paddd xmm5,xmm1
  550. pxor xmm12,xmm4
  551. pxor xmm13,xmm5
  552. movdqa xmm6,xmm12
  553. pslld xmm12,12
  554. psrld xmm6,20
  555. movdqa xmm7,xmm13
  556. pslld xmm13,12
  557. por xmm12,xmm6
  558. psrld xmm7,20
  ; xmm6 = rot24 mask for the next pshufb pair.
  559. movdqa xmm6,XMMWORD[r11]
  560. por xmm13,xmm7
  561. paddd xmm8,xmm12
  562. paddd xmm9,xmm13
  563. pxor xmm0,xmm8
  564. pxor xmm1,xmm9
  565. DB 102,15,56,0,198 ; pshufb xmm0,xmm6
  566. DB 102,15,56,0,206 ; pshufb xmm1,xmm6
  567. paddd xmm4,xmm0
  568. paddd xmm5,xmm1
  569. pxor xmm12,xmm4
  570. pxor xmm13,xmm5
  571. movdqa xmm7,xmm12
  572. pslld xmm12,7
  573. psrld xmm7,25
  574. movdqa xmm6,xmm13
  575. pslld xmm13,7
  576. por xmm12,xmm7
  577. psrld xmm6,25
  578. movdqa xmm7,XMMWORD[r10]
  579. por xmm13,xmm6
  ; Swap the c-row pair: spill words 8,9 and bring in words 10,11.
  580. movdqa XMMWORD[rsp],xmm4
  581. movdqa XMMWORD[16+rsp],xmm5
  582. movdqa xmm4,XMMWORD[32+rsp]
  583. movdqa xmm5,XMMWORD[48+rsp]
  ; Column round, QR(2,6,10,14) and QR(3,7,11,15).
  584. paddd xmm10,xmm14
  585. paddd xmm11,xmm15
  586. pxor xmm2,xmm10
  587. pxor xmm3,xmm11
  588. DB 102,15,56,0,215 ; pshufb xmm2,xmm7
  589. DB 102,15,56,0,223 ; pshufb xmm3,xmm7
  590. paddd xmm4,xmm2
  591. paddd xmm5,xmm3
  592. pxor xmm14,xmm4
  593. pxor xmm15,xmm5
  594. movdqa xmm6,xmm14
  595. pslld xmm14,12
  596. psrld xmm6,20
  597. movdqa xmm7,xmm15
  598. pslld xmm15,12
  599. por xmm14,xmm6
  600. psrld xmm7,20
  601. movdqa xmm6,XMMWORD[r11]
  602. por xmm15,xmm7
  603. paddd xmm10,xmm14
  604. paddd xmm11,xmm15
  605. pxor xmm2,xmm10
  606. pxor xmm3,xmm11
  607. DB 102,15,56,0,214 ; pshufb xmm2,xmm6
  608. DB 102,15,56,0,222 ; pshufb xmm3,xmm6
  609. paddd xmm4,xmm2
  610. paddd xmm5,xmm3
  611. pxor xmm14,xmm4
  612. pxor xmm15,xmm5
  613. movdqa xmm7,xmm14
  614. pslld xmm14,7
  615. psrld xmm7,25
  616. movdqa xmm6,xmm15
  617. pslld xmm15,7
  618. por xmm14,xmm7
  619. psrld xmm6,25
  620. movdqa xmm7,XMMWORD[r10]
  621. por xmm15,xmm6
  ; Diagonal round, QR(0,5,10,15) and QR(1,6,11,12); xmm4/xmm5 still hold
  ; words 10,11.
  622. paddd xmm8,xmm13
  623. paddd xmm9,xmm14
  624. pxor xmm3,xmm8
  625. pxor xmm0,xmm9
  626. DB 102,15,56,0,223 ; pshufb xmm3,xmm7
  627. DB 102,15,56,0,199 ; pshufb xmm0,xmm7
  628. paddd xmm4,xmm3
  629. paddd xmm5,xmm0
  630. pxor xmm13,xmm4
  631. pxor xmm14,xmm5
  632. movdqa xmm6,xmm13
  633. pslld xmm13,12
  634. psrld xmm6,20
  635. movdqa xmm7,xmm14
  636. pslld xmm14,12
  637. por xmm13,xmm6
  638. psrld xmm7,20
  639. movdqa xmm6,XMMWORD[r11]
  640. por xmm14,xmm7
  641. paddd xmm8,xmm13
  642. paddd xmm9,xmm14
  643. pxor xmm3,xmm8
  644. pxor xmm0,xmm9
  645. DB 102,15,56,0,222 ; pshufb xmm3,xmm6
  646. DB 102,15,56,0,198 ; pshufb xmm0,xmm6
  647. paddd xmm4,xmm3
  648. paddd xmm5,xmm0
  649. pxor xmm13,xmm4
  650. pxor xmm14,xmm5
  651. movdqa xmm7,xmm13
  652. pslld xmm13,7
  653. psrld xmm7,25
  654. movdqa xmm6,xmm14
  655. pslld xmm14,7
  656. por xmm13,xmm7
  657. psrld xmm6,25
  658. movdqa xmm7,XMMWORD[r10]
  659. por xmm14,xmm6
  ; Swap back: spill words 10,11 and reload words 8,9.
  660. movdqa XMMWORD[32+rsp],xmm4
  661. movdqa XMMWORD[48+rsp],xmm5
  662. movdqa xmm4,XMMWORD[rsp]
  663. movdqa xmm5,XMMWORD[16+rsp]
  ; Diagonal round, QR(2,7,8,13) and QR(3,4,9,14).
  664. paddd xmm10,xmm15
  665. paddd xmm11,xmm12
  666. pxor xmm1,xmm10
  667. pxor xmm2,xmm11
  668. DB 102,15,56,0,207 ; pshufb xmm1,xmm7
  669. DB 102,15,56,0,215 ; pshufb xmm2,xmm7
  670. paddd xmm4,xmm1
  671. paddd xmm5,xmm2
  672. pxor xmm15,xmm4
  673. pxor xmm12,xmm5
  674. movdqa xmm6,xmm15
  675. pslld xmm15,12
  676. psrld xmm6,20
  677. movdqa xmm7,xmm12
  678. pslld xmm12,12
  679. por xmm15,xmm6
  680. psrld xmm7,20
  681. movdqa xmm6,XMMWORD[r11]
  682. por xmm12,xmm7
  683. paddd xmm10,xmm15
  684. paddd xmm11,xmm12
  685. pxor xmm1,xmm10
  686. pxor xmm2,xmm11
  687. DB 102,15,56,0,206 ; pshufb xmm1,xmm6
  688. DB 102,15,56,0,214 ; pshufb xmm2,xmm6
  689. paddd xmm4,xmm1
  690. paddd xmm5,xmm2
  691. pxor xmm15,xmm4
  692. pxor xmm12,xmm5
  693. movdqa xmm7,xmm15
  694. pslld xmm15,7
  695. psrld xmm7,25
  696. movdqa xmm6,xmm12
  697. pslld xmm12,7
  698. por xmm15,xmm7
  699. psrld xmm6,25
  ; xmm7 = rot16 mask again, as expected at the top of the loop.
  700. movdqa xmm7,XMMWORD[r10]
  701. por xmm12,xmm6
  702. dec eax
  703. jnz NEAR $L$oop4x
  ; ---- Feed-forward and transpose. Each group of four registers holds
  ; one state row for four blocks, word-major; after adding the saved
  ; input the punpck sequence transposes the 4x4 dword matrix so that
  ; every register holds 16 consecutive keystream bytes of one block. ----
  ; Row a (words 0..3): results in xmm8, xmm9, xmm6, xmm11 = blocks 0..3.
  704. paddd xmm8,XMMWORD[64+rsp]
  705. paddd xmm9,XMMWORD[80+rsp]
  706. paddd xmm10,XMMWORD[96+rsp]
  707. paddd xmm11,XMMWORD[112+rsp]
  708. movdqa xmm6,xmm8
  709. punpckldq xmm8,xmm9
  710. movdqa xmm7,xmm10
  711. punpckldq xmm10,xmm11
  712. punpckhdq xmm6,xmm9
  713. punpckhdq xmm7,xmm11
  714. movdqa xmm9,xmm8
  715. punpcklqdq xmm8,xmm10
  716. movdqa xmm11,xmm6
  717. punpcklqdq xmm6,xmm7
  718. punpckhqdq xmm9,xmm10
  719. punpckhqdq xmm11,xmm7
  ; Row b (words 4..7): results in xmm12, xmm13, xmm10, xmm15.
  720. paddd xmm12,XMMWORD[((128-256))+rcx]
  721. paddd xmm13,XMMWORD[((144-256))+rcx]
  722. paddd xmm14,XMMWORD[((160-256))+rcx]
  723. paddd xmm15,XMMWORD[((176-256))+rcx]
  ; Park row a of blocks 0,1 and fetch working words 10,11 from the
  ; spill slots into the registers just freed.
  724. movdqa XMMWORD[rsp],xmm8
  725. movdqa XMMWORD[16+rsp],xmm9
  726. movdqa xmm8,XMMWORD[32+rsp]
  727. movdqa xmm9,XMMWORD[48+rsp]
  728. movdqa xmm10,xmm12
  729. punpckldq xmm12,xmm13
  730. movdqa xmm7,xmm14
  731. punpckldq xmm14,xmm15
  732. punpckhdq xmm10,xmm13
  733. punpckhdq xmm7,xmm15
  734. movdqa xmm13,xmm12
  735. punpcklqdq xmm12,xmm14
  736. movdqa xmm15,xmm10
  737. punpcklqdq xmm10,xmm7
  738. punpckhqdq xmm13,xmm14
  739. punpckhqdq xmm15,xmm7
  ; Row c (words 8..11): results in xmm4, xmm5, xmm14, xmm9.
  740. paddd xmm4,XMMWORD[((192-256))+rcx]
  741. paddd xmm5,XMMWORD[((208-256))+rcx]
  742. paddd xmm8,XMMWORD[((224-256))+rcx]
  743. paddd xmm9,XMMWORD[((240-256))+rcx]
  ; Park row a of blocks 2,3.
  744. movdqa XMMWORD[32+rsp],xmm6
  745. movdqa XMMWORD[48+rsp],xmm11
  746. movdqa xmm14,xmm4
  747. punpckldq xmm4,xmm5
  748. movdqa xmm7,xmm8
  749. punpckldq xmm8,xmm9
  750. punpckhdq xmm14,xmm5
  751. punpckhdq xmm7,xmm9
  752. movdqa xmm5,xmm4
  753. punpcklqdq xmm4,xmm8
  754. movdqa xmm9,xmm14
  755. punpcklqdq xmm14,xmm7
  756. punpckhqdq xmm5,xmm8
  757. punpckhqdq xmm9,xmm7
  ; Row d (words 12..15): results in xmm0, xmm1, xmm8, xmm3.
  758. paddd xmm0,XMMWORD[((256-256))+rcx]
  759. paddd xmm1,XMMWORD[((272-256))+rcx]
  760. paddd xmm2,XMMWORD[((288-256))+rcx]
  761. paddd xmm3,XMMWORD[((304-256))+rcx]
  762. movdqa xmm8,xmm0
  763. punpckldq xmm0,xmm1
  764. movdqa xmm7,xmm2
  765. punpckldq xmm2,xmm3
  766. punpckhdq xmm8,xmm1
  767. punpckhdq xmm7,xmm3
  768. movdqa xmm1,xmm0
  769. punpcklqdq xmm0,xmm2
  770. movdqa xmm3,xmm8
  771. punpcklqdq xmm8,xmm7
  772. punpckhqdq xmm1,xmm2
  773. punpckhqdq xmm3,xmm7
  ; Keystream layout at this point (16 bytes each, in output order):
  ;   block 0: [rsp+0],  xmm12, xmm4,  xmm0
  ;   block 1: [rsp+16], xmm13, xmm5,  xmm1
  ;   block 2: [rsp+32], xmm10, xmm14, xmm8
  ;   block 3: [rsp+48], xmm15, xmm9,  xmm3
  ; Fewer than 256 bytes left: go to the tail dispatcher (unsigned).
  774. cmp rdx,64*4
  775. jb NEAR $L$tail4x
  ; ---- Full batch: XOR and store 256 bytes; xmm6, xmm11, xmm2, xmm7 are
  ; the input staging registers. rsi/rdi advance in two steps of 128. -----
  776. movdqu xmm6,XMMWORD[rsi]
  777. movdqu xmm11,XMMWORD[16+rsi]
  778. movdqu xmm2,XMMWORD[32+rsi]
  779. movdqu xmm7,XMMWORD[48+rsi]
  780. pxor xmm6,XMMWORD[rsp]
  781. pxor xmm11,xmm12
  782. pxor xmm2,xmm4
  783. pxor xmm7,xmm0
  784. movdqu XMMWORD[rdi],xmm6
  785. movdqu xmm6,XMMWORD[64+rsi]
  786. movdqu XMMWORD[16+rdi],xmm11
  787. movdqu xmm11,XMMWORD[80+rsi]
  788. movdqu XMMWORD[32+rdi],xmm2
  789. movdqu xmm2,XMMWORD[96+rsi]
  790. movdqu XMMWORD[48+rdi],xmm7
  791. movdqu xmm7,XMMWORD[112+rsi]
  792. lea rsi,[128+rsi]
  793. pxor xmm6,XMMWORD[16+rsp]
  794. pxor xmm11,xmm13
  795. pxor xmm2,xmm5
  796. pxor xmm7,xmm1
  797. movdqu XMMWORD[64+rdi],xmm6
  798. movdqu xmm6,XMMWORD[rsi]
  799. movdqu XMMWORD[80+rdi],xmm11
  800. movdqu xmm11,XMMWORD[16+rsi]
  801. movdqu XMMWORD[96+rdi],xmm2
  802. movdqu xmm2,XMMWORD[32+rsi]
  803. movdqu XMMWORD[112+rdi],xmm7
  804. lea rdi,[128+rdi]
  805. movdqu xmm7,XMMWORD[48+rsi]
  806. pxor xmm6,XMMWORD[32+rsp]
  807. pxor xmm11,xmm10
  808. pxor xmm2,xmm14
  809. pxor xmm7,xmm8
  810. movdqu XMMWORD[rdi],xmm6
  811. movdqu xmm6,XMMWORD[64+rsi]
  812. movdqu XMMWORD[16+rdi],xmm11
  813. movdqu xmm11,XMMWORD[80+rsi]
  814. movdqu XMMWORD[32+rdi],xmm2
  815. movdqu xmm2,XMMWORD[96+rsi]
  816. movdqu XMMWORD[48+rdi],xmm7
  817. movdqu xmm7,XMMWORD[112+rsi]
  818. lea rsi,[128+rsi]
  819. pxor xmm6,XMMWORD[48+rsp]
  820. pxor xmm11,xmm15
  821. pxor xmm2,xmm9
  822. pxor xmm7,xmm3
  823. movdqu XMMWORD[64+rdi],xmm6
  824. movdqu XMMWORD[80+rdi],xmm11
  825. movdqu XMMWORD[96+rdi],xmm2
  826. movdqu XMMWORD[112+rdi],xmm7
  827. lea rdi,[128+rdi]
  828. sub rdx,64*4
  829. jnz NEAR $L$oop_outer4x
  830. jmp NEAR $L$done4x
  ; ---- Tail: 1..255 bytes remain. Whole 64-byte blocks are XORed
  ; directly; the keystream of the first incomplete block is laid out at
  ; [rsp..rsp+63] for the byte loop at $L$oop_tail4x (r10 = byte index). --
  831. $L$tail4x:
  832. cmp rdx,192
  833. jae NEAR $L$192_or_more4x
  834. cmp rdx,128
  835. jae NEAR $L$128_or_more4x
  836. cmp rdx,64
  837. jae NEAR $L$64_or_more4x
  ; Under 64 bytes: block 0 is the partial block; its first 16 keystream
  ; bytes are already at [rsp].
  838. xor r10,r10
  839. movdqa XMMWORD[16+rsp],xmm12
  840. movdqa XMMWORD[32+rsp],xmm4
  841. movdqa XMMWORD[48+rsp],xmm0
  842. jmp NEAR $L$oop_tail4x
  843. ALIGN 32
  ; 64..127 bytes: emit block 0, then stage block 1.
  844. $L$64_or_more4x:
  845. movdqu xmm6,XMMWORD[rsi]
  846. movdqu xmm11,XMMWORD[16+rsi]
  847. movdqu xmm2,XMMWORD[32+rsi]
  848. movdqu xmm7,XMMWORD[48+rsi]
  849. pxor xmm6,XMMWORD[rsp]
  850. pxor xmm11,xmm12
  851. pxor xmm2,xmm4
  852. pxor xmm7,xmm0
  853. movdqu XMMWORD[rdi],xmm6
  854. movdqu XMMWORD[16+rdi],xmm11
  855. movdqu XMMWORD[32+rdi],xmm2
  856. movdqu XMMWORD[48+rdi],xmm7
  ; ZF still reflects "cmp rdx,64" in $L$tail4x (the SSE moves and pxor in
  ; between leave the flags alone): exactly 64 bytes means we are done.
  857. je NEAR $L$done4x
  858. movdqa xmm6,XMMWORD[16+rsp]
  859. lea rsi,[64+rsi]
  860. xor r10,r10
  861. movdqa XMMWORD[rsp],xmm6
  862. movdqa XMMWORD[16+rsp],xmm13
  863. lea rdi,[64+rdi]
  864. movdqa XMMWORD[32+rsp],xmm5
  865. sub rdx,64
  866. movdqa XMMWORD[48+rsp],xmm1
  867. jmp NEAR $L$oop_tail4x
  868. ALIGN 32
  ; 128..191 bytes: emit blocks 0 and 1, then stage block 2.
  869. $L$128_or_more4x:
  870. movdqu xmm6,XMMWORD[rsi]
  871. movdqu xmm11,XMMWORD[16+rsi]
  872. movdqu xmm2,XMMWORD[32+rsi]
  873. movdqu xmm7,XMMWORD[48+rsi]
  874. pxor xmm6,XMMWORD[rsp]
  875. pxor xmm11,xmm12
  876. pxor xmm2,xmm4
  877. pxor xmm7,xmm0
  878. movdqu XMMWORD[rdi],xmm6
  879. movdqu xmm6,XMMWORD[64+rsi]
  880. movdqu XMMWORD[16+rdi],xmm11
  881. movdqu xmm11,XMMWORD[80+rsi]
  882. movdqu XMMWORD[32+rdi],xmm2
  883. movdqu xmm2,XMMWORD[96+rsi]
  884. movdqu XMMWORD[48+rdi],xmm7
  885. movdqu xmm7,XMMWORD[112+rsi]
  886. pxor xmm6,XMMWORD[16+rsp]
  887. pxor xmm11,xmm13
  888. pxor xmm2,xmm5
  889. pxor xmm7,xmm1
  890. movdqu XMMWORD[64+rdi],xmm6
  891. movdqu XMMWORD[80+rdi],xmm11
  892. movdqu XMMWORD[96+rdi],xmm2
  893. movdqu XMMWORD[112+rdi],xmm7
  ; ZF still reflects "cmp rdx,128" in $L$tail4x.
  894. je NEAR $L$done4x
  895. movdqa xmm6,XMMWORD[32+rsp]
  896. lea rsi,[128+rsi]
  897. xor r10,r10
  898. movdqa XMMWORD[rsp],xmm6
  899. movdqa XMMWORD[16+rsp],xmm10
  900. lea rdi,[128+rdi]
  901. movdqa XMMWORD[32+rsp],xmm14
  902. sub rdx,128
  903. movdqa XMMWORD[48+rsp],xmm8
  904. jmp NEAR $L$oop_tail4x
  905. ALIGN 32
  ; 192..255 bytes: emit blocks 0..2, then stage block 3.
  906. $L$192_or_more4x:
  907. movdqu xmm6,XMMWORD[rsi]
  908. movdqu xmm11,XMMWORD[16+rsi]
  909. movdqu xmm2,XMMWORD[32+rsi]
  910. movdqu xmm7,XMMWORD[48+rsi]
  911. pxor xmm6,XMMWORD[rsp]
  912. pxor xmm11,xmm12
  913. pxor xmm2,xmm4
  914. pxor xmm7,xmm0
  915. movdqu XMMWORD[rdi],xmm6
  916. movdqu xmm6,XMMWORD[64+rsi]
  917. movdqu XMMWORD[16+rdi],xmm11
  918. movdqu xmm11,XMMWORD[80+rsi]
  919. movdqu XMMWORD[32+rdi],xmm2
  920. movdqu xmm2,XMMWORD[96+rsi]
  921. movdqu XMMWORD[48+rdi],xmm7
  922. movdqu xmm7,XMMWORD[112+rsi]
  923. lea rsi,[128+rsi]
  924. pxor xmm6,XMMWORD[16+rsp]
  925. pxor xmm11,xmm13
  926. pxor xmm2,xmm5
  927. pxor xmm7,xmm1
  928. movdqu XMMWORD[64+rdi],xmm6
  929. movdqu xmm6,XMMWORD[rsi]
  930. movdqu XMMWORD[80+rdi],xmm11
  931. movdqu xmm11,XMMWORD[16+rsi]
  932. movdqu XMMWORD[96+rdi],xmm2
  933. movdqu xmm2,XMMWORD[32+rsi]
  934. movdqu XMMWORD[112+rdi],xmm7
  935. lea rdi,[128+rdi]
  936. movdqu xmm7,XMMWORD[48+rsi]
  937. pxor xmm6,XMMWORD[32+rsp]
  938. pxor xmm11,xmm10
  939. pxor xmm2,xmm14
  940. pxor xmm7,xmm8
  941. movdqu XMMWORD[rdi],xmm6
  942. movdqu XMMWORD[16+rdi],xmm11
  943. movdqu XMMWORD[32+rdi],xmm2
  944. movdqu XMMWORD[48+rdi],xmm7
  ; ZF still reflects "cmp rdx,192" in $L$tail4x (lea does not touch the
  ; flags either).
  945. je NEAR $L$done4x
  946. movdqa xmm6,XMMWORD[48+rsp]
  ; rsi/rdi were already advanced by 128 above; +64 makes 192 in total.
  947. lea rsi,[64+rsi]
  948. xor r10,r10
  949. movdqa XMMWORD[rsp],xmm6
  950. movdqa XMMWORD[16+rsp],xmm15
  951. lea rdi,[64+rdi]
  952. movdqa XMMWORD[32+rsp],xmm9
  953. sub rdx,192
  954. movdqa XMMWORD[48+rsp],xmm3
  ; out[i] = in[i] ^ keystream[i] for the rdx remaining bytes.
  955. $L$oop_tail4x:
  956. movzx eax,BYTE[r10*1+rsi]
  957. movzx ecx,BYTE[r10*1+rsp]
  958. lea r10,[1+r10]
  959. xor eax,ecx
  960. mov BYTE[((-1))+r10*1+rdi],al
  961. dec rdx
  962. jnz NEAR $L$oop_tail4x
  ; ---- Epilogue: restore xmm6..xmm15, drop the frame, restore rdi/rsi
  ; from the slots written by whichever prologue was executed. -----------
  963. $L$done4x:
  964. movaps xmm6,XMMWORD[((-168))+r9]
  965. movaps xmm7,XMMWORD[((-152))+r9]
  966. movaps xmm8,XMMWORD[((-136))+r9]
  967. movaps xmm9,XMMWORD[((-120))+r9]
  968. movaps xmm10,XMMWORD[((-104))+r9]
  969. movaps xmm11,XMMWORD[((-88))+r9]
  970. movaps xmm12,XMMWORD[((-72))+r9]
  971. movaps xmm13,XMMWORD[((-56))+r9]
  972. movaps xmm14,XMMWORD[((-40))+r9]
  973. movaps xmm15,XMMWORD[((-24))+r9]
  974. lea rsp,[r9]
  975. $L$4x_epilogue:
  976. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  977. mov rsi,QWORD[16+rsp]
  978. DB 0F3h,0C3h ;repret
  979. $L$SEH_end_ChaCha20_4x:
  980. ALIGN 32
  981. ChaCha20_8x:
  982. mov QWORD[8+rsp],rdi ;WIN64 prologue
  983. mov QWORD[16+rsp],rsi
  984. mov rax,rsp
  985. $L$SEH_begin_ChaCha20_8x:
  986. mov rdi,rcx
  987. mov rsi,rdx
  988. mov rdx,r8
  989. mov rcx,r9
  990. mov r8,QWORD[40+rsp]
  991. $L$ChaCha20_8x:
  992. mov r9,rsp
  993. sub rsp,0x280+168
  994. and rsp,-32
  995. movaps XMMWORD[(-168)+r9],xmm6
  996. movaps XMMWORD[(-152)+r9],xmm7
  997. movaps XMMWORD[(-136)+r9],xmm8
  998. movaps XMMWORD[(-120)+r9],xmm9
  999. movaps XMMWORD[(-104)+r9],xmm10
  1000. movaps XMMWORD[(-88)+r9],xmm11
  1001. movaps XMMWORD[(-72)+r9],xmm12
  1002. movaps XMMWORD[(-56)+r9],xmm13
  1003. movaps XMMWORD[(-40)+r9],xmm14
  1004. movaps XMMWORD[(-24)+r9],xmm15
  1005. $L$8x_body:
  1006. vzeroupper
  1007. vbroadcasti128 ymm11,XMMWORD[$L$sigma]
  1008. vbroadcasti128 ymm3,XMMWORD[rcx]
  1009. vbroadcasti128 ymm15,XMMWORD[16+rcx]
  1010. vbroadcasti128 ymm7,XMMWORD[r8]
  1011. lea rcx,[256+rsp]
  1012. lea rax,[512+rsp]
  1013. lea r10,[$L$rot16]
  1014. lea r11,[$L$rot24]
  1015. vpshufd ymm8,ymm11,0x00
  1016. vpshufd ymm9,ymm11,0x55
  1017. vmovdqa YMMWORD[(128-256)+rcx],ymm8
  1018. vpshufd ymm10,ymm11,0xaa
  1019. vmovdqa YMMWORD[(160-256)+rcx],ymm9
  1020. vpshufd ymm11,ymm11,0xff
  1021. vmovdqa YMMWORD[(192-256)+rcx],ymm10
  1022. vmovdqa YMMWORD[(224-256)+rcx],ymm11
  1023. vpshufd ymm0,ymm3,0x00
  1024. vpshufd ymm1,ymm3,0x55
  1025. vmovdqa YMMWORD[(256-256)+rcx],ymm0
  1026. vpshufd ymm2,ymm3,0xaa
  1027. vmovdqa YMMWORD[(288-256)+rcx],ymm1
  1028. vpshufd ymm3,ymm3,0xff
  1029. vmovdqa YMMWORD[(320-256)+rcx],ymm2
  1030. vmovdqa YMMWORD[(352-256)+rcx],ymm3
  1031. vpshufd ymm12,ymm15,0x00
  1032. vpshufd ymm13,ymm15,0x55
  1033. vmovdqa YMMWORD[(384-512)+rax],ymm12
  1034. vpshufd ymm14,ymm15,0xaa
  1035. vmovdqa YMMWORD[(416-512)+rax],ymm13
  1036. vpshufd ymm15,ymm15,0xff
  1037. vmovdqa YMMWORD[(448-512)+rax],ymm14
  1038. vmovdqa YMMWORD[(480-512)+rax],ymm15
  1039. vpshufd ymm4,ymm7,0x00
  1040. vpshufd ymm5,ymm7,0x55
  1041. vpaddd ymm4,ymm4,YMMWORD[$L$incy]
  1042. vpshufd ymm6,ymm7,0xaa
  1043. vmovdqa YMMWORD[(544-512)+rax],ymm5
  1044. vpshufd ymm7,ymm7,0xff
  1045. vmovdqa YMMWORD[(576-512)+rax],ymm6
  1046. vmovdqa YMMWORD[(608-512)+rax],ymm7
  1047. jmp NEAR $L$oop_enter8x
  1048. ALIGN 32
  1049. $L$oop_outer8x:
  1050. vmovdqa ymm8,YMMWORD[((128-256))+rcx]
  1051. vmovdqa ymm9,YMMWORD[((160-256))+rcx]
  1052. vmovdqa ymm10,YMMWORD[((192-256))+rcx]
  1053. vmovdqa ymm11,YMMWORD[((224-256))+rcx]
  1054. vmovdqa ymm0,YMMWORD[((256-256))+rcx]
  1055. vmovdqa ymm1,YMMWORD[((288-256))+rcx]
  1056. vmovdqa ymm2,YMMWORD[((320-256))+rcx]
  1057. vmovdqa ymm3,YMMWORD[((352-256))+rcx]
  1058. vmovdqa ymm12,YMMWORD[((384-512))+rax]
  1059. vmovdqa ymm13,YMMWORD[((416-512))+rax]
  1060. vmovdqa ymm14,YMMWORD[((448-512))+rax]
  1061. vmovdqa ymm15,YMMWORD[((480-512))+rax]
  1062. vmovdqa ymm4,YMMWORD[((512-512))+rax]
  1063. vmovdqa ymm5,YMMWORD[((544-512))+rax]
  1064. vmovdqa ymm6,YMMWORD[((576-512))+rax]
  1065. vmovdqa ymm7,YMMWORD[((608-512))+rax]
  1066. vpaddd ymm4,ymm4,YMMWORD[$L$eight]
  1067. $L$oop_enter8x:
  1068. vmovdqa YMMWORD[64+rsp],ymm14
  1069. vmovdqa YMMWORD[96+rsp],ymm15
  1070. vbroadcasti128 ymm15,XMMWORD[r10]
  1071. vmovdqa YMMWORD[(512-512)+rax],ymm4
  1072. mov eax,10
  1073. jmp NEAR $L$oop8x
  1074. ALIGN 32
  1075. $L$oop8x:
  1076. vpaddd ymm8,ymm8,ymm0
  1077. vpxor ymm4,ymm8,ymm4
  1078. vpshufb ymm4,ymm4,ymm15
  1079. vpaddd ymm9,ymm9,ymm1
  1080. vpxor ymm5,ymm9,ymm5
  1081. vpshufb ymm5,ymm5,ymm15
  1082. vpaddd ymm12,ymm12,ymm4
  1083. vpxor ymm0,ymm12,ymm0
  1084. vpslld ymm14,ymm0,12
  1085. vpsrld ymm0,ymm0,20
  1086. vpor ymm0,ymm14,ymm0
  1087. vbroadcasti128 ymm14,XMMWORD[r11]
  1088. vpaddd ymm13,ymm13,ymm5
  1089. vpxor ymm1,ymm13,ymm1
  1090. vpslld ymm15,ymm1,12
  1091. vpsrld ymm1,ymm1,20
  1092. vpor ymm1,ymm15,ymm1
  1093. vpaddd ymm8,ymm8,ymm0
  1094. vpxor ymm4,ymm8,ymm4
  1095. vpshufb ymm4,ymm4,ymm14
  1096. vpaddd ymm9,ymm9,ymm1
  1097. vpxor ymm5,ymm9,ymm5
  1098. vpshufb ymm5,ymm5,ymm14
  1099. vpaddd ymm12,ymm12,ymm4
  1100. vpxor ymm0,ymm12,ymm0
  1101. vpslld ymm15,ymm0,7
  1102. vpsrld ymm0,ymm0,25
  1103. vpor ymm0,ymm15,ymm0
  1104. vbroadcasti128 ymm15,XMMWORD[r10]
  1105. vpaddd ymm13,ymm13,ymm5
  1106. vpxor ymm1,ymm13,ymm1
  1107. vpslld ymm14,ymm1,7
  1108. vpsrld ymm1,ymm1,25
  1109. vpor ymm1,ymm14,ymm1
  1110. vmovdqa YMMWORD[rsp],ymm12
  1111. vmovdqa YMMWORD[32+rsp],ymm13
  1112. vmovdqa ymm12,YMMWORD[64+rsp]
  1113. vmovdqa ymm13,YMMWORD[96+rsp]
  1114. vpaddd ymm10,ymm10,ymm2
  1115. vpxor ymm6,ymm10,ymm6
  1116. vpshufb ymm6,ymm6,ymm15
  1117. vpaddd ymm11,ymm11,ymm3
  1118. vpxor ymm7,ymm11,ymm7
  1119. vpshufb ymm7,ymm7,ymm15
  1120. vpaddd ymm12,ymm12,ymm6
  1121. vpxor ymm2,ymm12,ymm2
  1122. vpslld ymm14,ymm2,12
  1123. vpsrld ymm2,ymm2,20
  1124. vpor ymm2,ymm14,ymm2
  1125. vbroadcasti128 ymm14,XMMWORD[r11]
  1126. vpaddd ymm13,ymm13,ymm7
  1127. vpxor ymm3,ymm13,ymm3
  1128. vpslld ymm15,ymm3,12
  1129. vpsrld ymm3,ymm3,20
  1130. vpor ymm3,ymm15,ymm3
  1131. vpaddd ymm10,ymm10,ymm2
  1132. vpxor ymm6,ymm10,ymm6
  1133. vpshufb ymm6,ymm6,ymm14
  1134. vpaddd ymm11,ymm11,ymm3
  1135. vpxor ymm7,ymm11,ymm7
  1136. vpshufb ymm7,ymm7,ymm14
  1137. vpaddd ymm12,ymm12,ymm6
  1138. vpxor ymm2,ymm12,ymm2
  1139. vpslld ymm15,ymm2,7
  1140. vpsrld ymm2,ymm2,25
  1141. vpor ymm2,ymm15,ymm2
  1142. vbroadcasti128 ymm15,XMMWORD[r10]
  1143. vpaddd ymm13,ymm13,ymm7
  1144. vpxor ymm3,ymm13,ymm3
  1145. vpslld ymm14,ymm3,7
  1146. vpsrld ymm3,ymm3,25
  1147. vpor ymm3,ymm14,ymm3
  1148. vpaddd ymm8,ymm8,ymm1
  1149. vpxor ymm7,ymm8,ymm7
  1150. vpshufb ymm7,ymm7,ymm15
  1151. vpaddd ymm9,ymm9,ymm2
  1152. vpxor ymm4,ymm9,ymm4
  1153. vpshufb ymm4,ymm4,ymm15
  1154. vpaddd ymm12,ymm12,ymm7
  1155. vpxor ymm1,ymm12,ymm1
  1156. vpslld ymm14,ymm1,12
  1157. vpsrld ymm1,ymm1,20
  1158. vpor ymm1,ymm14,ymm1
  1159. vbroadcasti128 ymm14,XMMWORD[r11]
  1160. vpaddd ymm13,ymm13,ymm4
  1161. vpxor ymm2,ymm13,ymm2
  1162. vpslld ymm15,ymm2,12
  1163. vpsrld ymm2,ymm2,20
  1164. vpor ymm2,ymm15,ymm2
  1165. vpaddd ymm8,ymm8,ymm1
  1166. vpxor ymm7,ymm8,ymm7
  1167. vpshufb ymm7,ymm7,ymm14
  1168. vpaddd ymm9,ymm9,ymm2
  1169. vpxor ymm4,ymm9,ymm4
  1170. vpshufb ymm4,ymm4,ymm14
  1171. vpaddd ymm12,ymm12,ymm7
  1172. vpxor ymm1,ymm12,ymm1
  1173. vpslld ymm15,ymm1,7
  1174. vpsrld ymm1,ymm1,25
  1175. vpor ymm1,ymm15,ymm1
  1176. vbroadcasti128 ymm15,XMMWORD[r10]
  1177. vpaddd ymm13,ymm13,ymm4
  1178. vpxor ymm2,ymm13,ymm2
  1179. vpslld ymm14,ymm2,7
  1180. vpsrld ymm2,ymm2,25
  1181. vpor ymm2,ymm14,ymm2
  1182. vmovdqa YMMWORD[64+rsp],ymm12
  1183. vmovdqa YMMWORD[96+rsp],ymm13
  1184. vmovdqa ymm12,YMMWORD[rsp]
  1185. vmovdqa ymm13,YMMWORD[32+rsp]
  1186. vpaddd ymm10,ymm10,ymm3
  1187. vpxor ymm5,ymm10,ymm5
  1188. vpshufb ymm5,ymm5,ymm15
  1189. vpaddd ymm11,ymm11,ymm0
  1190. vpxor ymm6,ymm11,ymm6
  1191. vpshufb ymm6,ymm6,ymm15
  1192. vpaddd ymm12,ymm12,ymm5
  1193. vpxor ymm3,ymm12,ymm3
  1194. vpslld ymm14,ymm3,12
  1195. vpsrld ymm3,ymm3,20
  1196. vpor ymm3,ymm14,ymm3
  1197. vbroadcasti128 ymm14,XMMWORD[r11]
  1198. vpaddd ymm13,ymm13,ymm6
  1199. vpxor ymm0,ymm13,ymm0
  1200. vpslld ymm15,ymm0,12
  1201. vpsrld ymm0,ymm0,20
  1202. vpor ymm0,ymm15,ymm0
  1203. vpaddd ymm10,ymm10,ymm3
  1204. vpxor ymm5,ymm10,ymm5
  1205. vpshufb ymm5,ymm5,ymm14
  1206. vpaddd ymm11,ymm11,ymm0
  1207. vpxor ymm6,ymm11,ymm6
  1208. vpshufb ymm6,ymm6,ymm14
  1209. vpaddd ymm12,ymm12,ymm5
  1210. vpxor ymm3,ymm12,ymm3
  1211. vpslld ymm15,ymm3,7
  1212. vpsrld ymm3,ymm3,25
  1213. vpor ymm3,ymm15,ymm3
  1214. vbroadcasti128 ymm15,XMMWORD[r10]
  1215. vpaddd ymm13,ymm13,ymm6
  1216. vpxor ymm0,ymm13,ymm0
  1217. vpslld ymm14,ymm0,7
  1218. vpsrld ymm0,ymm0,25
  1219. vpor ymm0,ymm14,ymm0
  1220. dec eax
  1221. jnz NEAR $L$oop8x
  1222. lea rax,[512+rsp]
  1223. vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx]
  1224. vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx]
  1225. vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx]
  1226. vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx]
  1227. vpunpckldq ymm14,ymm8,ymm9
  1228. vpunpckldq ymm15,ymm10,ymm11
  1229. vpunpckhdq ymm8,ymm8,ymm9
  1230. vpunpckhdq ymm10,ymm10,ymm11
  1231. vpunpcklqdq ymm9,ymm14,ymm15
  1232. vpunpckhqdq ymm14,ymm14,ymm15
  1233. vpunpcklqdq ymm11,ymm8,ymm10
  1234. vpunpckhqdq ymm8,ymm8,ymm10
  1235. vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx]
  1236. vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx]
  1237. vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx]
  1238. vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx]
  1239. vpunpckldq ymm10,ymm0,ymm1
  1240. vpunpckldq ymm15,ymm2,ymm3
  1241. vpunpckhdq ymm0,ymm0,ymm1
  1242. vpunpckhdq ymm2,ymm2,ymm3
  1243. vpunpcklqdq ymm1,ymm10,ymm15
  1244. vpunpckhqdq ymm10,ymm10,ymm15
  1245. vpunpcklqdq ymm3,ymm0,ymm2
  1246. vpunpckhqdq ymm0,ymm0,ymm2
  1247. vperm2i128 ymm15,ymm9,ymm1,0x20
  1248. vperm2i128 ymm1,ymm9,ymm1,0x31
  1249. vperm2i128 ymm9,ymm14,ymm10,0x20
  1250. vperm2i128 ymm10,ymm14,ymm10,0x31
  1251. vperm2i128 ymm14,ymm11,ymm3,0x20
  1252. vperm2i128 ymm3,ymm11,ymm3,0x31
  1253. vperm2i128 ymm11,ymm8,ymm0,0x20
  1254. vperm2i128 ymm0,ymm8,ymm0,0x31
  1255. vmovdqa YMMWORD[rsp],ymm15
  1256. vmovdqa YMMWORD[32+rsp],ymm9
  1257. vmovdqa ymm15,YMMWORD[64+rsp]
  1258. vmovdqa ymm9,YMMWORD[96+rsp]
  1259. vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax]
  1260. vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax]
  1261. vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax]
  1262. vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax]
  1263. vpunpckldq ymm2,ymm12,ymm13
  1264. vpunpckldq ymm8,ymm15,ymm9
  1265. vpunpckhdq ymm12,ymm12,ymm13
  1266. vpunpckhdq ymm15,ymm15,ymm9
  1267. vpunpcklqdq ymm13,ymm2,ymm8
  1268. vpunpckhqdq ymm2,ymm2,ymm8
  1269. vpunpcklqdq ymm9,ymm12,ymm15
  1270. vpunpckhqdq ymm12,ymm12,ymm15
  1271. vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax]
  1272. vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax]
  1273. vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax]
  1274. vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax]
  1275. vpunpckldq ymm15,ymm4,ymm5
  1276. vpunpckldq ymm8,ymm6,ymm7
  1277. vpunpckhdq ymm4,ymm4,ymm5
  1278. vpunpckhdq ymm6,ymm6,ymm7
  1279. vpunpcklqdq ymm5,ymm15,ymm8
  1280. vpunpckhqdq ymm15,ymm15,ymm8
  1281. vpunpcklqdq ymm7,ymm4,ymm6
  1282. vpunpckhqdq ymm4,ymm4,ymm6
  1283. vperm2i128 ymm8,ymm13,ymm5,0x20
  1284. vperm2i128 ymm5,ymm13,ymm5,0x31
  1285. vperm2i128 ymm13,ymm2,ymm15,0x20
  1286. vperm2i128 ymm15,ymm2,ymm15,0x31
  1287. vperm2i128 ymm2,ymm9,ymm7,0x20
  1288. vperm2i128 ymm7,ymm9,ymm7,0x31
  1289. vperm2i128 ymm9,ymm12,ymm4,0x20
  1290. vperm2i128 ymm4,ymm12,ymm4,0x31
  1291. vmovdqa ymm6,YMMWORD[rsp]
  1292. vmovdqa ymm12,YMMWORD[32+rsp]
  1293. cmp rdx,64*8
  1294. jb NEAR $L$tail8x
  1295. vpxor ymm6,ymm6,YMMWORD[rsi]
  1296. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1297. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1298. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1299. lea rsi,[128+rsi]
  1300. vmovdqu YMMWORD[rdi],ymm6
  1301. vmovdqu YMMWORD[32+rdi],ymm8
  1302. vmovdqu YMMWORD[64+rdi],ymm1
  1303. vmovdqu YMMWORD[96+rdi],ymm5
  1304. lea rdi,[128+rdi]
  1305. vpxor ymm12,ymm12,YMMWORD[rsi]
  1306. vpxor ymm13,ymm13,YMMWORD[32+rsi]
  1307. vpxor ymm10,ymm10,YMMWORD[64+rsi]
  1308. vpxor ymm15,ymm15,YMMWORD[96+rsi]
  1309. lea rsi,[128+rsi]
  1310. vmovdqu YMMWORD[rdi],ymm12
  1311. vmovdqu YMMWORD[32+rdi],ymm13
  1312. vmovdqu YMMWORD[64+rdi],ymm10
  1313. vmovdqu YMMWORD[96+rdi],ymm15
  1314. lea rdi,[128+rdi]
  1315. vpxor ymm14,ymm14,YMMWORD[rsi]
  1316. vpxor ymm2,ymm2,YMMWORD[32+rsi]
  1317. vpxor ymm3,ymm3,YMMWORD[64+rsi]
  1318. vpxor ymm7,ymm7,YMMWORD[96+rsi]
  1319. lea rsi,[128+rsi]
  1320. vmovdqu YMMWORD[rdi],ymm14
  1321. vmovdqu YMMWORD[32+rdi],ymm2
  1322. vmovdqu YMMWORD[64+rdi],ymm3
  1323. vmovdqu YMMWORD[96+rdi],ymm7
  1324. lea rdi,[128+rdi]
  1325. vpxor ymm11,ymm11,YMMWORD[rsi]
  1326. vpxor ymm9,ymm9,YMMWORD[32+rsi]
  1327. vpxor ymm0,ymm0,YMMWORD[64+rsi]
  1328. vpxor ymm4,ymm4,YMMWORD[96+rsi]
  1329. lea rsi,[128+rsi]
  1330. vmovdqu YMMWORD[rdi],ymm11
  1331. vmovdqu YMMWORD[32+rdi],ymm9
  1332. vmovdqu YMMWORD[64+rdi],ymm0
  1333. vmovdqu YMMWORD[96+rdi],ymm4
  1334. lea rdi,[128+rdi]
  1335. sub rdx,64*8
  1336. jnz NEAR $L$oop_outer8x
  1337. jmp NEAR $L$done8x
  1338. $L$tail8x:
  1339. cmp rdx,448
  1340. jae NEAR $L$448_or_more8x
  1341. cmp rdx,384
  1342. jae NEAR $L$384_or_more8x
  1343. cmp rdx,320
  1344. jae NEAR $L$320_or_more8x
  1345. cmp rdx,256
  1346. jae NEAR $L$256_or_more8x
  1347. cmp rdx,192
  1348. jae NEAR $L$192_or_more8x
  1349. cmp rdx,128
  1350. jae NEAR $L$128_or_more8x
  1351. cmp rdx,64
  1352. jae NEAR $L$64_or_more8x
  1353. xor r10,r10
  1354. vmovdqa YMMWORD[rsp],ymm6
  1355. vmovdqa YMMWORD[32+rsp],ymm8
  1356. jmp NEAR $L$oop_tail8x
  1357. ALIGN 32
  1358. $L$64_or_more8x:
  1359. vpxor ymm6,ymm6,YMMWORD[rsi]
  1360. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1361. vmovdqu YMMWORD[rdi],ymm6
  1362. vmovdqu YMMWORD[32+rdi],ymm8
  1363. je NEAR $L$done8x
  1364. lea rsi,[64+rsi]
  1365. xor r10,r10
  1366. vmovdqa YMMWORD[rsp],ymm1
  1367. lea rdi,[64+rdi]
  1368. sub rdx,64
  1369. vmovdqa YMMWORD[32+rsp],ymm5
  1370. jmp NEAR $L$oop_tail8x
  1371. ALIGN 32
  1372. $L$128_or_more8x:
  1373. vpxor ymm6,ymm6,YMMWORD[rsi]
  1374. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1375. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1376. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1377. vmovdqu YMMWORD[rdi],ymm6
  1378. vmovdqu YMMWORD[32+rdi],ymm8
  1379. vmovdqu YMMWORD[64+rdi],ymm1
  1380. vmovdqu YMMWORD[96+rdi],ymm5
  1381. je NEAR $L$done8x
  1382. lea rsi,[128+rsi]
  1383. xor r10,r10
  1384. vmovdqa YMMWORD[rsp],ymm12
  1385. lea rdi,[128+rdi]
  1386. sub rdx,128
  1387. vmovdqa YMMWORD[32+rsp],ymm13
  1388. jmp NEAR $L$oop_tail8x
  1389. ALIGN 32
  1390. $L$192_or_more8x:
  1391. vpxor ymm6,ymm6,YMMWORD[rsi]
  1392. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1393. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1394. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1395. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1396. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1397. vmovdqu YMMWORD[rdi],ymm6
  1398. vmovdqu YMMWORD[32+rdi],ymm8
  1399. vmovdqu YMMWORD[64+rdi],ymm1
  1400. vmovdqu YMMWORD[96+rdi],ymm5
  1401. vmovdqu YMMWORD[128+rdi],ymm12
  1402. vmovdqu YMMWORD[160+rdi],ymm13
  1403. je NEAR $L$done8x
  1404. lea rsi,[192+rsi]
  1405. xor r10,r10
  1406. vmovdqa YMMWORD[rsp],ymm10
  1407. lea rdi,[192+rdi]
  1408. sub rdx,192
  1409. vmovdqa YMMWORD[32+rsp],ymm15
  1410. jmp NEAR $L$oop_tail8x
  1411. ALIGN 32
  1412. $L$256_or_more8x:
  1413. vpxor ymm6,ymm6,YMMWORD[rsi]
  1414. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1415. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1416. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1417. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1418. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1419. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1420. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1421. vmovdqu YMMWORD[rdi],ymm6
  1422. vmovdqu YMMWORD[32+rdi],ymm8
  1423. vmovdqu YMMWORD[64+rdi],ymm1
  1424. vmovdqu YMMWORD[96+rdi],ymm5
  1425. vmovdqu YMMWORD[128+rdi],ymm12
  1426. vmovdqu YMMWORD[160+rdi],ymm13
  1427. vmovdqu YMMWORD[192+rdi],ymm10
  1428. vmovdqu YMMWORD[224+rdi],ymm15
  1429. je NEAR $L$done8x
  1430. lea rsi,[256+rsi]
  1431. xor r10,r10
  1432. vmovdqa YMMWORD[rsp],ymm14
  1433. lea rdi,[256+rdi]
  1434. sub rdx,256
  1435. vmovdqa YMMWORD[32+rsp],ymm2
  1436. jmp NEAR $L$oop_tail8x
  1437. ALIGN 32
  1438. $L$320_or_more8x:
  1439. vpxor ymm6,ymm6,YMMWORD[rsi]
  1440. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1441. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1442. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1443. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1444. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1445. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1446. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1447. vpxor ymm14,ymm14,YMMWORD[256+rsi]
  1448. vpxor ymm2,ymm2,YMMWORD[288+rsi]
  1449. vmovdqu YMMWORD[rdi],ymm6
  1450. vmovdqu YMMWORD[32+rdi],ymm8
  1451. vmovdqu YMMWORD[64+rdi],ymm1
  1452. vmovdqu YMMWORD[96+rdi],ymm5
  1453. vmovdqu YMMWORD[128+rdi],ymm12
  1454. vmovdqu YMMWORD[160+rdi],ymm13
  1455. vmovdqu YMMWORD[192+rdi],ymm10
  1456. vmovdqu YMMWORD[224+rdi],ymm15
  1457. vmovdqu YMMWORD[256+rdi],ymm14
  1458. vmovdqu YMMWORD[288+rdi],ymm2
  1459. je NEAR $L$done8x
  1460. lea rsi,[320+rsi]
  1461. xor r10,r10
  1462. vmovdqa YMMWORD[rsp],ymm3
  1463. lea rdi,[320+rdi]
  1464. sub rdx,320
  1465. vmovdqa YMMWORD[32+rsp],ymm7
  1466. jmp NEAR $L$oop_tail8x
  1467. ALIGN 32
  1468. $L$384_or_more8x:
  1469. vpxor ymm6,ymm6,YMMWORD[rsi]
  1470. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1471. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1472. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1473. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1474. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1475. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1476. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1477. vpxor ymm14,ymm14,YMMWORD[256+rsi]
  1478. vpxor ymm2,ymm2,YMMWORD[288+rsi]
  1479. vpxor ymm3,ymm3,YMMWORD[320+rsi]
  1480. vpxor ymm7,ymm7,YMMWORD[352+rsi]
  1481. vmovdqu YMMWORD[rdi],ymm6
  1482. vmovdqu YMMWORD[32+rdi],ymm8
  1483. vmovdqu YMMWORD[64+rdi],ymm1
  1484. vmovdqu YMMWORD[96+rdi],ymm5
  1485. vmovdqu YMMWORD[128+rdi],ymm12
  1486. vmovdqu YMMWORD[160+rdi],ymm13
  1487. vmovdqu YMMWORD[192+rdi],ymm10
  1488. vmovdqu YMMWORD[224+rdi],ymm15
  1489. vmovdqu YMMWORD[256+rdi],ymm14
  1490. vmovdqu YMMWORD[288+rdi],ymm2
  1491. vmovdqu YMMWORD[320+rdi],ymm3
  1492. vmovdqu YMMWORD[352+rdi],ymm7
  1493. je NEAR $L$done8x
  1494. lea rsi,[384+rsi]
  1495. xor r10,r10
  1496. vmovdqa YMMWORD[rsp],ymm11
  1497. lea rdi,[384+rdi]
  1498. sub rdx,384
  1499. vmovdqa YMMWORD[32+rsp],ymm9
  1500. jmp NEAR $L$oop_tail8x
  1501. ALIGN 32
  1502. $L$448_or_more8x:
  1503. vpxor ymm6,ymm6,YMMWORD[rsi]
  1504. vpxor ymm8,ymm8,YMMWORD[32+rsi]
  1505. vpxor ymm1,ymm1,YMMWORD[64+rsi]
  1506. vpxor ymm5,ymm5,YMMWORD[96+rsi]
  1507. vpxor ymm12,ymm12,YMMWORD[128+rsi]
  1508. vpxor ymm13,ymm13,YMMWORD[160+rsi]
  1509. vpxor ymm10,ymm10,YMMWORD[192+rsi]
  1510. vpxor ymm15,ymm15,YMMWORD[224+rsi]
  1511. vpxor ymm14,ymm14,YMMWORD[256+rsi]
  1512. vpxor ymm2,ymm2,YMMWORD[288+rsi]
  1513. vpxor ymm3,ymm3,YMMWORD[320+rsi]
  1514. vpxor ymm7,ymm7,YMMWORD[352+rsi]
  1515. vpxor ymm11,ymm11,YMMWORD[384+rsi]
  1516. vpxor ymm9,ymm9,YMMWORD[416+rsi]
  1517. vmovdqu YMMWORD[rdi],ymm6
  1518. vmovdqu YMMWORD[32+rdi],ymm8
  1519. vmovdqu YMMWORD[64+rdi],ymm1
  1520. vmovdqu YMMWORD[96+rdi],ymm5
  1521. vmovdqu YMMWORD[128+rdi],ymm12
  1522. vmovdqu YMMWORD[160+rdi],ymm13
  1523. vmovdqu YMMWORD[192+rdi],ymm10
  1524. vmovdqu YMMWORD[224+rdi],ymm15
  1525. vmovdqu YMMWORD[256+rdi],ymm14
  1526. vmovdqu YMMWORD[288+rdi],ymm2
  1527. vmovdqu YMMWORD[320+rdi],ymm3
  1528. vmovdqu YMMWORD[352+rdi],ymm7
  1529. vmovdqu YMMWORD[384+rdi],ymm11
  1530. vmovdqu YMMWORD[416+rdi],ymm9
  1531. je NEAR $L$done8x
  1532. lea rsi,[448+rsi]
  1533. xor r10,r10
  1534. vmovdqa YMMWORD[rsp],ymm0
  1535. lea rdi,[448+rdi]
  1536. sub rdx,448
  1537. vmovdqa YMMWORD[32+rsp],ymm4
  1538. $L$oop_tail8x:
  1539. movzx eax,BYTE[r10*1+rsi]
  1540. movzx ecx,BYTE[r10*1+rsp]
  1541. lea r10,[1+r10]
  1542. xor eax,ecx
  1543. mov BYTE[((-1))+r10*1+rdi],al
  1544. dec rdx
  1545. jnz NEAR $L$oop_tail8x
  1546. $L$done8x:
  1547. vzeroall
  1548. movaps xmm6,XMMWORD[((-168))+r9]
  1549. movaps xmm7,XMMWORD[((-152))+r9]
  1550. movaps xmm8,XMMWORD[((-136))+r9]
  1551. movaps xmm9,XMMWORD[((-120))+r9]
  1552. movaps xmm10,XMMWORD[((-104))+r9]
  1553. movaps xmm11,XMMWORD[((-88))+r9]
  1554. movaps xmm12,XMMWORD[((-72))+r9]
  1555. movaps xmm13,XMMWORD[((-56))+r9]
  1556. movaps xmm14,XMMWORD[((-40))+r9]
  1557. movaps xmm15,XMMWORD[((-24))+r9]
  1558. lea rsp,[r9]
  1559. $L$8x_epilogue:
  1560. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1561. mov rsi,QWORD[16+rsp]
  1562. DB 0F3h,0C3h ;repret
  1563. $L$SEH_end_ChaCha20_8x:
  1564. EXTERN __imp_RtlVirtualUnwind ; import pointer, called indirectly below
  1565. ALIGN 16
  ; se_handler -- exception handler named by $L$SEH_info_ChaCha20_ctr32.
  ; Inputs read here: r8 = pointer to a context record, r9 = pointer to a
  ; dispatcher record. Returns eax = 1.
  ; It recovers the interrupted function's entry rsp and callee-saved
  ; registers into the context record, then calls RtlVirtualUnwind.
  ; NOTE(review): the field names in the comments below (Rax, Rsp, Rip, ...)
  ; are the Win64 CONTEXT / DISPATCHER_CONTEXT members at those offsets;
  ; confirm against winnt.h.
  ; $L$common_seh_tail is also the jump target of ssse3_handler and
  ; full_handler, whose prologues push exactly the same registers, so the
  ; epilogue at the bottom balances for all three.
  1566. se_handler:
  1567. push rsi
  1568. push rdi
  1569. push rbx
  1570. push rbp
  1571. push r12
  1572. push r13
  1573. push r14
  1574. push r15
  1575. pushfq
  1576. sub rsp,64 ; scratch area; offsets 32..56 receive the stack arguments of the call below
  1577. mov rax,QWORD[120+r8] ; context->Rax (ChaCha20_ctr32 copies its entry rsp into rax before the body)
  1578. mov rbx,QWORD[248+r8] ; context->Rip
  1579. mov rsi,QWORD[8+r9] ; disp->ImageBase; not read before rsi is overwritten in this handler
  1580. mov r11,QWORD[56+r9] ; disp->HandlerData; not read before r11 is overwritten in this handler
  1581. lea r10,[$L$ctr32_body]
  1582. cmp rbx,r10 ; Rip below the body label: prologue not finished
  1583. jb NEAR $L$common_seh_tail ; -> use context->Rax as the frame base
  1584. mov rax,QWORD[152+r8] ; context->Rsp
  1585. lea r10,[$L$no_data]
  1586. cmp rbx,r10 ; Rip at or past $L$no_data: frame already released
  1587. jae NEAR $L$common_seh_tail ; -> use context->Rsp as the frame base
  1588. lea rax,[((64+24+48))+rax] ; skip the 64+24 bytes of locals and the six 8-byte pushes
  1589. mov rbx,QWORD[((-8))+rax] ; reload the pushed registers in push order: rbx first ...
  1590. mov rbp,QWORD[((-16))+rax]
  1591. mov r12,QWORD[((-24))+rax]
  1592. mov r13,QWORD[((-32))+rax]
  1593. mov r14,QWORD[((-40))+rax]
  1594. mov r15,QWORD[((-48))+rax] ; ... r15 last
  1595. mov QWORD[144+r8],rbx ; context->Rbx
  1596. mov QWORD[160+r8],rbp ; context->Rbp
  1597. mov QWORD[216+r8],r12 ; context->R12
  1598. mov QWORD[224+r8],r13 ; context->R13
  1599. mov QWORD[232+r8],r14 ; context->R14
  1600. mov QWORD[240+r8],r15 ; context->R15
  ; Shared tail. On entry rax = the interrupted function's rsp at entry,
  ; r8 = context record, r9 = dispatcher record.
  1601. $L$common_seh_tail:
  1602. mov rdi,QWORD[8+rax] ; rdi saved at 8+rsp by the WIN64 prologue
  1603. mov rsi,QWORD[16+rax] ; rsi saved at 16+rsp by the WIN64 prologue
  1604. mov QWORD[152+r8],rax ; context->Rsp
  1605. mov QWORD[168+r8],rsi ; context->Rsi
  1606. mov QWORD[176+r8],rdi ; context->Rdi
  1607. mov rdi,QWORD[40+r9] ; destination: disp->ContextRecord
  1608. mov rsi,r8 ; source: the context record updated above
  1609. mov ecx,154 ; 154 qwords = 1232 bytes
  1610. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld ; rep movsq
  1611. mov rsi,r9 ; rsi = dispatcher record for the argument loads below
  1612. xor rcx,rcx ; arg1 = 0 (handler type)
  1613. mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
  1614. mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
  1615. mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
  1616. mov r10,QWORD[40+rsi] ; disp->ContextRecord
  1617. lea r11,[56+rsi] ; &disp->HandlerData
  1618. lea r12,[24+rsi] ; &disp->EstablisherFrame
  1619. mov QWORD[32+rsp],r10 ; arg5
  1620. mov QWORD[40+rsp],r11 ; arg6
  1621. mov QWORD[48+rsp],r12 ; arg7
  1622. mov QWORD[56+rsp],rcx ; arg8 = 0
  1623. call QWORD[__imp_RtlVirtualUnwind]
  1624. mov eax,1 ; return value 1 (ExceptionContinueSearch in the Win64 EXCEPTION_DISPOSITION enum)
  1625. add rsp,64
  1626. popfq
  1627. pop r15
  1628. pop r14
  1629. pop r13
  1630. pop r12
  1631. pop rbp
  1632. pop rbx
  1633. pop rdi
  1634. pop rsi
  1635. DB 0F3h,0C3h ;repret
  1636. ALIGN 16
  ; ssse3_handler -- exception handler named by $L$SEH_info_ChaCha20_ssse3.
  ; Inputs read here: r8 = pointer to a context record, r9 = pointer to a
  ; dispatcher record. The handler data at [56+r9] is read as two 32-bit
  ; image-relative offsets (the .xdata entry supplies $L$ssse3_body and
  ; $L$ssse3_epilogue). When the faulting Rip lies in [body, epilogue), four
  ; qwords starting 40 bytes below the frame base are copied to offset 512
  ; of the context record. Every path ends in $L$common_seh_tail inside
  ; se_handler, which pops what is pushed here and returns.
  ; NOTE(review): field names in the comments follow the Win64 CONTEXT /
  ; DISPATCHER_CONTEXT layouts; confirm against winnt.h.
  1637. ssse3_handler:
  1638. push rsi
  1639. push rdi
  1640. push rbx
  1641. push rbp
  1642. push r12
  1643. push r13
  1644. push r14
  1645. push r15
  1646. pushfq
  1647. sub rsp,64 ; same frame shape as se_handler, required by the shared tail
  1648. mov rax,QWORD[120+r8] ; context->Rax
  1649. mov rbx,QWORD[248+r8] ; context->Rip
  1650. mov rsi,QWORD[8+r9] ; disp->ImageBase
  1651. mov r11,QWORD[56+r9] ; disp->HandlerData
  1652. mov r10d,DWORD[r11] ; HandlerData[0]: image-relative offset of the body label
  1653. lea r10,[r10*1+rsi] ; absolute address = ImageBase + offset
  1654. cmp rbx,r10 ; Rip below the body label: prologue not finished
  1655. jb NEAR $L$common_seh_tail ; -> frame base is context->Rax
  1656. mov rax,QWORD[192+r8] ; context->R9 (presumably the function keeps its entry rsp in r9, as the 8x epilogue in this file does -- confirm)
  1657. mov r10d,DWORD[4+r11] ; HandlerData[1]: image-relative offset of the epilogue label
  1658. lea r10,[r10*1+rsi] ; absolute address = ImageBase + offset
  1659. cmp rbx,r10 ; Rip at or past the epilogue label
  1660. jae NEAR $L$common_seh_tail ; -> nothing further to copy here
  1661. lea rsi,[((-40))+rax] ; source: 40 bytes below the frame base
  1662. lea rdi,[512+r8] ; destination: context->Xmm6
  1663. mov ecx,4 ; 4 qwords = 32 bytes = two 16-byte registers
  1664. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld ; rep movsq
  1665. jmp NEAR $L$common_seh_tail
  1666. ALIGN 16
  ; full_handler -- exception handler named by $L$SEH_info_ChaCha20_4x and
  ; $L$SEH_info_ChaCha20_8x.
  ; Inputs read here: r8 = pointer to a context record, r9 = pointer to a
  ; dispatcher record. The handler data at [56+r9] is read as two 32-bit
  ; image-relative offsets (body label, epilogue label). When the faulting
  ; Rip lies in [body, epilogue), twenty qwords starting 168 bytes below the
  ; frame base are copied to offset 512 of the context record; that is the
  ; same area the 8x epilogue reloads xmm6..xmm15 from (-168+r9 .. -24+r9).
  ; Every path ends in $L$common_seh_tail inside se_handler, which pops what
  ; is pushed here and returns.
  ; NOTE(review): field names in the comments follow the Win64 CONTEXT /
  ; DISPATCHER_CONTEXT layouts; confirm against winnt.h.
  1667. full_handler:
  1668. push rsi
  1669. push rdi
  1670. push rbx
  1671. push rbp
  1672. push r12
  1673. push r13
  1674. push r14
  1675. push r15
  1676. pushfq
  1677. sub rsp,64 ; same frame shape as se_handler, required by the shared tail
  1678. mov rax,QWORD[120+r8] ; context->Rax
  1679. mov rbx,QWORD[248+r8] ; context->Rip
  1680. mov rsi,QWORD[8+r9] ; disp->ImageBase
  1681. mov r11,QWORD[56+r9] ; disp->HandlerData
  1682. mov r10d,DWORD[r11] ; HandlerData[0]: image-relative offset of the body label
  1683. lea r10,[r10*1+rsi] ; absolute address = ImageBase + offset
  1684. cmp rbx,r10 ; Rip below the body label: prologue not finished
  1685. jb NEAR $L$common_seh_tail ; -> frame base is context->Rax
  1686. mov rax,QWORD[192+r8] ; context->R9 (the 8x epilogue restores rsp from r9)
  1687. mov r10d,DWORD[4+r11] ; HandlerData[1]: image-relative offset of the epilogue label
  1688. lea r10,[r10*1+rsi] ; absolute address = ImageBase + offset
  1689. cmp rbx,r10 ; Rip at or past the epilogue label
  1690. jae NEAR $L$common_seh_tail ; -> nothing further to copy here
  1691. lea rsi,[((-168))+rax] ; source: 168 bytes below the frame base
  1692. lea rdi,[512+r8] ; destination: context->Xmm6
  1693. mov ecx,20 ; 20 qwords = 160 bytes = ten 16-byte registers
  1694. DD 0xa548f3fc ; bytes FC F3 48 A5 = cld ; rep movsq
  1695. jmp NEAR $L$common_seh_tail
  ; Function table: one record of three image-relative dwords per function
  ; (start label, end label, label of its unwind-info record in .xdata).
  ; NOTE(review): this is the Win64 RUNTIME_FUNCTION layout; confirm against
  ; the Microsoft x64 exception-handling documentation.
  1696. section .pdata rdata align=4
  1697. ALIGN 4
  1698. DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase ; ChaCha20_ctr32: start
  1699. DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase ; end
  1700. DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase ; unwind info
  1701. DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase ; ChaCha20_ssse3: start
  1702. DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase ; end
  1703. DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase ; unwind info
  1704. DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase ; ChaCha20_4x: start
  1705. DD $L$SEH_end_ChaCha20_4x wrt ..imagebase ; end
  1706. DD $L$SEH_info_ChaCha20_4x wrt ..imagebase ; unwind info
  1707. DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase ; ChaCha20_8x: start
  1708. DD $L$SEH_end_ChaCha20_8x wrt ..imagebase ; end
  1709. DD $L$SEH_info_ChaCha20_8x wrt ..imagebase ; unwind info
  ; Unwind-info records referenced from .pdata. Each record is a 4-byte
  ; header, the image-relative address of its handler, and (for all but the
  ; first) two image-relative labels that the handler reads through the
  ; dispatcher record's handler-data pointer as [body, epilogue) bounds.
  ; NOTE(review): header byte 9 presumably encodes UNWIND_INFO version 1 with
  ; UNW_FLAG_EHANDLER, followed by zero prolog size / code count / frame
  ; register; confirm against the Microsoft x64 exception-handling docs.
  1710. section .xdata rdata align=8
  1711. ALIGN 8
  1712. $L$SEH_info_ChaCha20_ctr32:
  1713. DB 9,0,0,0
  1714. DD se_handler wrt ..imagebase ; no handler data: se_handler compares against $L$ctr32_body / $L$no_data directly
  1715. $L$SEH_info_ChaCha20_ssse3:
  1716. DB 9,0,0,0
  1717. DD ssse3_handler wrt ..imagebase
  1718. DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase ; handler data: body, epilogue
  1719. $L$SEH_info_ChaCha20_4x:
  1720. DB 9,0,0,0
  1721. DD full_handler wrt ..imagebase
  1722. DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase ; handler data: body, epilogue
  1723. $L$SEH_info_ChaCha20_8x:
  1724. DB 9,0,0,0
  1725. DD full_handler wrt ..imagebase
  1726. DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase ; handler data: body, epilogue