x86_64-mont.asm 23 KB


  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. %ifdef BORINGSSL_PREFIX
  8. %include "boringssl_prefix_symbols_nasm.inc"
  9. %endif
  10. section .text code align=64
  11. EXTERN OPENSSL_ia32cap_P
  12. global bn_mul_mont
  13. ALIGN 16
  ; ---------------------------------------------------------------------
  ; bn_mul_mont -- exported Win64 entry point; dispatches to a specialised
  ; path or runs the generic one-word-at-a-time loop below.
  ;
  ; Win64 arguments arrive in rcx, rdx, r8, r9, [40+rsp], [48+rsp] and are
  ; moved into rdi, rsi, rdx, rcx, r8, r9.  As used by the code below:
  ;   rdi = result pointer (written in $L$sub / $L$copy)
  ;   rsi = first operand words ("a")
  ;   rdx = second operand words ("b"), kept in r12
  ;   rcx = words subtracted from the result at the end ("n")
  ;   r8  = pointer to a single word that is loaded and used as the
  ;         per-iteration multiplier seed ("n0")
  ;   r9  = word count ("num"), zero-extended from its low 32 bits
  ; NOTE(review): presumably the C prototype is
  ;   bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the header.
  ; Returns rax = 1.  rdi/rsi are preserved via the caller's home slots,
  ; rbx/rbp/r12-r15 via push and reload from the saved stack pointer.
  ; ---------------------------------------------------------------------
  14. bn_mul_mont:
  15. mov QWORD[8+rsp],rdi ;WIN64 prologue
  16. mov QWORD[16+rsp],rsi
  17. mov rax,rsp
  18. $L$SEH_begin_bn_mul_mont:
  19. mov rdi,rcx
  20. mov rsi,rdx
  21. mov rdx,r8
  22. mov rcx,r9
  23. mov r8,QWORD[40+rsp]
  24. mov r9,QWORD[48+rsp]
  25. mov r9d,r9d ; 32-bit write clears the upper half of r9
  26. mov rax,rsp ; rax = entry rsp, stored in the frame later
  ; Dispatch: num not a multiple of 4, or num < 8 -> generic path.
  27. test r9d,3
  28. jnz NEAR $L$mul_enter
  29. cmp r9d,8
  30. jb NEAR $L$mul_enter
  ; r11d = dword at OPENSSL_ia32cap_P+8; tested at $L$mul4x_enter.
  31. lea r11,[OPENSSL_ia32cap_P]
  32. mov r11d,DWORD[8+r11]
  ; Different operand pointers -> 4x multiply; same pointer and num a
  ; multiple of 8 -> squaring path; otherwise 4x multiply.
  33. cmp rdx,rsi
  34. jne NEAR $L$mul4x_enter
  35. test r9d,7
  36. jz NEAR $L$sqr8x_enter
  37. jmp NEAR $L$mul4x_enter
  38. ALIGN 16
  39. $L$mul_enter:
  40. push rbx
  41. push rbp
  42. push r12
  43. push r13
  44. push r14
  45. push r15
  ; Frame: r10 = (rsp - 8*num - 16) rounded down to a 1024-byte boundary.
  46. neg r9
  47. mov r11,rsp
  48. lea r10,[((-16))+r9*8+rsp]
  49. neg r9
  50. and r10,-1024
  ; Lower rsp to r10 one 4096-byte page at a time, reading a word from
  ; each page on the way down (stack probe).
  51. sub r11,r10
  52. and r11,-4096
  53. lea rsp,[r11*1+r10]
  54. mov r11,QWORD[rsp]
  55. cmp rsp,r10
  56. ja NEAR $L$mul_page_walk
  57. jmp NEAR $L$mul_page_walk_done
  58. ALIGN 16
  59. $L$mul_page_walk:
  60. lea rsp,[((-4096))+rsp]
  61. mov r11,QWORD[rsp]
  62. cmp rsp,r10
  63. ja NEAR $L$mul_page_walk
  64. $L$mul_page_walk_done:
  ; Entry rsp is kept at [8+num*8+rsp]; mul_handler reads the same slot.
  65. mov QWORD[8+r9*8+rsp],rax
  66. $L$mul_body:
  ; Register roles from here on:
  ;   r12 = b pointer, r8 = n0 value, rbx = b[i], rbp = per-row multiplier,
  ;   r14 = outer index i, r15 = inner index j,
  ;   [rsp .. rsp+8*num] = temporary vector of num+1 words.
  67. mov r12,rdx
  68. mov r8,QWORD[r8]
  69. mov rbx,QWORD[r12]
  70. mov rax,QWORD[rsi]
  71. xor r14,r14
  72. xor r15,r15
  73. mov rbp,r8
  74. mul rbx ; rdx:rax = a[0]*b[0]
  75. mov r10,rax
  76. mov rax,QWORD[rcx]
  77. imul rbp,r10 ; rbp = n0 * low word
  78. mov r11,rdx
  79. mul rbp ; rdx:rax = n[0]*rbp
  80. add r10,rax ; only the carry out of this sum is used
  81. mov rax,QWORD[8+rsi]
  82. adc rdx,0
  83. mov r13,rdx
  84. lea r15,[1+r15]
  85. jmp NEAR $L$1st_enter
  86. ALIGN 16
  ; First row (i = 0): the temporary vector is written, not accumulated.
  87. $L$1st:
  88. add r13,rax
  89. mov rax,QWORD[r15*8+rsi]
  90. adc rdx,0
  91. add r13,r11
  92. mov r11,r10
  93. adc rdx,0
  94. mov QWORD[((-16))+r15*8+rsp],r13
  95. mov r13,rdx
  96. $L$1st_enter:
  97. mul rbx ; a[j]*b[0]
  98. add r11,rax
  99. mov rax,QWORD[r15*8+rcx]
  100. adc rdx,0
  101. lea r15,[1+r15]
  102. mov r10,rdx
  103. mul rbp ; n[j]*rbp
  104. cmp r15,r9
  105. jne NEAR $L$1st
  ; Row tail: store the last word and the two top words of the row.
  106. add r13,rax
  107. mov rax,QWORD[rsi]
  108. adc rdx,0
  109. add r13,r11
  110. adc rdx,0
  111. mov QWORD[((-16))+r15*8+rsp],r13
  112. mov r13,rdx
  113. mov r11,r10
  114. xor rdx,rdx
  115. add r13,r11
  116. adc rdx,0
  117. mov QWORD[((-8))+r9*8+rsp],r13
  118. mov QWORD[r9*8+rsp],rdx
  119. lea r14,[1+r14]
  120. jmp NEAR $L$outer
  121. ALIGN 16
  ; Rows i = 1 .. num-1: same shape as the first row, but each step also
  ; adds the word already held in the temporary vector.
  122. $L$outer:
  123. mov rbx,QWORD[r14*8+r12]
  124. xor r15,r15
  125. mov rbp,r8
  126. mov r10,QWORD[rsp]
  127. mul rbx
  128. add r10,rax
  129. mov rax,QWORD[rcx]
  130. adc rdx,0
  131. imul rbp,r10
  132. mov r11,rdx
  133. mul rbp
  134. add r10,rax
  135. mov rax,QWORD[8+rsi]
  136. adc rdx,0
  137. mov r10,QWORD[8+rsp]
  138. mov r13,rdx
  139. lea r15,[1+r15]
  140. jmp NEAR $L$inner_enter
  141. ALIGN 16
  142. $L$inner:
  143. add r13,rax
  144. mov rax,QWORD[r15*8+rsi]
  145. adc rdx,0
  146. add r13,r10
  147. mov r10,QWORD[r15*8+rsp]
  148. adc rdx,0
  149. mov QWORD[((-16))+r15*8+rsp],r13
  150. mov r13,rdx
  151. $L$inner_enter:
  152. mul rbx
  153. add r11,rax
  154. mov rax,QWORD[r15*8+rcx]
  155. adc rdx,0
  156. add r10,r11
  157. mov r11,rdx
  158. adc r11,0
  159. lea r15,[1+r15]
  160. mul rbp
  161. cmp r15,r9
  162. jne NEAR $L$inner
  163. add r13,rax
  164. mov rax,QWORD[rsi]
  165. adc rdx,0
  166. add r13,r10
  167. mov r10,QWORD[r15*8+rsp]
  168. adc rdx,0
  169. mov QWORD[((-16))+r15*8+rsp],r13
  170. mov r13,rdx
  171. xor rdx,rdx
  172. add r13,r11
  173. adc rdx,0
  174. add r13,r10
  175. adc rdx,0
  176. mov QWORD[((-8))+r9*8+rsp],r13
  177. mov QWORD[r9*8+rsp],rdx
  178. lea r14,[1+r14]
  179. cmp r14,r9
  180. jb NEAR $L$outer
  ; Final step: result = temporary - n, one word per iteration.
  ; xor clears CF for the first sbb; mov, lea and dec leave CF untouched,
  ; so the borrow is carried across iterations.
  181. xor r14,r14
  182. mov rax,QWORD[rsp]
  183. mov r15,r9
  184. ALIGN 16
  185. $L$sub: sbb rax,QWORD[r14*8+rcx]
  186. mov QWORD[r14*8+rdi],rax
  187. mov rax,QWORD[8+r14*8+rsp]
  188. lea r14,[1+r14]
  189. dec r15
  190. jnz NEAR $L$sub
  ; rax = top temporary word minus the final borrow; used as a select mask
  ; (expected to be 0 or all-ones -- TODO confirm), rbx = its complement.
  191. sbb rax,0
  192. mov rbx,-1
  193. xor rbx,rax
  194. xor r14,r14
  195. mov r15,r9
  ; Branch-free select: result[i] = (result[i] & rbx) | (temporary[i] & rax).
  ; Each temporary word is overwritten with the value of r9 after it is read.
  196. $L$copy:
  197. mov rcx,QWORD[r14*8+rdi]
  198. mov rdx,QWORD[r14*8+rsp]
  199. and rcx,rbx
  200. and rdx,rax
  201. mov QWORD[r14*8+rsp],r9
  202. or rdx,rcx
  203. mov QWORD[r14*8+rdi],rdx
  204. lea r14,[1+r14]
  205. sub r15,1
  206. jnz NEAR $L$copy
  ; Reload the entry rsp, restore the pushed registers relative to it.
  207. mov rsi,QWORD[8+r9*8+rsp]
  208. mov rax,1 ; return value
  209. mov r15,QWORD[((-48))+rsi]
  210. mov r14,QWORD[((-40))+rsi]
  211. mov r13,QWORD[((-32))+rsi]
  212. mov r12,QWORD[((-24))+rsi]
  213. mov rbp,QWORD[((-16))+rsi]
  214. mov rbx,QWORD[((-8))+rsi]
  215. lea rsp,[rsi]
  216. $L$mul_epilogue:
  217. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  218. mov rsi,QWORD[16+rsp]
  219. DB 0F3h,0C3h ;repret
  220. $L$SEH_end_bn_mul_mont:
  221. ALIGN 16
  ; ---------------------------------------------------------------------
  ; bn_mul4x_mont -- same operation as the generic path in bn_mul_mont,
  ; with the inner loops unrolled to handle four words per iteration.
  ;
  ; Not declared global in this file; the visible way in is the jump to
  ; $L$mul4x_enter from bn_mul_mont's dispatcher, which has already moved
  ; the arguments into rdi/rsi/rdx/rcx/r8/r9, set rax = entry rsp and
  ; loaded r11d from OPENSSL_ia32cap_P+8.
  ; NOTE(review): the Win64 prologue below does not load r11d itself, so
  ; entering at the bn_mul4x_mont label would test an undefined r11d.
  ; Returns rax = 1.
  ; ---------------------------------------------------------------------
  222. bn_mul4x_mont:
  223. mov QWORD[8+rsp],rdi ;WIN64 prologue
  224. mov QWORD[16+rsp],rsi
  225. mov rax,rsp
  226. $L$SEH_begin_bn_mul4x_mont:
  227. mov rdi,rcx
  228. mov rsi,rdx
  229. mov rdx,r8
  230. mov rcx,r9
  231. mov r8,QWORD[40+rsp]
  232. mov r9,QWORD[48+rsp]
  233. mov r9d,r9d
  234. mov rax,rsp
  235. $L$mul4x_enter:
  ; Both bits of mask 0x80100 set -> use the mulx/adcx/adox path.
  ; NOTE(review): presumably the BMI2 and ADX feature bits -- confirm
  ; against the OPENSSL_ia32cap_P layout.
  236. and r11d,0x80100
  237. cmp r11d,0x80100
  238. je NEAR $L$mulx4x_enter
  239. push rbx
  240. push rbp
  241. push r12
  242. push r13
  243. push r14
  244. push r15
  ; Frame: r10 = (rsp - 8*num - 32) rounded down to 1024 bytes, reached by
  ; stepping rsp down one 4096-byte page at a time and touching each page.
  245. neg r9
  246. mov r11,rsp
  247. lea r10,[((-32))+r9*8+rsp]
  248. neg r9
  249. and r10,-1024
  250. sub r11,r10
  251. and r11,-4096
  252. lea rsp,[r11*1+r10]
  253. mov r11,QWORD[rsp]
  254. cmp rsp,r10
  255. ja NEAR $L$mul4x_page_walk
  256. jmp NEAR $L$mul4x_page_walk_done
  257. $L$mul4x_page_walk:
  258. lea rsp,[((-4096))+rsp]
  259. mov r11,QWORD[rsp]
  260. cmp rsp,r10
  261. ja NEAR $L$mul4x_page_walk
  262. $L$mul4x_page_walk_done:
  263. mov QWORD[8+r9*8+rsp],rax ; entry rsp (same slot mul_handler reads)
  264. $L$mul4x_body:
  ; rdi is reused as an accumulator below, so the result pointer is parked
  ; in the frame and reloaded before the final subtraction.
  265. mov QWORD[16+r9*8+rsp],rdi
  ; Roles: r12 = b pointer, r8 = n0 value, rbx = b[i], rbp = row multiplier,
  ; r14 = outer index i, r15 = inner index j, [rsp..] = temporary vector.
  266. mov r12,rdx
  267. mov r8,QWORD[r8]
  268. mov rbx,QWORD[r12]
  269. mov rax,QWORD[rsi]
  270. xor r14,r14
  271. xor r15,r15
  272. mov rbp,r8
  273. mul rbx ; a[0]*b[0]
  274. mov r10,rax
  275. mov rax,QWORD[rcx]
  276. imul rbp,r10 ; rbp = n0 * low word
  277. mov r11,rdx
  278. mul rbp ; n[0]*rbp
  279. add r10,rax ; only the carry is kept
  280. mov rax,QWORD[8+rsi]
  281. adc rdx,0
  282. mov rdi,rdx
  283. mul rbx
  284. add r11,rax
  285. mov rax,QWORD[8+rcx]
  286. adc rdx,0
  287. mov r10,rdx
  288. mul rbp
  289. add rdi,rax
  290. mov rax,QWORD[16+rsi]
  291. adc rdx,0
  292. add rdi,r11
  293. lea r15,[4+r15]
  294. adc rdx,0
  295. mov QWORD[rsp],rdi
  296. mov r13,rdx
  297. jmp NEAR $L$1st4x
  298. ALIGN 16
  ; First row, four words per pass (r15 advances by 4).
  299. $L$1st4x:
  300. mul rbx
  301. add r10,rax
  302. mov rax,QWORD[((-16))+r15*8+rcx]
  303. adc rdx,0
  304. mov r11,rdx
  305. mul rbp
  306. add r13,rax
  307. mov rax,QWORD[((-8))+r15*8+rsi]
  308. adc rdx,0
  309. add r13,r10
  310. adc rdx,0
  311. mov QWORD[((-24))+r15*8+rsp],r13
  312. mov rdi,rdx
  313. mul rbx
  314. add r11,rax
  315. mov rax,QWORD[((-8))+r15*8+rcx]
  316. adc rdx,0
  317. mov r10,rdx
  318. mul rbp
  319. add rdi,rax
  320. mov rax,QWORD[r15*8+rsi]
  321. adc rdx,0
  322. add rdi,r11
  323. adc rdx,0
  324. mov QWORD[((-16))+r15*8+rsp],rdi
  325. mov r13,rdx
  326. mul rbx
  327. add r10,rax
  328. mov rax,QWORD[r15*8+rcx]
  329. adc rdx,0
  330. mov r11,rdx
  331. mul rbp
  332. add r13,rax
  333. mov rax,QWORD[8+r15*8+rsi]
  334. adc rdx,0
  335. add r13,r10
  336. adc rdx,0
  337. mov QWORD[((-8))+r15*8+rsp],r13
  338. mov rdi,rdx
  339. mul rbx
  340. add r11,rax
  341. mov rax,QWORD[8+r15*8+rcx]
  342. adc rdx,0
  343. lea r15,[4+r15]
  344. mov r10,rdx
  345. mul rbp
  346. add rdi,rax
  347. mov rax,QWORD[((-16))+r15*8+rsi]
  348. adc rdx,0
  349. add rdi,r11
  350. adc rdx,0
  351. mov QWORD[((-32))+r15*8+rsp],rdi
  352. mov r13,rdx
  353. cmp r15,r9
  354. jb NEAR $L$1st4x
  ; First-row tail: last two words plus the top carry words.
  355. mul rbx
  356. add r10,rax
  357. mov rax,QWORD[((-16))+r15*8+rcx]
  358. adc rdx,0
  359. mov r11,rdx
  360. mul rbp
  361. add r13,rax
  362. mov rax,QWORD[((-8))+r15*8+rsi]
  363. adc rdx,0
  364. add r13,r10
  365. adc rdx,0
  366. mov QWORD[((-24))+r15*8+rsp],r13
  367. mov rdi,rdx
  368. mul rbx
  369. add r11,rax
  370. mov rax,QWORD[((-8))+r15*8+rcx]
  371. adc rdx,0
  372. mov r10,rdx
  373. mul rbp
  374. add rdi,rax
  375. mov rax,QWORD[rsi]
  376. adc rdx,0
  377. add rdi,r11
  378. adc rdx,0
  379. mov QWORD[((-16))+r15*8+rsp],rdi
  380. mov r13,rdx
  381. xor rdi,rdi
  382. add r13,r10
  383. adc rdi,0
  384. mov QWORD[((-8))+r15*8+rsp],r13
  385. mov QWORD[r15*8+rsp],rdi
  386. lea r14,[1+r14]
  387. ALIGN 4
  ; Rows i = 1 .. num-1: as above, additionally accumulating the words
  ; already stored in the temporary vector.
  388. $L$outer4x:
  389. mov rbx,QWORD[r14*8+r12]
  390. xor r15,r15
  391. mov r10,QWORD[rsp]
  392. mov rbp,r8
  393. mul rbx
  394. add r10,rax
  395. mov rax,QWORD[rcx]
  396. adc rdx,0
  397. imul rbp,r10
  398. mov r11,rdx
  399. mul rbp
  400. add r10,rax
  401. mov rax,QWORD[8+rsi]
  402. adc rdx,0
  403. mov rdi,rdx
  404. mul rbx
  405. add r11,rax
  406. mov rax,QWORD[8+rcx]
  407. adc rdx,0
  408. add r11,QWORD[8+rsp]
  409. adc rdx,0
  410. mov r10,rdx
  411. mul rbp
  412. add rdi,rax
  413. mov rax,QWORD[16+rsi]
  414. adc rdx,0
  415. add rdi,r11
  416. lea r15,[4+r15]
  417. adc rdx,0
  418. mov QWORD[rsp],rdi
  419. mov r13,rdx
  420. jmp NEAR $L$inner4x
  421. ALIGN 16
  422. $L$inner4x:
  423. mul rbx
  424. add r10,rax
  425. mov rax,QWORD[((-16))+r15*8+rcx]
  426. adc rdx,0
  427. add r10,QWORD[((-16))+r15*8+rsp]
  428. adc rdx,0
  429. mov r11,rdx
  430. mul rbp
  431. add r13,rax
  432. mov rax,QWORD[((-8))+r15*8+rsi]
  433. adc rdx,0
  434. add r13,r10
  435. adc rdx,0
  436. mov QWORD[((-24))+r15*8+rsp],r13
  437. mov rdi,rdx
  438. mul rbx
  439. add r11,rax
  440. mov rax,QWORD[((-8))+r15*8+rcx]
  441. adc rdx,0
  442. add r11,QWORD[((-8))+r15*8+rsp]
  443. adc rdx,0
  444. mov r10,rdx
  445. mul rbp
  446. add rdi,rax
  447. mov rax,QWORD[r15*8+rsi]
  448. adc rdx,0
  449. add rdi,r11
  450. adc rdx,0
  451. mov QWORD[((-16))+r15*8+rsp],rdi
  452. mov r13,rdx
  453. mul rbx
  454. add r10,rax
  455. mov rax,QWORD[r15*8+rcx]
  456. adc rdx,0
  457. add r10,QWORD[r15*8+rsp]
  458. adc rdx,0
  459. mov r11,rdx
  460. mul rbp
  461. add r13,rax
  462. mov rax,QWORD[8+r15*8+rsi]
  463. adc rdx,0
  464. add r13,r10
  465. adc rdx,0
  466. mov QWORD[((-8))+r15*8+rsp],r13
  467. mov rdi,rdx
  468. mul rbx
  469. add r11,rax
  470. mov rax,QWORD[8+r15*8+rcx]
  471. adc rdx,0
  472. add r11,QWORD[8+r15*8+rsp]
  473. adc rdx,0
  474. lea r15,[4+r15]
  475. mov r10,rdx
  476. mul rbp
  477. add rdi,rax
  478. mov rax,QWORD[((-16))+r15*8+rsi]
  479. adc rdx,0
  480. add rdi,r11
  481. adc rdx,0
  482. mov QWORD[((-32))+r15*8+rsp],rdi
  483. mov r13,rdx
  484. cmp r15,r9
  485. jb NEAR $L$inner4x
  ; Row tail: last two words, then fold in the previous row's top word.
  486. mul rbx
  487. add r10,rax
  488. mov rax,QWORD[((-16))+r15*8+rcx]
  489. adc rdx,0
  490. add r10,QWORD[((-16))+r15*8+rsp]
  491. adc rdx,0
  492. mov r11,rdx
  493. mul rbp
  494. add r13,rax
  495. mov rax,QWORD[((-8))+r15*8+rsi]
  496. adc rdx,0
  497. add r13,r10
  498. adc rdx,0
  499. mov QWORD[((-24))+r15*8+rsp],r13
  500. mov rdi,rdx
  501. mul rbx
  502. add r11,rax
  503. mov rax,QWORD[((-8))+r15*8+rcx]
  504. adc rdx,0
  505. add r11,QWORD[((-8))+r15*8+rsp]
  506. adc rdx,0
  507. lea r14,[1+r14]
  508. mov r10,rdx
  509. mul rbp
  510. add rdi,rax
  511. mov rax,QWORD[rsi]
  512. adc rdx,0
  513. add rdi,r11
  514. adc rdx,0
  515. mov QWORD[((-16))+r15*8+rsp],rdi
  516. mov r13,rdx
  517. xor rdi,rdi
  518. add r13,r10
  519. adc rdi,0
  520. add r13,QWORD[r9*8+rsp]
  521. adc rdi,0
  522. mov QWORD[((-8))+r15*8+rsp],r13
  523. mov QWORD[r15*8+rsp],rdi
  524. cmp r14,r9
  525. jb NEAR $L$outer4x
  ; Final subtraction result = temporary - n, four words per pass.
  ; r15 = (num-4)/4 loop passes; the remaining words are handled after the
  ; loop.  The borrow lives in CF throughout: only mov, lea and dec sit
  ; between the sbb instructions, and none of them writes CF.
  526. mov rdi,QWORD[16+r9*8+rsp] ; reload the parked result pointer
  527. lea r15,[((-4))+r9]
  528. mov rax,QWORD[rsp]
  529. mov rdx,QWORD[8+rsp]
  530. shr r15,2
  531. lea rsi,[rsp]
  532. xor r14,r14
  533. sub rax,QWORD[rcx]
  534. mov rbx,QWORD[16+rsi]
  535. mov rbp,QWORD[24+rsi]
  536. sbb rdx,QWORD[8+rcx]
  537. $L$sub4x:
  538. mov QWORD[r14*8+rdi],rax
  539. mov QWORD[8+r14*8+rdi],rdx
  540. sbb rbx,QWORD[16+r14*8+rcx]
  541. mov rax,QWORD[32+r14*8+rsi]
  542. mov rdx,QWORD[40+r14*8+rsi]
  543. sbb rbp,QWORD[24+r14*8+rcx]
  544. mov QWORD[16+r14*8+rdi],rbx
  545. mov QWORD[24+r14*8+rdi],rbp
  546. sbb rax,QWORD[32+r14*8+rcx]
  547. mov rbx,QWORD[48+r14*8+rsi]
  548. mov rbp,QWORD[56+r14*8+rsi]
  549. sbb rdx,QWORD[40+r14*8+rcx]
  550. lea r14,[4+r14]
  551. dec r15
  552. jnz NEAR $L$sub4x
  553. mov QWORD[r14*8+rdi],rax
  554. mov rax,QWORD[32+r14*8+rsi] ; top word of the temporary vector
  555. sbb rbx,QWORD[16+r14*8+rcx]
  556. mov QWORD[8+r14*8+rdi],rdx
  557. sbb rbp,QWORD[24+r14*8+rcx]
  558. mov QWORD[16+r14*8+rdi],rbx
  559. sbb rax,0 ; rax = top word - borrow: the select mask
  560. mov QWORD[24+r14*8+rdi],rbp
  ; Build SSE masks: xmm4 = low dword of rax broadcast to all lanes,
  ; xmm5 = its complement, xmm0 = zero (used to wipe the temporary).
  561. pxor xmm0,xmm0
  ; The DB below encodes: movq xmm4,rax
  562. DB 102,72,15,110,224
  563. pcmpeqd xmm5,xmm5
  564. pshufd xmm4,xmm4,0
  565. mov r15,r9
  566. pxor xmm5,xmm4
  567. shr r15,2 ; num/4 passes of 32 bytes each
  568. xor eax,eax ; rax = byte offset into both vectors
  569. jmp NEAR $L$copy4x
  570. ALIGN 16
  ; Branch-free select, 32 bytes per pass:
  ;   result = (temporary & xmm4) | (result & xmm5); temporary is zeroed.
  571. $L$copy4x:
  572. movdqa xmm1,XMMWORD[rax*1+rsp]
  573. movdqu xmm2,XMMWORD[rax*1+rdi]
  574. pand xmm1,xmm4
  575. pand xmm2,xmm5
  576. movdqa xmm3,XMMWORD[16+rax*1+rsp]
  577. movdqa XMMWORD[rax*1+rsp],xmm0
  578. por xmm1,xmm2
  579. movdqu xmm2,XMMWORD[16+rax*1+rdi]
  580. movdqu XMMWORD[rax*1+rdi],xmm1
  581. pand xmm3,xmm4
  582. pand xmm2,xmm5
  583. movdqa XMMWORD[16+rax*1+rsp],xmm0
  584. por xmm3,xmm2
  585. movdqu XMMWORD[16+rax*1+rdi],xmm3
  586. lea rax,[32+rax]
  587. dec r15
  588. jnz NEAR $L$copy4x
  ; Reload the entry rsp and restore the pushed registers relative to it.
  589. mov rsi,QWORD[8+r9*8+rsp]
  590. mov rax,1 ; return value
  591. mov r15,QWORD[((-48))+rsi]
  592. mov r14,QWORD[((-40))+rsi]
  593. mov r13,QWORD[((-32))+rsi]
  594. mov r12,QWORD[((-24))+rsi]
  595. mov rbp,QWORD[((-16))+rsi]
  596. mov rbx,QWORD[((-8))+rsi]
  597. lea rsp,[rsi]
  598. $L$mul4x_epilogue:
  599. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  600. mov rsi,QWORD[16+rsp]
  601. DB 0F3h,0C3h ;repret
  602. $L$SEH_end_bn_mul4x_mont:
  603. EXTERN bn_sqrx8x_internal
  604. EXTERN bn_sqr8x_internal
  605. ALIGN 32
  ; ---------------------------------------------------------------------
  ; bn_sqr8x_mont -- squaring path.  Sets up a stack frame, calls one of
  ; the external routines bn_sqrx8x_internal / bn_sqr8x_internal, then
  ; performs the final subtraction and branch-free select here.
  ;
  ; Not declared global in this file; bn_mul_mont jumps to $L$sqr8x_enter
  ; when both operand pointers are equal and num is a multiple of 8, with
  ; the arguments already in rdi/rsi/rdx/rcx/r8/r9 and rax = entry rsp.
  ; Frame slots written here: [32+rsp] = n0 value, [40+rsp] = entry rsp
  ; (the latter is what sqr_handler reads).
  ; Returns rax = 1.
  ; NOTE(review): the register state after the two external calls is not
  ; visible in this file; comments about it below are assumptions.
  ; ---------------------------------------------------------------------
  606. bn_sqr8x_mont:
  607. mov QWORD[8+rsp],rdi ;WIN64 prologue
  608. mov QWORD[16+rsp],rsi
  609. mov rax,rsp
  610. $L$SEH_begin_bn_sqr8x_mont:
  611. mov rdi,rcx
  612. mov rsi,rdx
  613. mov rdx,r8
  614. mov rcx,r9
  615. mov r8,QWORD[40+rsp]
  616. mov r9,QWORD[48+rsp]
  617. mov rax,rsp
  618. $L$sqr8x_enter:
  619. push rbx
  620. push rbp
  621. push r12
  622. push r13
  623. push r14
  624. push r15
  625. $L$sqr8x_prologue:
  ; r10 = num*32, r9 = -(num*8).  The 32-bit writes zero-extend num.
  626. mov r10d,r9d
  627. shl r9d,3
  628. shl r10,3+2
  629. neg r9
  ; Choose the frame base rbp (room for 2*num words plus 64 bytes) from the
  ; distance, modulo 4096, between the candidate frame and the input at rsi.
  ; NOTE(review): presumably avoids the frame aliasing the input at page
  ; granularity -- confirm against the generating Perl script.
  630. lea r11,[((-64))+r9*2+rsp]
  631. mov rbp,rsp
  632. mov r8,QWORD[r8] ; r8 = n0 value
  633. sub r11,rsi
  634. and r11,4095
  635. cmp r10,r11
  636. jb NEAR $L$sqr8x_sp_alt
  637. sub rbp,r11
  638. lea rbp,[((-64))+r9*2+rbp]
  639. jmp NEAR $L$sqr8x_sp_done
  640. ALIGN 32
  641. $L$sqr8x_sp_alt:
  642. lea r10,[((4096-64))+r9*2]
  643. lea rbp,[((-64))+r9*2+rbp]
  644. sub r11,r10
  ; mov (not xor) is used to zero r10 so the CF produced by the sub above
  ; is still intact for cmovc.
  645. mov r10,0
  646. cmovc r11,r10
  647. sub rbp,r11
  648. $L$sqr8x_sp_done:
  649. and rbp,-64 ; 64-byte align the frame base
  ; Step rsp down to rbp one 4096-byte page at a time, touching each page.
  650. mov r11,rsp
  651. sub r11,rbp
  652. and r11,-4096
  653. lea rsp,[rbp*1+r11]
  654. mov r10,QWORD[rsp]
  655. cmp rsp,rbp
  656. ja NEAR $L$sqr8x_page_walk
  657. jmp NEAR $L$sqr8x_page_walk_done
  658. ALIGN 16
  659. $L$sqr8x_page_walk:
  660. lea rsp,[((-4096))+rsp]
  661. mov r10,QWORD[rsp]
  662. cmp rsp,rbp
  663. ja NEAR $L$sqr8x_page_walk
  664. $L$sqr8x_page_walk_done:
  665. mov r10,r9 ; r10 = -(num*8)
  666. neg r9 ; r9 = num*8
  667. mov QWORD[32+rsp],r8
  668. mov QWORD[40+rsp],rax
  669. $L$sqr8x_body:
  ; The DB lines are hand-encoded movq instructions:
  ;   102,72,15,110,209 = movq xmm2,rcx
  ;   102,72,15,110,207 = movq xmm1,rdi
  ;   102,73,15,110,218 = movq xmm3,r10
  670. DB 102,72,15,110,209
  671. pxor xmm0,xmm0
  672. DB 102,72,15,110,207
  673. DB 102,73,15,110,218
  ; Same 0x80100 test as at $L$mul4x_enter selects the callee.
  674. lea rax,[OPENSSL_ia32cap_P]
  675. mov eax,DWORD[8+rax]
  676. and eax,0x80100
  677. cmp eax,0x80100
  678. jne NEAR $L$sqr8x_nox
  679. call bn_sqrx8x_internal
  ; NOTE(review): rcx, r8, rbp and rax are taken from the callee here --
  ; presumably negated byte length, temporary base, n pointer, top carry.
  680. lea rbx,[rcx*1+r8]
  681. mov r9,rcx
  682. mov rdx,rcx
  ; The DB below encodes: movq rdi,xmm1 (recover the result pointer)
  683. DB 102,72,15,126,207
  684. sar rcx,3+2 ; loop count in 32-byte units (negative, counts up)
  685. jmp NEAR $L$sqr8x_sub
  686. ALIGN 32
  687. $L$sqr8x_nox:
  688. call bn_sqr8x_internal
  ; NOTE(review): r9, rdi, rbp and rax are taken from the callee here --
  ; same assumed meanings as on the other path.
  689. lea rbx,[r9*1+rdi]
  690. mov rcx,r9
  691. mov rdx,r9
  ; The DB below encodes: movq rdi,xmm1 (recover the result pointer)
  692. DB 102,72,15,126,207
  693. sar rcx,3+2
  694. jmp NEAR $L$sqr8x_sub
  695. ALIGN 32
  ; result = [rbx] - [rbp], four words per pass.  inc and lea leave CF
  ; alone, so the borrow chains across passes.  The initial CF is whatever
  ; the sar above left (the last bit shifted out).
  696. $L$sqr8x_sub:
  697. mov r12,QWORD[rbx]
  698. mov r13,QWORD[8+rbx]
  699. mov r14,QWORD[16+rbx]
  700. mov r15,QWORD[24+rbx]
  701. lea rbx,[32+rbx]
  702. sbb r12,QWORD[rbp]
  703. sbb r13,QWORD[8+rbp]
  704. sbb r14,QWORD[16+rbp]
  705. sbb r15,QWORD[24+rbp]
  706. lea rbp,[32+rbp]
  707. mov QWORD[rdi],r12
  708. mov QWORD[8+rdi],r13
  709. mov QWORD[16+rdi],r14
  710. mov QWORD[24+rdi],r15
  711. lea rdi,[32+rdi]
  712. inc rcx
  713. jnz NEAR $L$sqr8x_sub
  714. sbb rax,0 ; fold the final borrow into rax -> select mask
  715. lea rbx,[r9*1+rbx] ; rewind both pointers by adding r9
  716. lea rdi,[r9*1+rdi]
  ; The DB below encodes: movq xmm1,rax
  717. DB 102,72,15,110,200
  718. pxor xmm0,xmm0
  719. pshufd xmm1,xmm1,0 ; broadcast the mask's low dword
  720. mov rsi,QWORD[40+rsp] ; rsi = entry rsp for the epilogue
  721. jmp NEAR $L$sqr8x_cond_copy
  722. ALIGN 32
  ; Branch-free select, 32 bytes per pass:
  ;   result = (temporary & xmm1) | (result & ~mask)
  ; where ~mask is built by pcmpeqd of zero against xmm1.  The temporary
  ; words just read, and the 32 bytes at the same position offset by rdx,
  ; are overwritten with zero.
  723. $L$sqr8x_cond_copy:
  724. movdqa xmm2,XMMWORD[rbx]
  725. movdqa xmm3,XMMWORD[16+rbx]
  726. lea rbx,[32+rbx]
  727. movdqu xmm4,XMMWORD[rdi]
  728. movdqu xmm5,XMMWORD[16+rdi]
  729. lea rdi,[32+rdi]
  730. movdqa XMMWORD[(-32)+rbx],xmm0
  731. movdqa XMMWORD[(-16)+rbx],xmm0
  732. movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0
  733. movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0
  734. pcmpeqd xmm0,xmm1
  735. pand xmm2,xmm1
  736. pand xmm3,xmm1
  737. pand xmm4,xmm0
  738. pand xmm5,xmm0
  739. pxor xmm0,xmm0
  740. por xmm4,xmm2
  741. por xmm5,xmm3
  742. movdqu XMMWORD[(-32)+rdi],xmm4
  743. movdqu XMMWORD[(-16)+rdi],xmm5
  744. add r9,32 ; r9 is negative here and counts up to zero
  745. jnz NEAR $L$sqr8x_cond_copy
  746. mov rax,1 ; return value
  747. mov r15,QWORD[((-48))+rsi]
  748. mov r14,QWORD[((-40))+rsi]
  749. mov r13,QWORD[((-32))+rsi]
  750. mov r12,QWORD[((-24))+rsi]
  751. mov rbp,QWORD[((-16))+rsi]
  752. mov rbx,QWORD[((-8))+rsi]
  753. lea rsp,[rsi]
  754. $L$sqr8x_epilogue:
  755. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  756. mov rsi,QWORD[16+rsp]
  757. DB 0F3h,0C3h ;repret
  758. $L$SEH_end_bn_sqr8x_mont:
  759. ALIGN 32
  ; ---------------------------------------------------------------------
  ; bn_mulx4x_mont -- four-words-per-pass multiply built on mulx with two
  ; independent carry chains (adcx uses CF, adox uses OF).
  ;
  ; Not declared global in this file; reached via $L$mulx4x_enter from
  ; $L$mul4x_enter when both bits of mask 0x80100 are set, with the
  ; arguments in rdi/rsi/rdx/rcx/r8/r9 and rax = entry rsp.
  ; Frame slots written at $L$mulx4x_page_walk_done:
  ;   [rsp]    = num*8 (byte length)
  ;   [8+rsp]  = pointer to the next word of the second operand
  ;   [16+rsp] = second operand pointer + num*8 (outer-loop end marker)
  ;   [24+rsp] = n0 value
  ;   [32+rsp] = result pointer
  ;   [40+rsp] = entry rsp (read by sqr_handler)
  ;   [48+rsp] = num/4 - 1 (inner-loop pass count)
  ;   [64+rsp] onward = temporary vector
  ; Returns rax = 1.
  ; ---------------------------------------------------------------------
  760. bn_mulx4x_mont:
  761. mov QWORD[8+rsp],rdi ;WIN64 prologue
  762. mov QWORD[16+rsp],rsi
  763. mov rax,rsp
  764. $L$SEH_begin_bn_mulx4x_mont:
  765. mov rdi,rcx
  766. mov rsi,rdx
  767. mov rdx,r8
  768. mov rcx,r9
  769. mov r8,QWORD[40+rsp]
  770. mov r9,QWORD[48+rsp]
  771. mov rax,rsp
  772. $L$mulx4x_enter:
  773. push rbx
  774. push rbp
  775. push r12
  776. push r13
  777. push r14
  778. push r15
  779. $L$mulx4x_prologue:
  ; r9 = num*8; frame base rbp = (rsp - num*8 - 72) rounded down to 128.
  780. shl r9d,3
  781. xor r10,r10
  782. sub r10,r9
  783. mov r8,QWORD[r8] ; r8 = n0 value
  784. lea rbp,[((-72))+r10*1+rsp]
  785. and rbp,-128
  ; Step rsp down to rbp one 4096-byte page at a time, touching each page.
  786. mov r11,rsp
  787. sub r11,rbp
  788. and r11,-4096
  789. lea rsp,[rbp*1+r11]
  790. mov r10,QWORD[rsp]
  791. cmp rsp,rbp
  792. ja NEAR $L$mulx4x_page_walk
  793. jmp NEAR $L$mulx4x_page_walk_done
  794. ALIGN 16
  795. $L$mulx4x_page_walk:
  796. lea rsp,[((-4096))+rsp]
  797. mov r10,QWORD[rsp]
  798. cmp rsp,rbp
  799. ja NEAR $L$mulx4x_page_walk
  800. $L$mulx4x_page_walk_done:
  801. lea r10,[r9*1+rdx]
  802. mov QWORD[rsp],r9
  803. shr r9,5
  804. mov QWORD[16+rsp],r10
  805. sub r9,1
  806. mov QWORD[24+rsp],r8
  807. mov QWORD[32+rsp],rdi
  808. mov QWORD[40+rsp],rax
  809. mov QWORD[48+rsp],r9
  810. jmp NEAR $L$mulx4x_body
  811. ALIGN 32
  812. $L$mulx4x_body:
  ; First row.  rdx is mulx's implicit multiplicand: it alternates between
  ; the current second-operand word (copy kept in r9) and the row
  ; multiplier (kept in r8).  rbp is held at zero for carry propagation,
  ; rbx walks the temporary vector, rdi becomes the pass counter.
  813. lea rdi,[8+rdx]
  814. mov rdx,QWORD[rdx]
  815. lea rbx,[((64+32))+rsp]
  816. mov r9,rdx
  817. mulx rax,r8,QWORD[rsi]
  818. mulx r14,r11,QWORD[8+rsi]
  819. add r11,rax
  820. mov QWORD[8+rsp],rdi
  821. mulx r13,r12,QWORD[16+rsi]
  822. adc r12,r14
  823. adc r13,0
  824. mov rdi,r8
  825. imul r8,QWORD[24+rsp] ; r8 = low word * n0
  826. xor rbp,rbp ; rbp = 0; also clears CF and OF for adcx/adox
  827. mulx r14,rax,QWORD[24+rsi]
  828. mov rdx,r8
  829. lea rsi,[32+rsi]
  830. adcx r13,rax
  831. adcx r14,rbp
  832. mulx r10,rax,QWORD[rcx]
  833. adcx rdi,rax
  834. adox r10,r11
  835. mulx r11,rax,QWORD[8+rcx]
  836. adcx r10,rax
  837. adox r11,r12
  ; The DB below encodes: mulx r12,rax,QWORD[16+rcx] (with a 32-bit
  ; displacement)
  838. DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
  839. mov rdi,QWORD[48+rsp]
  840. mov QWORD[((-32))+rbx],r10
  841. adcx r11,rax
  842. adox r12,r13
  843. mulx r15,rax,QWORD[24+rcx]
  844. mov rdx,r9
  845. mov QWORD[((-24))+rbx],r11
  846. adcx r12,rax
  847. adox r15,rbp
  848. lea rcx,[32+rcx]
  849. mov QWORD[((-16))+rbx],r12
  850. jmp NEAR $L$mulx4x_1st
  851. ALIGN 32
  ; First row, remaining passes: four words each, rdi counts down.
  ; lea, mov and dec between the adcx/adox instructions do not write CF or
  ; OF in a way the chains depend on (dec leaves CF untouched).
  852. $L$mulx4x_1st:
  853. adcx r15,rbp
  854. mulx rax,r10,QWORD[rsi]
  855. adcx r10,r14
  856. mulx r14,r11,QWORD[8+rsi]
  857. adcx r11,rax
  858. mulx rax,r12,QWORD[16+rsi]
  859. adcx r12,r14
  860. mulx r14,r13,QWORD[24+rsi]
  ; Two 0x67 (address-size) prefix bytes in front of the next instruction.
  ; NOTE(review): presumably instruction-length padding -- confirm.
  861. DB 0x67,0x67
  862. mov rdx,r8
  863. adcx r13,rax
  864. adcx r14,rbp
  865. lea rsi,[32+rsi]
  866. lea rbx,[32+rbx]
  867. adox r10,r15
  868. mulx r15,rax,QWORD[rcx]
  869. adcx r10,rax
  870. adox r11,r15
  871. mulx r15,rax,QWORD[8+rcx]
  872. adcx r11,rax
  873. adox r12,r15
  874. mulx r15,rax,QWORD[16+rcx]
  875. mov QWORD[((-40))+rbx],r10
  876. adcx r12,rax
  877. mov QWORD[((-32))+rbx],r11
  878. adox r13,r15
  879. mulx r15,rax,QWORD[24+rcx]
  880. mov rdx,r9
  881. mov QWORD[((-24))+rbx],r12
  882. adcx r13,rax
  883. adox r15,rbp
  884. lea rcx,[32+rcx]
  885. mov QWORD[((-16))+rbx],r13
  886. dec rdi
  887. jnz NEAR $L$mulx4x_1st
  ; End of first row: rax = byte length, rdi = next second-operand word.
  ; r15 ends up 0 or all-ones and records the row's top carry.
  888. mov rax,QWORD[rsp]
  889. mov rdi,QWORD[8+rsp]
  890. adc r15,rbp
  891. add r14,r15
  892. sbb r15,r15
  893. mov QWORD[((-8))+rbx],r14
  894. jmp NEAR $L$mulx4x_outer
  895. ALIGN 32
  ; Remaining rows: rewind rsi/rcx by the byte length, store the previous
  ; top carry at [rbx], restart rbx at the head of the temporary vector.
  896. $L$mulx4x_outer:
  897. mov rdx,QWORD[rdi]
  898. lea rdi,[8+rdi]
  899. sub rsi,rax
  900. mov QWORD[rbx],r15
  901. lea rbx,[((64+32))+rsp]
  902. sub rcx,rax
  903. mulx r11,r8,QWORD[rsi]
  904. xor ebp,ebp ; rbp = 0, CF = OF = 0
  905. mov r9,rdx
  906. mulx r12,r14,QWORD[8+rsi]
  907. adox r8,QWORD[((-32))+rbx]
  908. adcx r11,r14
  909. mulx r13,r15,QWORD[16+rsi]
  910. adox r11,QWORD[((-24))+rbx]
  911. adcx r12,r15
  912. adox r12,QWORD[((-16))+rbx]
  913. adcx r13,rbp
  914. adox r13,rbp
  915. mov QWORD[8+rsp],rdi
  916. mov r15,r8
  917. imul r8,QWORD[24+rsp] ; r8 = low word * n0
  918. xor ebp,ebp ; reset both carry chains
  919. mulx r14,rax,QWORD[24+rsi]
  920. mov rdx,r8
  921. adcx r13,rax
  922. adox r13,QWORD[((-8))+rbx]
  923. adcx r14,rbp
  924. lea rsi,[32+rsi]
  925. adox r14,rbp
  926. mulx r10,rax,QWORD[rcx]
  927. adcx r15,rax
  928. adox r10,r11
  929. mulx r11,rax,QWORD[8+rcx]
  930. adcx r10,rax
  931. adox r11,r12
  932. mulx r12,rax,QWORD[16+rcx]
  933. mov QWORD[((-32))+rbx],r10
  934. adcx r11,rax
  935. adox r12,r13
  936. mulx r15,rax,QWORD[24+rcx]
  937. mov rdx,r9
  938. mov QWORD[((-24))+rbx],r11
  939. lea rcx,[32+rcx]
  940. adcx r12,rax
  941. adox r15,rbp
  942. mov rdi,QWORD[48+rsp] ; rdi = inner pass count
  943. mov QWORD[((-16))+rbx],r12
  944. jmp NEAR $L$mulx4x_inner
  945. ALIGN 32
  946. $L$mulx4x_inner:
  947. mulx rax,r10,QWORD[rsi]
  948. adcx r15,rbp
  949. adox r10,r14
  950. mulx r14,r11,QWORD[8+rsi]
  951. adcx r10,QWORD[rbx]
  952. adox r11,rax
  953. mulx rax,r12,QWORD[16+rsi]
  954. adcx r11,QWORD[8+rbx]
  955. adox r12,r14
  956. mulx r14,r13,QWORD[24+rsi]
  957. mov rdx,r8
  958. adcx r12,QWORD[16+rbx]
  959. adox r13,rax
  960. adcx r13,QWORD[24+rbx]
  961. adox r14,rbp
  962. lea rsi,[32+rsi]
  963. lea rbx,[32+rbx]
  964. adcx r14,rbp
  965. adox r10,r15
  966. mulx r15,rax,QWORD[rcx]
  967. adcx r10,rax
  968. adox r11,r15
  969. mulx r15,rax,QWORD[8+rcx]
  970. adcx r11,rax
  971. adox r12,r15
  972. mulx r15,rax,QWORD[16+rcx]
  973. mov QWORD[((-40))+rbx],r10
  974. adcx r12,rax
  975. adox r13,r15
  976. mulx r15,rax,QWORD[24+rcx]
  977. mov rdx,r9
  978. mov QWORD[((-32))+rbx],r11
  979. mov QWORD[((-24))+rbx],r12
  980. adcx r13,rax
  981. adox r15,rbp
  982. lea rcx,[32+rcx]
  983. mov QWORD[((-16))+rbx],r13
  984. dec rdi
  985. jnz NEAR $L$mulx4x_inner
  ; End of row.  "sub rbp,[rbx]" computes 0 - (stored top carry), which sets
  ; CF when that word is non-zero; the following adc consumes it and
  ; sbb r15,r15 captures the new top carry as 0 / all-ones.
  986. mov rax,QWORD[rsp]
  987. mov rdi,QWORD[8+rsp]
  988. adc r15,rbp
  989. sub rbp,QWORD[rbx]
  990. adc r14,r15
  991. sbb r15,r15
  992. mov QWORD[((-8))+rbx],r14
  993. cmp rdi,QWORD[16+rsp] ; all second-operand words consumed?
  994. jne NEAR $L$mulx4x_outer
  ; Final subtraction setup: rbx = temporary, rcx = n (rewound),
  ; rdi = result pointer, rdx = byte length, rax = length/32 passes.
  995. lea rbx,[64+rsp]
  996. sub rcx,rax
  997. neg r15
  998. mov rdx,rax
  999. shr rax,3+2
  1000. mov rdi,QWORD[32+rsp]
  1001. jmp NEAR $L$mulx4x_sub
  1002. ALIGN 32
  ; result = temporary - n, four words per pass; lea/mov/dec leave CF
  ; untouched so the borrow chains across passes.
  1003. $L$mulx4x_sub:
  1004. mov r11,QWORD[rbx]
  1005. mov r12,QWORD[8+rbx]
  1006. mov r13,QWORD[16+rbx]
  1007. mov r14,QWORD[24+rbx]
  1008. lea rbx,[32+rbx]
  1009. sbb r11,QWORD[rcx]
  1010. sbb r12,QWORD[8+rcx]
  1011. sbb r13,QWORD[16+rcx]
  1012. sbb r14,QWORD[24+rcx]
  1013. lea rcx,[32+rcx]
  1014. mov QWORD[rdi],r11
  1015. mov QWORD[8+rdi],r12
  1016. mov QWORD[16+rdi],r13
  1017. mov QWORD[24+rdi],r14
  1018. lea rdi,[32+rdi]
  1019. dec rax
  1020. jnz NEAR $L$mulx4x_sub
  1021. sbb r15,0 ; top carry minus final borrow -> select mask
  1022. lea rbx,[64+rsp]
  1023. sub rdi,rdx ; rewind the result pointer
  ; The DB below encodes: movq xmm1,r15
  1024. DB 102,73,15,110,207
  1025. pxor xmm0,xmm0
  1026. pshufd xmm1,xmm1,0 ; broadcast the mask's low dword
  1027. mov rsi,QWORD[40+rsp] ; rsi = entry rsp for the epilogue
  1028. jmp NEAR $L$mulx4x_cond_copy
  1029. ALIGN 32
  ; Branch-free select, 32 bytes per pass:
  ;   result = (temporary & xmm1) | (result & ~mask); temporary is zeroed.
  1030. $L$mulx4x_cond_copy:
  1031. movdqa xmm2,XMMWORD[rbx]
  1032. movdqa xmm3,XMMWORD[16+rbx]
  1033. lea rbx,[32+rbx]
  1034. movdqu xmm4,XMMWORD[rdi]
  1035. movdqu xmm5,XMMWORD[16+rdi]
  1036. lea rdi,[32+rdi]
  1037. movdqa XMMWORD[(-32)+rbx],xmm0
  1038. movdqa XMMWORD[(-16)+rbx],xmm0
  1039. pcmpeqd xmm0,xmm1
  1040. pand xmm2,xmm1
  1041. pand xmm3,xmm1
  1042. pand xmm4,xmm0
  1043. pand xmm5,xmm0
  1044. pxor xmm0,xmm0
  1045. por xmm4,xmm2
  1046. por xmm5,xmm3
  1047. movdqu XMMWORD[(-32)+rdi],xmm4
  1048. movdqu XMMWORD[(-16)+rdi],xmm5
  1049. sub rdx,32
  1050. jnz NEAR $L$mulx4x_cond_copy
  1051. mov QWORD[rbx],rdx ; rdx is zero here: clears the top word slot
  1052. mov rax,1 ; return value
  1053. mov r15,QWORD[((-48))+rsi]
  1054. mov r14,QWORD[((-40))+rsi]
  1055. mov r13,QWORD[((-32))+rsi]
  1056. mov r12,QWORD[((-24))+rsi]
  1057. mov rbp,QWORD[((-16))+rsi]
  1058. mov rbx,QWORD[((-8))+rsi]
  1059. lea rsp,[rsi]
  1060. $L$mulx4x_epilogue:
  1061. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1062. mov rsi,QWORD[16+rsp]
  1063. DB 0F3h,0C3h ;repret
  1064. $L$SEH_end_bn_mulx4x_mont:
  ; NUL-terminated ASCII identification string embedded in .text:
  ; "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  1065. DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
  1066. DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
  1067. DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83
  1068. DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
  1069. DB 115,108,46,111,114,103,62,0
  1070. ALIGN 16
  1071. EXTERN __imp_RtlVirtualUnwind
  1072. ALIGN 16
  ; ---------------------------------------------------------------------
  ; mul_handler -- exception handler referenced from the .xdata records of
  ; bn_mul_mont and bn_mul4x_mont.  It works out where the function's
  ; entry stack pointer is, then continues in $L$common_pop_regs /
  ; $L$common_seh_tail (both located inside sqr_handler).
  ;
  ; Uses r8 and r9 as base pointers to two records.
  ; NOTE(review): presumably the Win64 language-handler arguments
  ; CONTEXT* (r8) and DISPATCHER_CONTEXT* (r9), with the numeric offsets
  ; below matching fields of those structures -- verify against winnt.h.
  ; The handler data at [56+r9] holds two image-relative offsets:
  ; the function's "body" label and its "epilogue" label.
  ; ---------------------------------------------------------------------
  1073. mul_handler:
  1074. push rsi
  1075. push rdi
  1076. push rbx
  1077. push rbp
  1078. push r12
  1079. push r13
  1080. push r14
  1081. push r15
  1082. pushfq
  1083. sub rsp,64
  1084. mov rax,QWORD[120+r8] ; presumably context->Rax
  1085. mov rbx,QWORD[248+r8] ; presumably context->Rip
  1086. mov rsi,QWORD[8+r9] ; base added to the 32-bit offsets below
  1087. mov r11,QWORD[56+r9] ; pointer to the handler data
  ; Faulting address before the body label: still in the prologue, where
  ; rax already holds the entry rsp and nothing needs popping.
  1088. mov r10d,DWORD[r11]
  1089. lea r10,[r10*1+rsi]
  1090. cmp rbx,r10
  1091. jb NEAR $L$common_seh_tail
  1092. mov rax,QWORD[152+r8] ; presumably context->Rsp
  ; At or past the epilogue label: rsp has already been restored.
  1093. mov r10d,DWORD[4+r11]
  1094. lea r10,[r10*1+rsi]
  1095. cmp rbx,r10
  1096. jae NEAR $L$common_seh_tail
  ; Inside the body: fetch the entry rsp from the frame slot
  ; [8 + r10*8 + rsp], the slot written just before $L$mul_body and
  ; $L$mul4x_body using r9 as the index (so r10 is presumably context->R9).
  1097. mov r10,QWORD[192+r8]
  1098. mov rax,QWORD[8+r10*8+rax]
  1099. jmp NEAR $L$common_pop_regs
  1100. ALIGN 16
  ; ---------------------------------------------------------------------
  ; sqr_handler -- exception handler referenced from the .xdata records of
  ; bn_sqr8x_mont and bn_mulx4x_mont.  Also hosts the shared tail used by
  ; mul_handler ($L$common_pop_regs, $L$common_seh_tail).
  ;
  ; Uses r8 and r9 as base pointers to two records.
  ; NOTE(review): presumably CONTEXT* (r8) and DISPATCHER_CONTEXT* (r9);
  ; the numeric offsets are assumed to match those structures -- verify
  ; against winnt.h.
  ; The handler data at [56+r9] holds three image-relative offsets:
  ; the function's "prologue", "body" and "epilogue" labels.
  ; Returns eax = 1.
  ; ---------------------------------------------------------------------
  1101. sqr_handler:
  1102. push rsi
  1103. push rdi
  1104. push rbx
  1105. push rbp
  1106. push r12
  1107. push r13
  1108. push r14
  1109. push r15
  1110. pushfq
  1111. sub rsp,64
  1112. mov rax,QWORD[120+r8] ; presumably context->Rax
  1113. mov rbx,QWORD[248+r8] ; presumably context->Rip
  1114. mov rsi,QWORD[8+r9] ; base added to the 32-bit offsets below
  1115. mov r11,QWORD[56+r9] ; pointer to the handler data
  ; Before the prologue label: no registers pushed yet.
  1116. mov r10d,DWORD[r11]
  1117. lea r10,[r10*1+rsi]
  1118. cmp rbx,r10
  1119. jb NEAR $L$common_seh_tail
  ; Between prologue and body labels: registers are pushed and rax still
  ; holds the entry rsp, so they can be read relative to rax.
  1120. mov r10d,DWORD[4+r11]
  1121. lea r10,[r10*1+rsi]
  1122. cmp rbx,r10
  1123. jb NEAR $L$common_pop_regs
  1124. mov rax,QWORD[152+r8] ; presumably context->Rsp
  ; At or past the epilogue label: rsp has already been restored.
  1125. mov r10d,DWORD[8+r11]
  1126. lea r10,[r10*1+rsi]
  1127. cmp rbx,r10
  1128. jae NEAR $L$common_seh_tail
  ; Inside the body: both functions store the entry rsp at [40+rsp].
  1129. mov rax,QWORD[40+rax]
  ; rax = entry rsp of the interrupted function.  Read the six registers
  ; it pushed and write them into the record at r8.
  1130. $L$common_pop_regs:
  1131. mov rbx,QWORD[((-8))+rax]
  1132. mov rbp,QWORD[((-16))+rax]
  1133. mov r12,QWORD[((-24))+rax]
  1134. mov r13,QWORD[((-32))+rax]
  1135. mov r14,QWORD[((-40))+rax]
  1136. mov r15,QWORD[((-48))+rax]
  1137. mov QWORD[144+r8],rbx
  1138. mov QWORD[160+r8],rbp
  1139. mov QWORD[216+r8],r12
  1140. mov QWORD[224+r8],r13
  1141. mov QWORD[232+r8],r14
  1142. mov QWORD[240+r8],r15
  ; rdi/rsi were saved by the "WIN64 prologue" at [8+rsp] / [16+rsp]
  ; relative to the entry rsp; write them and rax back into the record.
  1143. $L$common_seh_tail:
  1144. mov rdi,QWORD[8+rax]
  1145. mov rsi,QWORD[16+rax]
  1146. mov QWORD[152+r8],rax
  1147. mov QWORD[168+r8],rsi
  1148. mov QWORD[176+r8],rdi
  ; Copy 154 qwords from the record at r8 to the buffer at [40+r9].
  ; The DD below is the byte sequence FC F3 48 A5 = cld ; rep movsq.
  1149. mov rdi,QWORD[40+r9]
  1150. mov rsi,r8
  1151. mov ecx,154
  1152. DD 0xa548f3fc
  ; Call RtlVirtualUnwind: four register arguments (rcx = 0, rdx, r8, r9)
  ; plus four stack arguments at [32..56+rsp], all taken from fields of
  ; the record at rsi (the original r9).
  ; NOTE(review): field meanings are not visible here -- see the
  ; RtlVirtualUnwind documentation.
  1153. mov rsi,r9
  1154. xor rcx,rcx
  1155. mov rdx,QWORD[8+rsi]
  1156. mov r8,QWORD[rsi]
  1157. mov r9,QWORD[16+rsi]
  1158. mov r10,QWORD[40+rsi]
  1159. lea r11,[56+rsi]
  1160. lea r12,[24+rsi]
  1161. mov QWORD[32+rsp],r10
  1162. mov QWORD[40+rsp],r11
  1163. mov QWORD[48+rsp],r12
  1164. mov QWORD[56+rsp],rcx
  1165. call QWORD[__imp_RtlVirtualUnwind]
  1166. mov eax,1 ; presumably ExceptionContinueSearch -- confirm
  1167. add rsp,64
  1168. popfq
  1169. pop r15
  1170. pop r14
  1171. pop r13
  1172. pop r12
  1173. pop rbp
  1174. pop rbx
  1175. pop rdi
  1176. pop rsi
  1177. DB 0F3h,0C3h ;repret
  ; Function table: one (begin, end, unwind-info) triple of image-relative
  ; addresses for each of the four functions above.
  1178. section .pdata rdata align=4
  1179. ALIGN 4
  1180. DD $L$SEH_begin_bn_mul_mont wrt ..imagebase
  1181. DD $L$SEH_end_bn_mul_mont wrt ..imagebase
  1182. DD $L$SEH_info_bn_mul_mont wrt ..imagebase
  1183. DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase
  1184. DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase
  1185. DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase
  1186. DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase
  1187. DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase
  1188. DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase
  1189. DD $L$SEH_begin_bn_mulx4x_mont wrt ..imagebase
  1190. DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase
  1191. DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase
  ; Unwind-info records referenced from .pdata.  Each record is a 4-byte
  ; header, the image-relative address of its handler, and the handler
  ; data (label offsets) that the handler reads through [56+r9].
  ; NOTE(review): header byte 9 presumably means version 1 with the
  ; exception-handler flag set, and the zeros mean no unwind codes --
  ; confirm against the UNWIND_INFO layout.
  1192. section .xdata rdata align=8
  1193. ALIGN 8
  1194. $L$SEH_info_bn_mul_mont:
  1195. DB 9,0,0,0
  1196. DD mul_handler wrt ..imagebase
  ; mul_handler data: body label, epilogue label.
  1197. DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
  1198. $L$SEH_info_bn_mul4x_mont:
  1199. DB 9,0,0,0
  1200. DD mul_handler wrt ..imagebase
  1201. DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
  1202. $L$SEH_info_bn_sqr8x_mont:
  1203. DB 9,0,0,0
  1204. DD sqr_handler wrt ..imagebase
  ; sqr_handler data: prologue label, body label, epilogue label.
  1205. DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
  1206. ALIGN 8
  1207. $L$SEH_info_bn_mulx4x_mont:
  1208. DB 9,0,0,0
  1209. DD sqr_handler wrt ..imagebase
  1210. DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase
  1211. ALIGN 8