x86_64-mont.S 20 KB


  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if defined(__has_feature)
  4. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  5. #define OPENSSL_NO_ASM
  6. #endif
  7. #endif
  8. #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
  9. #if defined(BORINGSSL_PREFIX)
  10. #include <boringssl_prefix_symbols_asm.h>
  11. #endif
  12. .text
  13. .extern OPENSSL_ia32cap_P
  14. .hidden OPENSSL_ia32cap_P
  // bn_mul_mont: entry point of the Montgomery multiplication code (see the
  // ident string at the end of this file). Register use as seen below:
  //   %rdi  destination words (written in .Lsub and .Lcopy)
  //   %rsi  first multiplicand words, called a[] in the comments below
  //   %rdx  second multiplicand words, b[] (copied to %r12)
  //   %rcx  words n[] that are multiplied by the per-pass factor m and
  //         subtracted in .Lsub (presumably the modulus -- confirm against
  //         the C prototype)
  //   %r8   pointer to one word that is loaded and used to derive m
  //         (presumably n0 -- confirm)
  //   %r9d  word count, called num below
  // Returns 1 in %rax. %rbx, %rbp and %r12-%r15 are saved and restored.
  15. .globl bn_mul_mont
  16. .hidden bn_mul_mont
  17. .type bn_mul_mont,@function
  18. .align 16
  19. bn_mul_mont:
  20. .cfi_startproc
  // Zero-extend the 32-bit count and keep the incoming %rsp in %rax.
  21. movl %r9d,%r9d
  22. movq %rsp,%rax
  23. .cfi_def_cfa_register %rax
  // Dispatch: a count that is not a multiple of 4, or is below 8, takes the
  // generic path at .Lmul_enter.
  24. testl $3,%r9d
  25. jnz .Lmul_enter
  26. cmpl $8,%r9d
  27. jb .Lmul_enter
  // Load the dword at OPENSSL_ia32cap_P+8 into %r11d; .Lmul4x_enter tests it.
  28. leaq OPENSSL_ia32cap_P(%rip),%r11
  29. movl 8(%r11),%r11d
  // Different operand pointers -> 4x multiply. Same pointer and a count that
  // is a multiple of 8 -> 8x squaring. Otherwise 4x multiply.
  30. cmpq %rsi,%rdx
  31. jne .Lmul4x_enter
  32. testl $7,%r9d
  33. jz .Lsqr8x_enter
  34. jmp .Lmul4x_enter
  35. .align 16
  36. .Lmul_enter:
  // Save callee-saved registers; the CFI offsets are relative to the CFA
  // that is tracked through %rax.
  37. pushq %rbx
  38. .cfi_offset %rbx,-16
  39. pushq %rbp
  40. .cfi_offset %rbp,-24
  41. pushq %r12
  42. .cfi_offset %r12,-32
  43. pushq %r13
  44. .cfi_offset %r13,-40
  45. pushq %r14
  46. .cfi_offset %r14,-48
  47. pushq %r15
  48. .cfi_offset %r15,-56
  // %r10 = %rsp - 8*(num+2), rounded down to a 1024-byte boundary: this is
  // the final stack pointer for the scratch area t[].
  49. negq %r9
  50. movq %rsp,%r11
  51. leaq -16(%rsp,%r9,8),%r10
  52. negq %r9
  53. andq $-1024,%r10
  // Lower %rsp towards %r10 in 4096-byte steps, reading one word at each
  // step so that every page in between is touched in order.
  54. subq %r10,%r11
  55. andq $-4096,%r11
  56. leaq (%r10,%r11,1),%rsp
  57. movq (%rsp),%r11
  58. cmpq %r10,%rsp
  59. ja .Lmul_page_walk
  60. jmp .Lmul_page_walk_done
  61. .align 16
  62. .Lmul_page_walk:
  63. leaq -4096(%rsp),%rsp
  64. movq (%rsp),%r11
  65. cmpq %r10,%rsp
  66. ja .Lmul_page_walk
  67. .Lmul_page_walk_done:
  // Stash the original %rsp in the slot just above the num+1 scratch words.
  68. movq %rax,8(%rsp,%r9,8)
  // DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8.
  69. .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
  70. .Lmul_body:
  // First pass, using b[0]. %r12 = b, %r8 = word loaded through the %r8
  // argument, %rbx = b[0], %r14 = outer index (0), %r15 = inner index.
  71. movq %rdx,%r12
  72. movq (%r8),%r8
  73. movq (%r12),%rbx
  74. movq (%rsi),%rax
  75. xorq %r14,%r14
  76. xorq %r15,%r15
  77. movq %r8,%rbp
  // %rdx:%rax = a[0]*b[0]; m = %rbp = low word of that product times %r8.
  78. mulq %rbx
  79. movq %rax,%r10
  80. movq (%rcx),%rax
  81. imulq %r10,%rbp
  82. movq %rdx,%r11
  // Add n[0]*m; only the carry out of the low word is kept (%r13).
  83. mulq %rbp
  84. addq %rax,%r10
  85. movq 8(%rsi),%rax
  86. adcq $0,%rdx
  87. movq %rdx,%r13
  88. leaq 1(%r15),%r15
  89. jmp .L1st_enter
  90. .align 16
  // Inner loop of the first pass: one word of a[]*b[0] and one word of
  // n[]*m per iteration, summed and stored into the scratch words.
  91. .L1st:
  92. addq %rax,%r13
  93. movq (%rsi,%r15,8),%rax
  94. adcq $0,%rdx
  95. addq %r11,%r13
  96. movq %r10,%r11
  97. adcq $0,%rdx
  98. movq %r13,-16(%rsp,%r15,8)
  99. movq %rdx,%r13
  100. .L1st_enter:
  101. mulq %rbx
  102. addq %rax,%r11
  103. movq (%rcx,%r15,8),%rax
  104. adcq $0,%rdx
  105. leaq 1(%r15),%r15
  106. movq %rdx,%r10
  107. mulq %rbp
  108. cmpq %r9,%r15
  109. jne .L1st
  // Tail of the first pass: fold in the last products and write the two top
  // scratch words t[num-1] and t[num].
  110. addq %rax,%r13
  111. movq (%rsi),%rax
  112. adcq $0,%rdx
  113. addq %r11,%r13
  114. adcq $0,%rdx
  115. movq %r13,-16(%rsp,%r15,8)
  116. movq %rdx,%r13
  117. movq %r10,%r11
  118. xorq %rdx,%rdx
  119. addq %r11,%r13
  120. adcq $0,%rdx
  121. movq %r13,-8(%rsp,%r9,8)
  122. movq %rdx,(%rsp,%r9,8)
  123. leaq 1(%r14),%r14
  124. jmp .Louter
  125. .align 16
  // Remaining passes: for each further word b[i] (i = %r14) accumulate
  // a[]*b[i] + n[]*m on top of the existing scratch words.
  126. .Louter:
  127. movq (%r12,%r14,8),%rbx
  128. xorq %r15,%r15
  129. movq %r8,%rbp
  130. movq (%rsp),%r10
  131. mulq %rbx
  132. addq %rax,%r10
  133. movq (%rcx),%rax
  134. adcq $0,%rdx
  // m for this pass = low word of (t[0] + a[0]*b[i]) times %r8.
  135. imulq %r10,%rbp
  136. movq %rdx,%r11
  137. mulq %rbp
  138. addq %rax,%r10
  139. movq 8(%rsi),%rax
  140. adcq $0,%rdx
  141. movq 8(%rsp),%r10
  142. movq %rdx,%r13
  143. leaq 1(%r15),%r15
  144. jmp .Linner_enter
  145. .align 16
  146. .Linner:
  147. addq %rax,%r13
  148. movq (%rsi,%r15,8),%rax
  149. adcq $0,%rdx
  150. addq %r10,%r13
  151. movq (%rsp,%r15,8),%r10
  152. adcq $0,%rdx
  153. movq %r13,-16(%rsp,%r15,8)
  154. movq %rdx,%r13
  155. .Linner_enter:
  156. mulq %rbx
  157. addq %rax,%r11
  158. movq (%rcx,%r15,8),%rax
  159. adcq $0,%rdx
  160. addq %r11,%r10
  161. movq %rdx,%r11
  162. adcq $0,%r11
  163. leaq 1(%r15),%r15
  164. mulq %rbp
  165. cmpq %r9,%r15
  166. jne .Linner
  // Tail of the pass: also adds the previous top word t[num] (loaded into
  // %r10) before writing t[num-1] and t[num].
  167. addq %rax,%r13
  168. movq (%rsi),%rax
  169. adcq $0,%rdx
  170. addq %r10,%r13
  171. movq (%rsp,%r15,8),%r10
  172. adcq $0,%rdx
  173. movq %r13,-16(%rsp,%r15,8)
  174. movq %rdx,%r13
  175. xorq %rdx,%rdx
  176. addq %r11,%r13
  177. adcq $0,%rdx
  178. addq %r10,%r13
  179. adcq $0,%rdx
  180. movq %r13,-8(%rsp,%r9,8)
  181. movq %rdx,(%rsp,%r9,8)
  182. leaq 1(%r14),%r14
  183. cmpq %r9,%r14
  184. jb .Louter
  // Final subtraction: (%rdi)[i] = t[i] - n[i] with borrow. The xorq clears
  // CF for the first sbbq; movq, leaq and decq leave CF untouched, so the
  // borrow chain survives the loop control.
  185. xorq %r14,%r14
  186. movq (%rsp),%rax
  187. movq %r9,%r15
  188. .align 16
  189. .Lsub: sbbq (%rcx,%r14,8),%rax
  190. movq %rax,(%rdi,%r14,8)
  191. movq 8(%rsp,%r14,8),%rax
  192. leaq 1(%r14),%r14
  193. decq %r15
  194. jnz .Lsub
  // %rax = t[num] minus the final borrow, used as a select mask below
  // (expected to be 0 or all-ones -- TODO confirm); %rbx = its complement.
  195. sbbq $0,%rax
  196. movq $-1,%rbx
  197. xorq %rax,%rbx
  198. xorq %r14,%r14
  199. movq %r9,%r15
  // Branch-free select per word: (%rdi)[i] = (difference & %rbx) |
  // (t[i] & %rax). Each scratch word is overwritten with %r9 once read.
  200. .Lcopy:
  201. movq (%rdi,%r14,8),%rcx
  202. movq (%rsp,%r14,8),%rdx
  203. andq %rbx,%rcx
  204. andq %rax,%rdx
  205. movq %r9,(%rsp,%r14,8)
  206. orq %rcx,%rdx
  207. movq %rdx,(%rdi,%r14,8)
  208. leaq 1(%r14),%r14
  209. subq $1,%r15
  210. jnz .Lcopy
  // Epilogue: reload the original %rsp saved above the scratch area, set the
  // return value to 1 and restore the callee-saved registers from below it.
  211. movq 8(%rsp,%r9,8),%rsi
  212. .cfi_def_cfa %rsi,8
  213. movq $1,%rax
  214. movq -48(%rsi),%r15
  215. .cfi_restore %r15
  216. movq -40(%rsi),%r14
  217. .cfi_restore %r14
  218. movq -32(%rsi),%r13
  219. .cfi_restore %r13
  220. movq -24(%rsi),%r12
  221. .cfi_restore %r12
  222. movq -16(%rsi),%rbp
  223. .cfi_restore %rbp
  224. movq -8(%rsi),%rbx
  225. .cfi_restore %rbx
  226. leaq (%rsi),%rsp
  227. .cfi_def_cfa_register %rsp
  228. .Lmul_epilogue:
  // 0xf3,0xc3 encodes "rep ret".
  229. .byte 0xf3,0xc3
  230. .cfi_endproc
  231. .size bn_mul_mont,.-bn_mul_mont
  // bn_mul4x_mont: multiply path that handles four words per loop iteration.
  // It takes the same register arguments as bn_mul_mont. No .globl directive
  // for this symbol appears in this view; bn_mul_mont jumps to .Lmul4x_enter
  // with %rax = entry %rsp and %r11d = dword at OPENSSL_ia32cap_P+8.
  // NOTE(review): entering at the bn_mul4x_mont label itself would reach the
  // feature test below without %r11d having been loaded by this function.
  232. .type bn_mul4x_mont,@function
  233. .align 16
  234. bn_mul4x_mont:
  235. .cfi_startproc
  236. movl %r9d,%r9d
  237. movq %rsp,%rax
  238. .cfi_def_cfa_register %rax
  239. .Lmul4x_enter:
  // If both bits of mask 0x80100 are set in %r11d, use the mulx/adcx/adox
  // implementation instead (presumably the BMI2 and ADX feature bits --
  // confirm against the OPENSSL_ia32cap_P layout).
  240. andl $0x80100,%r11d
  241. cmpl $0x80100,%r11d
  242. je .Lmulx4x_enter
  243. pushq %rbx
  244. .cfi_offset %rbx,-16
  245. pushq %rbp
  246. .cfi_offset %rbp,-24
  247. pushq %r12
  248. .cfi_offset %r12,-32
  249. pushq %r13
  250. .cfi_offset %r13,-40
  251. pushq %r14
  252. .cfi_offset %r14,-48
  253. pushq %r15
  254. .cfi_offset %r15,-56
  // %r10 = %rsp - 8*(num+4), rounded down to a 1024-byte boundary; then
  // lower %rsp to it in 4096-byte steps, reading a word at every step.
  255. negq %r9
  256. movq %rsp,%r11
  257. leaq -32(%rsp,%r9,8),%r10
  258. negq %r9
  259. andq $-1024,%r10
  260. subq %r10,%r11
  261. andq $-4096,%r11
  262. leaq (%r10,%r11,1),%rsp
  263. movq (%rsp),%r11
  264. cmpq %r10,%rsp
  265. ja .Lmul4x_page_walk
  266. jmp .Lmul4x_page_walk_done
  267. .Lmul4x_page_walk:
  268. leaq -4096(%rsp),%rsp
  269. movq (%rsp),%r11
  270. cmpq %r10,%rsp
  271. ja .Lmul4x_page_walk
  272. .Lmul4x_page_walk_done:
  // Save the original %rsp above the scratch words.
  273. movq %rax,8(%rsp,%r9,8)
  // DW_CFA_def_cfa_expression: CFA = *(%rsp + 8 + %r9*8) + 8.
  274. .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
  275. .Lmul4x_body:
  // Save the destination pointer; %rdi is reused as an accumulator below.
  276. movq %rdi,16(%rsp,%r9,8)
  277. movq %rdx,%r12
  278. movq (%r8),%r8
  279. movq (%r12),%rbx
  280. movq (%rsi),%rax
  281. xorq %r14,%r14
  282. xorq %r15,%r15
  // First pass with b[0]: m = %rbp = low word of a[0]*b[0] times %r8.
  283. movq %r8,%rbp
  284. mulq %rbx
  285. movq %rax,%r10
  286. movq (%rcx),%rax
  287. imulq %r10,%rbp
  288. movq %rdx,%r11
  289. mulq %rbp
  290. addq %rax,%r10
  291. movq 8(%rsi),%rax
  292. adcq $0,%rdx
  293. movq %rdx,%rdi
  294. mulq %rbx
  295. addq %rax,%r11
  296. movq 8(%rcx),%rax
  297. adcq $0,%rdx
  298. movq %rdx,%r10
  299. mulq %rbp
  300. addq %rax,%rdi
  301. movq 16(%rsi),%rax
  302. adcq $0,%rdx
  303. addq %r11,%rdi
  304. leaq 4(%r15),%r15
  305. adcq $0,%rdx
  306. movq %rdi,(%rsp)
  307. movq %rdx,%r13
  308. jmp .L1st4x
  309. .align 16
  // First-pass inner loop, unrolled four times: %r15 advances by 4 per
  // iteration and four scratch words are written.
  310. .L1st4x:
  311. mulq %rbx
  312. addq %rax,%r10
  313. movq -16(%rcx,%r15,8),%rax
  314. adcq $0,%rdx
  315. movq %rdx,%r11
  316. mulq %rbp
  317. addq %rax,%r13
  318. movq -8(%rsi,%r15,8),%rax
  319. adcq $0,%rdx
  320. addq %r10,%r13
  321. adcq $0,%rdx
  322. movq %r13,-24(%rsp,%r15,8)
  323. movq %rdx,%rdi
  324. mulq %rbx
  325. addq %rax,%r11
  326. movq -8(%rcx,%r15,8),%rax
  327. adcq $0,%rdx
  328. movq %rdx,%r10
  329. mulq %rbp
  330. addq %rax,%rdi
  331. movq (%rsi,%r15,8),%rax
  332. adcq $0,%rdx
  333. addq %r11,%rdi
  334. adcq $0,%rdx
  335. movq %rdi,-16(%rsp,%r15,8)
  336. movq %rdx,%r13
  337. mulq %rbx
  338. addq %rax,%r10
  339. movq (%rcx,%r15,8),%rax
  340. adcq $0,%rdx
  341. movq %rdx,%r11
  342. mulq %rbp
  343. addq %rax,%r13
  344. movq 8(%rsi,%r15,8),%rax
  345. adcq $0,%rdx
  346. addq %r10,%r13
  347. adcq $0,%rdx
  348. movq %r13,-8(%rsp,%r15,8)
  349. movq %rdx,%rdi
  350. mulq %rbx
  351. addq %rax,%r11
  352. movq 8(%rcx,%r15,8),%rax
  353. adcq $0,%rdx
  354. leaq 4(%r15),%r15
  355. movq %rdx,%r10
  356. mulq %rbp
  357. addq %rax,%rdi
  358. movq -16(%rsi,%r15,8),%rax
  359. adcq $0,%rdx
  360. addq %r11,%rdi
  361. adcq $0,%rdx
  362. movq %rdi,-32(%rsp,%r15,8)
  363. movq %rdx,%r13
  364. cmpq %r9,%r15
  365. jb .L1st4x
  // Tail of the first pass: the last two word pairs, then the two top
  // scratch words; %rax is reloaded with a[0] for the next pass.
  366. mulq %rbx
  367. addq %rax,%r10
  368. movq -16(%rcx,%r15,8),%rax
  369. adcq $0,%rdx
  370. movq %rdx,%r11
  371. mulq %rbp
  372. addq %rax,%r13
  373. movq -8(%rsi,%r15,8),%rax
  374. adcq $0,%rdx
  375. addq %r10,%r13
  376. adcq $0,%rdx
  377. movq %r13,-24(%rsp,%r15,8)
  378. movq %rdx,%rdi
  379. mulq %rbx
  380. addq %rax,%r11
  381. movq -8(%rcx,%r15,8),%rax
  382. adcq $0,%rdx
  383. movq %rdx,%r10
  384. mulq %rbp
  385. addq %rax,%rdi
  386. movq (%rsi),%rax
  387. adcq $0,%rdx
  388. addq %r11,%rdi
  389. adcq $0,%rdx
  390. movq %rdi,-16(%rsp,%r15,8)
  391. movq %rdx,%r13
  392. xorq %rdi,%rdi
  393. addq %r10,%r13
  394. adcq $0,%rdi
  395. movq %r13,-8(%rsp,%r15,8)
  396. movq %rdi,(%rsp,%r15,8)
  397. leaq 1(%r14),%r14
  398. .align 4
  // Remaining passes: for each further word b[i] (i = %r14) accumulate
  // a[]*b[i] + n[]*m on top of the scratch words already stored.
  399. .Louter4x:
  400. movq (%r12,%r14,8),%rbx
  401. xorq %r15,%r15
  402. movq (%rsp),%r10
  403. movq %r8,%rbp
  404. mulq %rbx
  405. addq %rax,%r10
  406. movq (%rcx),%rax
  407. adcq $0,%rdx
  408. imulq %r10,%rbp
  409. movq %rdx,%r11
  410. mulq %rbp
  411. addq %rax,%r10
  412. movq 8(%rsi),%rax
  413. adcq $0,%rdx
  414. movq %rdx,%rdi
  415. mulq %rbx
  416. addq %rax,%r11
  417. movq 8(%rcx),%rax
  418. adcq $0,%rdx
  419. addq 8(%rsp),%r11
  420. adcq $0,%rdx
  421. movq %rdx,%r10
  422. mulq %rbp
  423. addq %rax,%rdi
  424. movq 16(%rsi),%rax
  425. adcq $0,%rdx
  426. addq %r11,%rdi
  427. leaq 4(%r15),%r15
  428. adcq $0,%rdx
  429. movq %rdi,(%rsp)
  430. movq %rdx,%r13
  431. jmp .Linner4x
  432. .align 16
  // Inner loop, unrolled four times; same shape as .L1st4x plus the add of
  // the existing scratch word after each multiply by %rbx.
  433. .Linner4x:
  434. mulq %rbx
  435. addq %rax,%r10
  436. movq -16(%rcx,%r15,8),%rax
  437. adcq $0,%rdx
  438. addq -16(%rsp,%r15,8),%r10
  439. adcq $0,%rdx
  440. movq %rdx,%r11
  441. mulq %rbp
  442. addq %rax,%r13
  443. movq -8(%rsi,%r15,8),%rax
  444. adcq $0,%rdx
  445. addq %r10,%r13
  446. adcq $0,%rdx
  447. movq %r13,-24(%rsp,%r15,8)
  448. movq %rdx,%rdi
  449. mulq %rbx
  450. addq %rax,%r11
  451. movq -8(%rcx,%r15,8),%rax
  452. adcq $0,%rdx
  453. addq -8(%rsp,%r15,8),%r11
  454. adcq $0,%rdx
  455. movq %rdx,%r10
  456. mulq %rbp
  457. addq %rax,%rdi
  458. movq (%rsi,%r15,8),%rax
  459. adcq $0,%rdx
  460. addq %r11,%rdi
  461. adcq $0,%rdx
  462. movq %rdi,-16(%rsp,%r15,8)
  463. movq %rdx,%r13
  464. mulq %rbx
  465. addq %rax,%r10
  466. movq (%rcx,%r15,8),%rax
  467. adcq $0,%rdx
  468. addq (%rsp,%r15,8),%r10
  469. adcq $0,%rdx
  470. movq %rdx,%r11
  471. mulq %rbp
  472. addq %rax,%r13
  473. movq 8(%rsi,%r15,8),%rax
  474. adcq $0,%rdx
  475. addq %r10,%r13
  476. adcq $0,%rdx
  477. movq %r13,-8(%rsp,%r15,8)
  478. movq %rdx,%rdi
  479. mulq %rbx
  480. addq %rax,%r11
  481. movq 8(%rcx,%r15,8),%rax
  482. adcq $0,%rdx
  483. addq 8(%rsp,%r15,8),%r11
  484. adcq $0,%rdx
  485. leaq 4(%r15),%r15
  486. movq %rdx,%r10
  487. mulq %rbp
  488. addq %rax,%rdi
  489. movq -16(%rsi,%r15,8),%rax
  490. adcq $0,%rdx
  491. addq %r11,%rdi
  492. adcq $0,%rdx
  493. movq %rdi,-32(%rsp,%r15,8)
  494. movq %rdx,%r13
  495. cmpq %r9,%r15
  496. jb .Linner4x
  // Tail of the pass; the outer index %r14 is advanced here, and the previous
  // top word at (%rsp,%r9,8) is added before the two top words are rewritten.
  497. mulq %rbx
  498. addq %rax,%r10
  499. movq -16(%rcx,%r15,8),%rax
  500. adcq $0,%rdx
  501. addq -16(%rsp,%r15,8),%r10
  502. adcq $0,%rdx
  503. movq %rdx,%r11
  504. mulq %rbp
  505. addq %rax,%r13
  506. movq -8(%rsi,%r15,8),%rax
  507. adcq $0,%rdx
  508. addq %r10,%r13
  509. adcq $0,%rdx
  510. movq %r13,-24(%rsp,%r15,8)
  511. movq %rdx,%rdi
  512. mulq %rbx
  513. addq %rax,%r11
  514. movq -8(%rcx,%r15,8),%rax
  515. adcq $0,%rdx
  516. addq -8(%rsp,%r15,8),%r11
  517. adcq $0,%rdx
  518. leaq 1(%r14),%r14
  519. movq %rdx,%r10
  520. mulq %rbp
  521. addq %rax,%rdi
  522. movq (%rsi),%rax
  523. adcq $0,%rdx
  524. addq %r11,%rdi
  525. adcq $0,%rdx
  526. movq %rdi,-16(%rsp,%r15,8)
  527. movq %rdx,%r13
  528. xorq %rdi,%rdi
  529. addq %r10,%r13
  530. adcq $0,%rdi
  531. addq (%rsp,%r9,8),%r13
  532. adcq $0,%rdi
  533. movq %r13,-8(%rsp,%r15,8)
  534. movq %rdi,(%rsp,%r15,8)
  535. cmpq %r9,%r14
  536. jb .Louter4x
  // Final subtraction, four words per iteration: (%rdi)[] = t[] - n[].
  // %rdi is reloaded with the saved destination pointer and %r15 =
  // (num-4)/4 loop iterations. The borrow chain starts at subq and is
  // carried by the sbbq instructions; the mov/lea/dec between them do not
  // modify CF.
  537. movq 16(%rsp,%r9,8),%rdi
  538. leaq -4(%r9),%r15
  539. movq 0(%rsp),%rax
  540. movq 8(%rsp),%rdx
  541. shrq $2,%r15
  542. leaq (%rsp),%rsi
  543. xorq %r14,%r14
  544. subq 0(%rcx),%rax
  545. movq 16(%rsi),%rbx
  546. movq 24(%rsi),%rbp
  547. sbbq 8(%rcx),%rdx
  548. .Lsub4x:
  549. movq %rax,0(%rdi,%r14,8)
  550. movq %rdx,8(%rdi,%r14,8)
  551. sbbq 16(%rcx,%r14,8),%rbx
  552. movq 32(%rsi,%r14,8),%rax
  553. movq 40(%rsi,%r14,8),%rdx
  554. sbbq 24(%rcx,%r14,8),%rbp
  555. movq %rbx,16(%rdi,%r14,8)
  556. movq %rbp,24(%rdi,%r14,8)
  557. sbbq 32(%rcx,%r14,8),%rax
  558. movq 48(%rsi,%r14,8),%rbx
  559. movq 56(%rsi,%r14,8),%rbp
  560. sbbq 40(%rcx,%r14,8),%rdx
  561. leaq 4(%r14),%r14
  562. decq %r15
  563. jnz .Lsub4x
  // Last four words, then %rax = top scratch word minus the final borrow
  // (the select mask; expected to be 0 or all-ones -- TODO confirm).
  564. movq %rax,0(%rdi,%r14,8)
  565. movq 32(%rsi,%r14,8),%rax
  566. sbbq 16(%rcx,%r14,8),%rbx
  567. movq %rdx,8(%rdi,%r14,8)
  568. sbbq 24(%rcx,%r14,8),%rbp
  569. movq %rbx,16(%rdi,%r14,8)
  570. sbbq $0,%rax
  571. movq %rbp,24(%rdi,%r14,8)
  // Build the SSE2 masks: xmm0 = 0, xmm4 = low dword of %rax broadcast to
  // all lanes, xmm5 = bitwise complement of xmm4.
  572. pxor %xmm0,%xmm0
  // The .byte sequence encodes "movq %rax,%xmm4".
  573. .byte 102,72,15,110,224
  574. pcmpeqd %xmm5,%xmm5
  575. pshufd $0,%xmm4,%xmm4
  576. movq %r9,%r15
  577. pxor %xmm4,%xmm5
  578. shrq $2,%r15
  579. xorl %eax,%eax
  580. jmp .Lcopy4x
  581. .align 16
  // Branch-free select, 32 bytes per iteration (%rax = byte offset, %r15 =
  // num/4 iterations): (%rdi) = (scratch & xmm4) | ((%rdi) & xmm5). The
  // scratch words are overwritten with zero as they are consumed.
  582. .Lcopy4x:
  583. movdqa (%rsp,%rax,1),%xmm1
  584. movdqu (%rdi,%rax,1),%xmm2
  585. pand %xmm4,%xmm1
  586. pand %xmm5,%xmm2
  587. movdqa 16(%rsp,%rax,1),%xmm3
  588. movdqa %xmm0,(%rsp,%rax,1)
  589. por %xmm2,%xmm1
  590. movdqu 16(%rdi,%rax,1),%xmm2
  591. movdqu %xmm1,(%rdi,%rax,1)
  592. pand %xmm4,%xmm3
  593. pand %xmm5,%xmm2
  594. movdqa %xmm0,16(%rsp,%rax,1)
  595. por %xmm2,%xmm3
  596. movdqu %xmm3,16(%rdi,%rax,1)
  597. leaq 32(%rax),%rax
  598. decq %r15
  599. jnz .Lcopy4x
  // Epilogue: reload the original %rsp, return 1, restore callee-saved
  // registers from below the saved stack pointer.
  600. movq 8(%rsp,%r9,8),%rsi
  601. .cfi_def_cfa %rsi, 8
  602. movq $1,%rax
  603. movq -48(%rsi),%r15
  604. .cfi_restore %r15
  605. movq -40(%rsi),%r14
  606. .cfi_restore %r14
  607. movq -32(%rsi),%r13
  608. .cfi_restore %r13
  609. movq -24(%rsi),%r12
  610. .cfi_restore %r12
  611. movq -16(%rsi),%rbp
  612. .cfi_restore %rbp
  613. movq -8(%rsi),%rbx
  614. .cfi_restore %rbx
  615. leaq (%rsi),%rsp
  616. .cfi_def_cfa_register %rsp
  617. .Lmul4x_epilogue:
  // 0xf3,0xc3 encodes "rep ret".
  618. .byte 0xf3,0xc3
  619. .cfi_endproc
  620. .size bn_mul4x_mont,.-bn_mul4x_mont
  621. .extern bn_sqrx8x_internal
  622. .hidden bn_sqrx8x_internal
  623. .extern bn_sqr8x_internal
  624. .hidden bn_sqr8x_internal
  // bn_sqr8x_mont: squaring path. bn_mul_mont jumps to .Lsqr8x_enter when
  // both operand pointers are equal and the word count is a multiple of 8,
  // with %rax = entry %rsp. The arithmetic itself is done by the external
  // routines bn_sqr8x_internal / bn_sqrx8x_internal; what they leave in
  // registers is not visible in this file, so comments after the calls only
  // describe what the code here does with those registers.
  625. .type bn_sqr8x_mont,@function
  626. .align 32
  627. bn_sqr8x_mont:
  628. .cfi_startproc
  629. movq %rsp,%rax
  630. .cfi_def_cfa_register %rax
  631. .Lsqr8x_enter:
  632. pushq %rbx
  633. .cfi_offset %rbx,-16
  634. pushq %rbp
  635. .cfi_offset %rbp,-24
  636. pushq %r12
  637. .cfi_offset %r12,-32
  638. pushq %r13
  639. .cfi_offset %r13,-40
  640. pushq %r14
  641. .cfi_offset %r14,-48
  642. pushq %r15
  643. .cfi_offset %r15,-56
  644. .Lsqr8x_prologue:
  // %r10 = num*32, %r9 = -(num*8).
  645. movl %r9d,%r10d
  646. shll $3,%r9d
  647. shlq $3+2,%r10
  648. negq %r9
  // %r11 = ((%rsp - 64 - 16*num) - %rsi) & 4095: the distance, modulo a
  // 4096-byte page, between a candidate frame and the input at %rsi. The
  // frame base %rbp is chosen from it on one of the two paths below.
  // NOTE(review): presumably this controls how the frame and the input alias
  // within a page -- confirm against the generating Perl script.
  649. leaq -64(%rsp,%r9,2),%r11
  650. movq %rsp,%rbp
  651. movq (%r8),%r8
  652. subq %rsi,%r11
  653. andq $4095,%r11
  654. cmpq %r11,%r10
  655. jb .Lsqr8x_sp_alt
  656. subq %r11,%rbp
  657. leaq -64(%rbp,%r9,2),%rbp
  658. jmp .Lsqr8x_sp_done
  659. .align 32
  660. .Lsqr8x_sp_alt:
  // %r11 -= 4096 - 64 - 16*num, clamped to 0 when the subtraction borrows.
  661. leaq 4096-64(,%r9,2),%r10
  662. leaq -64(%rbp,%r9,2),%rbp
  663. subq %r10,%r11
  664. movq $0,%r10
  665. cmovcq %r10,%r11
  666. subq %r11,%rbp
  667. .Lsqr8x_sp_done:
  // Align the frame to 64 bytes, then lower %rsp to it in 4096-byte steps,
  // reading a word at every step.
  668. andq $-64,%rbp
  669. movq %rsp,%r11
  670. subq %rbp,%r11
  671. andq $-4096,%r11
  672. leaq (%r11,%rbp,1),%rsp
  673. movq (%rsp),%r10
  674. cmpq %rbp,%rsp
  675. ja .Lsqr8x_page_walk
  676. jmp .Lsqr8x_page_walk_done
  677. .align 16
  678. .Lsqr8x_page_walk:
  679. leaq -4096(%rsp),%rsp
  680. movq (%rsp),%r10
  681. cmpq %rbp,%rsp
  682. ja .Lsqr8x_page_walk
  683. .Lsqr8x_page_walk_done:
  // %r10 = -(num*8), %r9 = num*8. Frame slots: 32(%rsp) = word loaded
  // through the %r8 argument, 40(%rsp) = original %rsp.
  684. movq %r9,%r10
  685. negq %r9
  686. movq %r8,32(%rsp)
  687. movq %rax,40(%rsp)
  // DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8.
  688. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
  689. .Lsqr8x_body:
  // The .byte sequences encode, in order: movq %rcx,%xmm2; movq %rdi,%xmm1;
  // movq %r10,%xmm3. They park those values in XMM registers across the
  // call below.
  690. .byte 102,72,15,110,209
  691. pxor %xmm0,%xmm0
  692. .byte 102,72,15,110,207
  693. .byte 102,73,15,110,218
  // Pick the internal routine: both bits of mask 0x80100 set in the dword
  // at OPENSSL_ia32cap_P+8 selects bn_sqrx8x_internal, otherwise
  // bn_sqr8x_internal.
  694. leaq OPENSSL_ia32cap_P(%rip),%rax
  695. movl 8(%rax),%eax
  696. andl $0x80100,%eax
  697. cmpl $0x80100,%eax
  698. jne .Lsqr8x_nox
  699. call bn_sqrx8x_internal
  // Set up the subtract loop from the registers left by the callee:
  // %rbx = %r8 + %rcx, %r9 = %rdx = %rcx, %rcx >>= 5 (arithmetic). The
  // .byte sequence encodes "movq %xmm1,%rdi", restoring the destination.
  700. leaq (%r8,%rcx,1),%rbx
  701. movq %rcx,%r9
  702. movq %rcx,%rdx
  703. .byte 102,72,15,126,207
  704. sarq $3+2,%rcx
  705. jmp .Lsqr8x_sub
  706. .align 32
  707. .Lsqr8x_nox:
  708. call bn_sqr8x_internal
  // Same set-up from the registers left by this callee: %rbx = %rdi + %r9,
  // %rcx = %rdx = %r9, %rcx >>= 5 (arithmetic), then "movq %xmm1,%rdi".
  709. leaq (%rdi,%r9,1),%rbx
  710. movq %r9,%rcx
  711. movq %r9,%rdx
  712. .byte 102,72,15,126,207
  713. sarq $3+2,%rcx
  714. jmp .Lsqr8x_sub
  715. .align 32
  // Subtract loop, four words per iteration: (%rdi) = (%rbx) - (%rbp) with
  // borrow. %rcx counts up to zero. CF entering the first sbbq is the last
  // bit shifted out by the sarq above; mov/lea/inc leave CF untouched, so
  // the borrow carries across iterations. %rbp is as left by the callee
  // (presumably pointing at the modulus -- confirm).
  716. .Lsqr8x_sub:
  717. movq 0(%rbx),%r12
  718. movq 8(%rbx),%r13
  719. movq 16(%rbx),%r14
  720. movq 24(%rbx),%r15
  721. leaq 32(%rbx),%rbx
  722. sbbq 0(%rbp),%r12
  723. sbbq 8(%rbp),%r13
  724. sbbq 16(%rbp),%r14
  725. sbbq 24(%rbp),%r15
  726. leaq 32(%rbp),%rbp
  727. movq %r12,0(%rdi)
  728. movq %r13,8(%rdi)
  729. movq %r14,16(%rdi)
  730. movq %r15,24(%rdi)
  731. leaq 32(%rdi),%rdi
  732. incq %rcx
  733. jnz .Lsqr8x_sub
  // %rax (as left by the callee) minus the final borrow becomes the select
  // mask. Adding %r9 rewinds %rbx and %rdi; %r9 holds a negative byte count
  // here, since the copy loop below counts it up to zero.
  734. sbbq $0,%rax
  735. leaq (%rbx,%r9,1),%rbx
  736. leaq (%rdi,%r9,1),%rdi
  // The .byte sequence encodes "movq %rax,%xmm1"; pshufd then broadcasts its
  // low dword to all lanes of xmm1. xmm0 = 0.
  737. .byte 102,72,15,110,200
  738. pxor %xmm0,%xmm0
  739. pshufd $0,%xmm1,%xmm1
  // %rsi = original %rsp saved in the frame.
  740. movq 40(%rsp),%rsi
  741. .cfi_def_cfa %rsi,8
  742. jmp .Lsqr8x_cond_copy
  743. .align 32
  // Branch-free select, 32 bytes per iteration:
  //   (%rdi) = ((%rbx) & xmm1) | ((%rdi) & mask2)
  // where mask2 = pcmpeqd(xmm1, 0), i.e. all-ones in each lane where xmm1
  // is zero. The 32 bytes just read from (%rbx), and the 32 bytes at the
  // same position plus %rdx, are overwritten with zero.
  744. .Lsqr8x_cond_copy:
  745. movdqa 0(%rbx),%xmm2
  746. movdqa 16(%rbx),%xmm3
  747. leaq 32(%rbx),%rbx
  748. movdqu 0(%rdi),%xmm4
  749. movdqu 16(%rdi),%xmm5
  750. leaq 32(%rdi),%rdi
  751. movdqa %xmm0,-32(%rbx)
  752. movdqa %xmm0,-16(%rbx)
  753. movdqa %xmm0,-32(%rbx,%rdx,1)
  754. movdqa %xmm0,-16(%rbx,%rdx,1)
  755. pcmpeqd %xmm1,%xmm0
  756. pand %xmm1,%xmm2
  757. pand %xmm1,%xmm3
  758. pand %xmm0,%xmm4
  759. pand %xmm0,%xmm5
  760. pxor %xmm0,%xmm0
  761. por %xmm2,%xmm4
  762. por %xmm3,%xmm5
  763. movdqu %xmm4,-32(%rdi)
  764. movdqu %xmm5,-16(%rdi)
  765. addq $32,%r9
  766. jnz .Lsqr8x_cond_copy
  // Epilogue: return 1 and restore callee-saved registers from below the
  // saved stack pointer in %rsi.
  767. movq $1,%rax
  768. movq -48(%rsi),%r15
  769. .cfi_restore %r15
  770. movq -40(%rsi),%r14
  771. .cfi_restore %r14
  772. movq -32(%rsi),%r13
  773. .cfi_restore %r13
  774. movq -24(%rsi),%r12
  775. .cfi_restore %r12
  776. movq -16(%rsi),%rbp
  777. .cfi_restore %rbp
  778. movq -8(%rsi),%rbx
  779. .cfi_restore %rbx
  780. leaq (%rsi),%rsp
  781. .cfi_def_cfa_register %rsp
  782. .Lsqr8x_epilogue:
  // 0xf3,0xc3 encodes "rep ret".
  783. .byte 0xf3,0xc3
  784. .cfi_endproc
  785. .size bn_sqr8x_mont,.-bn_sqr8x_mont
  // bn_mulx4x_mont: multiply path built on mulx/adcx/adox, four words per
  // loop iteration. It is reached through .Lmulx4x_enter from the feature
  // test at .Lmul4x_enter, with %rax = entry %rsp and the same register
  // arguments as bn_mul_mont. Returns 1 in %rax.
  786. .type bn_mulx4x_mont,@function
  787. .align 32
  788. bn_mulx4x_mont:
  789. .cfi_startproc
  790. movq %rsp,%rax
  791. .cfi_def_cfa_register %rax
  792. .Lmulx4x_enter:
  793. pushq %rbx
  794. .cfi_offset %rbx,-16
  795. pushq %rbp
  796. .cfi_offset %rbp,-24
  797. pushq %r12
  798. .cfi_offset %r12,-32
  799. pushq %r13
  800. .cfi_offset %r13,-40
  801. pushq %r14
  802. .cfi_offset %r14,-48
  803. pushq %r15
  804. .cfi_offset %r15,-56
  805. .Lmulx4x_prologue:
  // %r9 = num*8 (the 32-bit shift also zero-extends), %r10 = -(num*8).
  // Frame base %rbp = %rsp - 72 - num*8, rounded down to 128 bytes.
  806. shll $3,%r9d
  807. xorq %r10,%r10
  808. subq %r9,%r10
  809. movq (%r8),%r8
  810. leaq -72(%rsp,%r10,1),%rbp
  811. andq $-128,%rbp
  // Lower %rsp to %rbp in 4096-byte steps, reading a word at every step.
  812. movq %rsp,%r11
  813. subq %rbp,%r11
  814. andq $-4096,%r11
  815. leaq (%r11,%rbp,1),%rsp
  816. movq (%rsp),%r10
  817. cmpq %rbp,%rsp
  818. ja .Lmulx4x_page_walk
  819. jmp .Lmulx4x_page_walk_done
  820. .align 16
  821. .Lmulx4x_page_walk:
  822. leaq -4096(%rsp),%rsp
  823. movq (%rsp),%r10
  824. cmpq %rbp,%rsp
  825. ja .Lmulx4x_page_walk
  826. .Lmulx4x_page_walk_done:
  // Frame slots written here and in the body:
  //    0(%rsp)  num*8
  //    8(%rsp)  pointer to the next b[] word (stored in the body)
  //   16(%rsp)  end of b[] (%rdx + num*8)
  //   24(%rsp)  word loaded through the %r8 argument
  //   32(%rsp)  destination pointer
  //   40(%rsp)  original %rsp
  //   48(%rsp)  num/4 - 1, the inner-loop iteration count
  827. leaq (%rdx,%r9,1),%r10
  828. movq %r9,0(%rsp)
  829. shrq $5,%r9
  830. movq %r10,16(%rsp)
  831. subq $1,%r9
  832. movq %r8,24(%rsp)
  833. movq %rdi,32(%rsp)
  834. movq %rax,40(%rsp)
  // DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8.
  835. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
  836. movq %r9,48(%rsp)
  837. jmp .Lmulx4x_body
  838. .align 32
  839. .Lmulx4x_body:
  // First pass with b[0]: %rdx is the implicit mulx multiplicand, %r9 keeps
  // a copy of b[0], %rdi = &b[1], %rbx points into the scratch area.
  840. leaq 8(%rdx),%rdi
  841. movq (%rdx),%rdx
  842. leaq 64+32(%rsp),%rbx
  843. movq %rdx,%r9
  844. mulxq 0(%rsi),%r8,%rax
  845. mulxq 8(%rsi),%r11,%r14
  846. addq %rax,%r11
  847. movq %rdi,8(%rsp)
  848. mulxq 16(%rsi),%r12,%r13
  849. adcq %r14,%r12
  850. adcq $0,%r13
  // m = %r8 = low word of a[0]*b[0] times the word at 24(%rsp). The xorq
  // zeroes %rbp and clears CF and OF for the adcx/adox chains.
  851. movq %r8,%rdi
  852. imulq 24(%rsp),%r8
  853. xorq %rbp,%rbp
  854. mulxq 24(%rsi),%rax,%r14
  855. movq %r8,%rdx
  856. leaq 32(%rsi),%rsi
  857. adcxq %rax,%r13
  858. adcxq %rbp,%r14
  859. mulxq 0(%rcx),%rax,%r10
  860. adcxq %rax,%rdi
  861. adoxq %r11,%r10
  862. mulxq 8(%rcx),%rax,%r11
  863. adcxq %rax,%r10
  864. adoxq %r12,%r11
  // The .byte sequence encodes "mulxq 16(%rcx),%rax,%r12" using a 32-bit
  // displacement form.
  865. .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
  866. movq 48(%rsp),%rdi
  867. movq %r10,-32(%rbx)
  868. adcxq %rax,%r11
  869. adoxq %r13,%r12
  870. mulxq 24(%rcx),%rax,%r15
  871. movq %r9,%rdx
  872. movq %r11,-24(%rbx)
  873. adcxq %rax,%r12
  874. adoxq %rbp,%r15
  875. leaq 32(%rcx),%rcx
  876. movq %r12,-16(%rbx)
  877. jmp .Lmulx4x_1st
  878. .align 32
  // First-pass inner loop: four words of a[]*b[0] (with %rdx = b[0]) and
  // four words of n[]*m (with %rdx = m) per iteration; %rdi counts down.
  879. .Lmulx4x_1st:
  880. adcxq %rbp,%r15
  881. mulxq 0(%rsi),%r10,%rax
  882. adcxq %r14,%r10
  883. mulxq 8(%rsi),%r11,%r14
  884. adcxq %rax,%r11
  885. mulxq 16(%rsi),%r12,%rax
  886. adcxq %r14,%r12
  887. mulxq 24(%rsi),%r13,%r14
  // Two 0x67 prefix bytes on the following instruction (presumably padding
  // for alignment -- confirm against the generating Perl script).
  888. .byte 0x67,0x67
  889. movq %r8,%rdx
  890. adcxq %rax,%r13
  891. adcxq %rbp,%r14
  892. leaq 32(%rsi),%rsi
  893. leaq 32(%rbx),%rbx
  894. adoxq %r15,%r10
  895. mulxq 0(%rcx),%rax,%r15
  896. adcxq %rax,%r10
  897. adoxq %r15,%r11
  898. mulxq 8(%rcx),%rax,%r15
  899. adcxq %rax,%r11
  900. adoxq %r15,%r12
  901. mulxq 16(%rcx),%rax,%r15
  902. movq %r10,-40(%rbx)
  903. adcxq %rax,%r12
  904. movq %r11,-32(%rbx)
  905. adoxq %r15,%r13
  906. mulxq 24(%rcx),%rax,%r15
  907. movq %r9,%rdx
  908. movq %r12,-24(%rbx)
  909. adcxq %rax,%r13
  910. adoxq %rbp,%r15
  911. leaq 32(%rcx),%rcx
  912. movq %r13,-16(%rbx)
  913. decq %rdi
  914. jnz .Lmulx4x_1st
  // End of the first pass: %rax = num*8, %rdi = pointer to the next b[]
  // word; store the top word and leave %r15 = 0 or -1 according to the
  // carry out of it.
  915. movq 0(%rsp),%rax
  916. movq 8(%rsp),%rdi
  917. adcq %rbp,%r15
  918. addq %r15,%r14
  919. sbbq %r15,%r15
  920. movq %r14,-8(%rbx)
  921. jmp .Lmulx4x_outer
  922. .align 32
  // Outer loop, one pass per remaining b[] word: load b[i] into %rdx, rewind
  // %rsi and %rcx by num*8 bytes, store the carry word %r15 at (%rbx), and
  // reset %rbx to the start of the scratch area.
  923. .Lmulx4x_outer:
  924. movq (%rdi),%rdx
  925. leaq 8(%rdi),%rdi
  926. subq %rax,%rsi
  927. movq %r15,(%rbx)
  928. leaq 64+32(%rsp),%rbx
  929. subq %rax,%rcx
  930. mulxq 0(%rsi),%r8,%r11
  931. xorl %ebp,%ebp
  932. movq %rdx,%r9
  933. mulxq 8(%rsi),%r14,%r12
  934. adoxq -32(%rbx),%r8
  935. adcxq %r14,%r11
  936. mulxq 16(%rsi),%r15,%r13
  937. adoxq -24(%rbx),%r11
  938. adcxq %r15,%r12
  939. adoxq -16(%rbx),%r12
  940. adcxq %rbp,%r13
  941. adoxq %rbp,%r13
  // m for this pass = low word of the new bottom word times the word at
  // 24(%rsp); the xorl re-zeroes %rbp and clears CF and OF.
  942. movq %rdi,8(%rsp)
  943. movq %r8,%r15
  944. imulq 24(%rsp),%r8
  945. xorl %ebp,%ebp
  946. mulxq 24(%rsi),%rax,%r14
  947. movq %r8,%rdx
  948. adcxq %rax,%r13
  949. adoxq -8(%rbx),%r13
  950. adcxq %rbp,%r14
  951. leaq 32(%rsi),%rsi
  952. adoxq %rbp,%r14
  953. mulxq 0(%rcx),%rax,%r10
  954. adcxq %rax,%r15
  955. adoxq %r11,%r10
  956. mulxq 8(%rcx),%rax,%r11
  957. adcxq %rax,%r10
  958. adoxq %r12,%r11
  959. mulxq 16(%rcx),%rax,%r12
  960. movq %r10,-32(%rbx)
  961. adcxq %rax,%r11
  962. adoxq %r13,%r12
  963. mulxq 24(%rcx),%rax,%r15
  964. movq %r9,%rdx
  965. movq %r11,-24(%rbx)
  966. leaq 32(%rcx),%rcx
  967. adcxq %rax,%r12
  968. adoxq %rbp,%r15
  969. movq 48(%rsp),%rdi
  970. movq %r12,-16(%rbx)
  971. jmp .Lmulx4x_inner
  972. .align 32
  // Inner loop: same shape as .Lmulx4x_1st, with the existing scratch words
  // at 0..24(%rbx) added in through the adcx chain.
  973. .Lmulx4x_inner:
  974. mulxq 0(%rsi),%r10,%rax
  975. adcxq %rbp,%r15
  976. adoxq %r14,%r10
  977. mulxq 8(%rsi),%r11,%r14
  978. adcxq 0(%rbx),%r10
  979. adoxq %rax,%r11
  980. mulxq 16(%rsi),%r12,%rax
  981. adcxq 8(%rbx),%r11
  982. adoxq %r14,%r12
  983. mulxq 24(%rsi),%r13,%r14
  984. movq %r8,%rdx
  985. adcxq 16(%rbx),%r12
  986. adoxq %rax,%r13
  987. adcxq 24(%rbx),%r13
  988. adoxq %rbp,%r14
  989. leaq 32(%rsi),%rsi
  990. leaq 32(%rbx),%rbx
  991. adcxq %rbp,%r14
  992. adoxq %r15,%r10
  993. mulxq 0(%rcx),%rax,%r15
  994. adcxq %rax,%r10
  995. adoxq %r15,%r11
  996. mulxq 8(%rcx),%rax,%r15
  997. adcxq %rax,%r11
  998. adoxq %r15,%r12
  999. mulxq 16(%rcx),%rax,%r15
  1000. movq %r10,-40(%rbx)
  1001. adcxq %rax,%r12
  1002. adoxq %r15,%r13
  1003. mulxq 24(%rcx),%rax,%r15
  1004. movq %r9,%rdx
  1005. movq %r11,-32(%rbx)
  1006. movq %r12,-24(%rbx)
  1007. adcxq %rax,%r13
  1008. adoxq %rbp,%r15
  1009. leaq 32(%rcx),%rcx
  1010. movq %r13,-16(%rbx)
  1011. decq %rdi
  1012. jnz .Lmulx4x_inner
  // End of the pass. "subq 0(%rbx),%rbp" computes 0 - word, so CF = 1 iff
  // the word at (%rbx) is non-zero; that CF feeds the adcq into the top
  // word. %r15 again becomes 0 or -1 from the carry out. Loop until the b[]
  // pointer reaches the end address saved at 16(%rsp).
  1013. movq 0(%rsp),%rax
  1014. movq 8(%rsp),%rdi
  1015. adcq %rbp,%r15
  1016. subq 0(%rbx),%rbp
  1017. adcq %r15,%r14
  1018. sbbq %r15,%r15
  1019. movq %r14,-8(%rbx)
  1020. cmpq 16(%rsp),%rdi
  1021. jne .Lmulx4x_outer
  // Set up the final subtraction: %rbx = start of scratch, %rcx rewound by
  // num*8, %r15 negated (0 or 1), %rdx = num*8, %rax = num/4 iterations,
  // %rdi = destination. CF entering the loop is the last bit shifted out
  // by the shrq.
  1022. leaq 64(%rsp),%rbx
  1023. subq %rax,%rcx
  1024. negq %r15
  1025. movq %rax,%rdx
  1026. shrq $3+2,%rax
  1027. movq 32(%rsp),%rdi
  1028. jmp .Lmulx4x_sub
  1029. .align 32
  // Subtract loop, four words per iteration: (%rdi) = (%rbx) - (%rcx) with
  // borrow; mov/lea/dec leave CF untouched between iterations.
  1030. .Lmulx4x_sub:
  1031. movq 0(%rbx),%r11
  1032. movq 8(%rbx),%r12
  1033. movq 16(%rbx),%r13
  1034. movq 24(%rbx),%r14
  1035. leaq 32(%rbx),%rbx
  1036. sbbq 0(%rcx),%r11
  1037. sbbq 8(%rcx),%r12
  1038. sbbq 16(%rcx),%r13
  1039. sbbq 24(%rcx),%r14
  1040. leaq 32(%rcx),%rcx
  1041. movq %r11,0(%rdi)
  1042. movq %r12,8(%rdi)
  1043. movq %r13,16(%rdi)
  1044. movq %r14,24(%rdi)
  1045. leaq 32(%rdi),%rdi
  1046. decq %rax
  1047. jnz .Lmulx4x_sub
  // %r15 = top carry minus the final borrow: the select mask. Rewind %rbx
  // and %rdi to the start of the scratch area and destination.
  1048. sbbq $0,%r15
  1049. leaq 64(%rsp),%rbx
  1050. subq %rdx,%rdi
  // The .byte sequence encodes "movq %r15,%xmm1"; pshufd then broadcasts its
  // low dword to all lanes of xmm1. xmm0 = 0.
  1051. .byte 102,73,15,110,207
  1052. pxor %xmm0,%xmm0
  1053. pshufd $0,%xmm1,%xmm1
  // %rsi = original %rsp saved in the frame.
  1054. movq 40(%rsp),%rsi
  1055. .cfi_def_cfa %rsi,8
  1056. jmp .Lmulx4x_cond_copy
  1057. .align 32
  // Branch-free select, 32 bytes per iteration:
  //   (%rdi) = ((%rbx) & xmm1) | ((%rdi) & mask2)
  // where mask2 = pcmpeqd(xmm1, 0). The scratch bytes just read are
  // overwritten with zero. %rdx counts the remaining bytes down to zero.
  1058. .Lmulx4x_cond_copy:
  1059. movdqa 0(%rbx),%xmm2
  1060. movdqa 16(%rbx),%xmm3
  1061. leaq 32(%rbx),%rbx
  1062. movdqu 0(%rdi),%xmm4
  1063. movdqu 16(%rdi),%xmm5
  1064. leaq 32(%rdi),%rdi
  1065. movdqa %xmm0,-32(%rbx)
  1066. movdqa %xmm0,-16(%rbx)
  1067. pcmpeqd %xmm1,%xmm0
  1068. pand %xmm1,%xmm2
  1069. pand %xmm1,%xmm3
  1070. pand %xmm0,%xmm4
  1071. pand %xmm0,%xmm5
  1072. pxor %xmm0,%xmm0
  1073. por %xmm2,%xmm4
  1074. por %xmm3,%xmm5
  1075. movdqu %xmm4,-32(%rdi)
  1076. movdqu %xmm5,-16(%rdi)
  1077. subq $32,%rdx
  1078. jnz .Lmulx4x_cond_copy
  // %rdx is zero on loop exit, so this clears the word just past the copied
  // scratch area. Then return 1 and restore callee-saved registers.
  1079. movq %rdx,(%rbx)
  1080. movq $1,%rax
  1081. movq -48(%rsi),%r15
  1082. .cfi_restore %r15
  1083. movq -40(%rsi),%r14
  1084. .cfi_restore %r14
  1085. movq -32(%rsi),%r13
  1086. .cfi_restore %r13
  1087. movq -24(%rsi),%r12
  1088. .cfi_restore %r12
  1089. movq -16(%rsi),%rbp
  1090. .cfi_restore %rbp
  1091. movq -8(%rsi),%rbx
  1092. .cfi_restore %rbx
  1093. leaq (%rsi),%rsp
  1094. .cfi_def_cfa_register %rsp
  1095. .Lmulx4x_epilogue:
  // 0xf3,0xc3 encodes "rep ret".
  1096. .byte 0xf3,0xc3
  1097. .cfi_endproc
  1098. .size bn_mulx4x_mont,.-bn_mulx4x_mont
  1099. .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  1100. .align 16
  1101. #endif
  1102. .section .note.GNU-stack,"",@progbits