// x86_64-mont5.S (60 KB; 3790 lines in the full file — this excerpt covers
// roughly lines 1-801).
// NOTE(review): the original extraction prepended the page title and a
// concatenated line-number gutter (the digits 1..3790 run together) here;
// that residue has been collapsed into this comment. The "  N. " prefix on
// each line below is the same extraction artifact and is not part of the
// real assembly source.
  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if defined(__has_feature)
  4. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  5. #define OPENSSL_NO_ASM
  6. #endif
  7. #endif
  8. #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
  9. #if defined(BORINGSSL_PREFIX)
  10. #include <boringssl_prefix_symbols_asm.h>
  11. #endif
  12. .text
  13. .extern OPENSSL_ia32cap_P
  14. .hidden OPENSSL_ia32cap_P
  15. .globl bn_mul_mont_gather5
  16. .hidden bn_mul_mont_gather5
  17. .type bn_mul_mont_gather5,@function
  18. .align 64
  19. bn_mul_mont_gather5:
  20. .cfi_startproc
  21. movl %r9d,%r9d
  22. movq %rsp,%rax
  23. .cfi_def_cfa_register %rax
  24. testl $7,%r9d
  25. jnz .Lmul_enter
  26. leaq OPENSSL_ia32cap_P(%rip),%r11
  27. movl 8(%r11),%r11d
  28. jmp .Lmul4x_enter
  29. .align 16
  30. .Lmul_enter:
  31. movd 8(%rsp),%xmm5
  32. pushq %rbx
  33. .cfi_offset %rbx,-16
  34. pushq %rbp
  35. .cfi_offset %rbp,-24
  36. pushq %r12
  37. .cfi_offset %r12,-32
  38. pushq %r13
  39. .cfi_offset %r13,-40
  40. pushq %r14
  41. .cfi_offset %r14,-48
  42. pushq %r15
  43. .cfi_offset %r15,-56
  44. negq %r9
  45. movq %rsp,%r11
  46. leaq -280(%rsp,%r9,8),%r10
  47. negq %r9
  48. andq $-1024,%r10
  49. subq %r10,%r11
  50. andq $-4096,%r11
  51. leaq (%r10,%r11,1),%rsp
  52. movq (%rsp),%r11
  53. cmpq %r10,%rsp
  54. ja .Lmul_page_walk
  55. jmp .Lmul_page_walk_done
  56. .Lmul_page_walk:
  57. leaq -4096(%rsp),%rsp
  58. movq (%rsp),%r11
  59. cmpq %r10,%rsp
  60. ja .Lmul_page_walk
  61. .Lmul_page_walk_done:
  62. leaq .Linc(%rip),%r10
  63. movq %rax,8(%rsp,%r9,8)
  64. .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
  65. .Lmul_body:
  66. leaq 128(%rdx),%r12
  67. movdqa 0(%r10),%xmm0
  68. movdqa 16(%r10),%xmm1
  69. leaq 24-112(%rsp,%r9,8),%r10
  70. andq $-16,%r10
  71. pshufd $0,%xmm5,%xmm5
  72. movdqa %xmm1,%xmm4
  73. movdqa %xmm1,%xmm2
  74. paddd %xmm0,%xmm1
  75. pcmpeqd %xmm5,%xmm0
  76. .byte 0x67
  77. movdqa %xmm4,%xmm3
  78. paddd %xmm1,%xmm2
  79. pcmpeqd %xmm5,%xmm1
  80. movdqa %xmm0,112(%r10)
  81. movdqa %xmm4,%xmm0
  82. paddd %xmm2,%xmm3
  83. pcmpeqd %xmm5,%xmm2
  84. movdqa %xmm1,128(%r10)
  85. movdqa %xmm4,%xmm1
  86. paddd %xmm3,%xmm0
  87. pcmpeqd %xmm5,%xmm3
  88. movdqa %xmm2,144(%r10)
  89. movdqa %xmm4,%xmm2
  90. paddd %xmm0,%xmm1
  91. pcmpeqd %xmm5,%xmm0
  92. movdqa %xmm3,160(%r10)
  93. movdqa %xmm4,%xmm3
  94. paddd %xmm1,%xmm2
  95. pcmpeqd %xmm5,%xmm1
  96. movdqa %xmm0,176(%r10)
  97. movdqa %xmm4,%xmm0
  98. paddd %xmm2,%xmm3
  99. pcmpeqd %xmm5,%xmm2
  100. movdqa %xmm1,192(%r10)
  101. movdqa %xmm4,%xmm1
  102. paddd %xmm3,%xmm0
  103. pcmpeqd %xmm5,%xmm3
  104. movdqa %xmm2,208(%r10)
  105. movdqa %xmm4,%xmm2
  106. paddd %xmm0,%xmm1
  107. pcmpeqd %xmm5,%xmm0
  108. movdqa %xmm3,224(%r10)
  109. movdqa %xmm4,%xmm3
  110. paddd %xmm1,%xmm2
  111. pcmpeqd %xmm5,%xmm1
  112. movdqa %xmm0,240(%r10)
  113. movdqa %xmm4,%xmm0
  114. paddd %xmm2,%xmm3
  115. pcmpeqd %xmm5,%xmm2
  116. movdqa %xmm1,256(%r10)
  117. movdqa %xmm4,%xmm1
  118. paddd %xmm3,%xmm0
  119. pcmpeqd %xmm5,%xmm3
  120. movdqa %xmm2,272(%r10)
  121. movdqa %xmm4,%xmm2
  122. paddd %xmm0,%xmm1
  123. pcmpeqd %xmm5,%xmm0
  124. movdqa %xmm3,288(%r10)
  125. movdqa %xmm4,%xmm3
  126. paddd %xmm1,%xmm2
  127. pcmpeqd %xmm5,%xmm1
  128. movdqa %xmm0,304(%r10)
  129. paddd %xmm2,%xmm3
  130. .byte 0x67
  131. pcmpeqd %xmm5,%xmm2
  132. movdqa %xmm1,320(%r10)
  133. pcmpeqd %xmm5,%xmm3
  134. movdqa %xmm2,336(%r10)
  135. pand 64(%r12),%xmm0
  136. pand 80(%r12),%xmm1
  137. pand 96(%r12),%xmm2
  138. movdqa %xmm3,352(%r10)
  139. pand 112(%r12),%xmm3
  140. por %xmm2,%xmm0
  141. por %xmm3,%xmm1
  142. movdqa -128(%r12),%xmm4
  143. movdqa -112(%r12),%xmm5
  144. movdqa -96(%r12),%xmm2
  145. pand 112(%r10),%xmm4
  146. movdqa -80(%r12),%xmm3
  147. pand 128(%r10),%xmm5
  148. por %xmm4,%xmm0
  149. pand 144(%r10),%xmm2
  150. por %xmm5,%xmm1
  151. pand 160(%r10),%xmm3
  152. por %xmm2,%xmm0
  153. por %xmm3,%xmm1
  154. movdqa -64(%r12),%xmm4
  155. movdqa -48(%r12),%xmm5
  156. movdqa -32(%r12),%xmm2
  157. pand 176(%r10),%xmm4
  158. movdqa -16(%r12),%xmm3
  159. pand 192(%r10),%xmm5
  160. por %xmm4,%xmm0
  161. pand 208(%r10),%xmm2
  162. por %xmm5,%xmm1
  163. pand 224(%r10),%xmm3
  164. por %xmm2,%xmm0
  165. por %xmm3,%xmm1
  166. movdqa 0(%r12),%xmm4
  167. movdqa 16(%r12),%xmm5
  168. movdqa 32(%r12),%xmm2
  169. pand 240(%r10),%xmm4
  170. movdqa 48(%r12),%xmm3
  171. pand 256(%r10),%xmm5
  172. por %xmm4,%xmm0
  173. pand 272(%r10),%xmm2
  174. por %xmm5,%xmm1
  175. pand 288(%r10),%xmm3
  176. por %xmm2,%xmm0
  177. por %xmm3,%xmm1
  178. por %xmm1,%xmm0
  179. pshufd $0x4e,%xmm0,%xmm1
  180. por %xmm1,%xmm0
  181. leaq 256(%r12),%r12
  182. .byte 102,72,15,126,195
  183. movq (%r8),%r8
  184. movq (%rsi),%rax
  185. xorq %r14,%r14
  186. xorq %r15,%r15
  187. movq %r8,%rbp
  188. mulq %rbx
  189. movq %rax,%r10
  190. movq (%rcx),%rax
  191. imulq %r10,%rbp
  192. movq %rdx,%r11
  193. mulq %rbp
  194. addq %rax,%r10
  195. movq 8(%rsi),%rax
  196. adcq $0,%rdx
  197. movq %rdx,%r13
  198. leaq 1(%r15),%r15
  199. jmp .L1st_enter
  200. .align 16
  201. .L1st:
  202. addq %rax,%r13
  203. movq (%rsi,%r15,8),%rax
  204. adcq $0,%rdx
  205. addq %r11,%r13
  206. movq %r10,%r11
  207. adcq $0,%rdx
  208. movq %r13,-16(%rsp,%r15,8)
  209. movq %rdx,%r13
  210. .L1st_enter:
  211. mulq %rbx
  212. addq %rax,%r11
  213. movq (%rcx,%r15,8),%rax
  214. adcq $0,%rdx
  215. leaq 1(%r15),%r15
  216. movq %rdx,%r10
  217. mulq %rbp
  218. cmpq %r9,%r15
  219. jne .L1st
  220. addq %rax,%r13
  221. adcq $0,%rdx
  222. addq %r11,%r13
  223. adcq $0,%rdx
  224. movq %r13,-16(%rsp,%r9,8)
  225. movq %rdx,%r13
  226. movq %r10,%r11
  227. xorq %rdx,%rdx
  228. addq %r11,%r13
  229. adcq $0,%rdx
  230. movq %r13,-8(%rsp,%r9,8)
  231. movq %rdx,(%rsp,%r9,8)
  232. leaq 1(%r14),%r14
  233. jmp .Louter
  234. .align 16
  235. .Louter:
  236. leaq 24+128(%rsp,%r9,8),%rdx
  237. andq $-16,%rdx
  238. pxor %xmm4,%xmm4
  239. pxor %xmm5,%xmm5
  240. movdqa -128(%r12),%xmm0
  241. movdqa -112(%r12),%xmm1
  242. movdqa -96(%r12),%xmm2
  243. movdqa -80(%r12),%xmm3
  244. pand -128(%rdx),%xmm0
  245. pand -112(%rdx),%xmm1
  246. por %xmm0,%xmm4
  247. pand -96(%rdx),%xmm2
  248. por %xmm1,%xmm5
  249. pand -80(%rdx),%xmm3
  250. por %xmm2,%xmm4
  251. por %xmm3,%xmm5
  252. movdqa -64(%r12),%xmm0
  253. movdqa -48(%r12),%xmm1
  254. movdqa -32(%r12),%xmm2
  255. movdqa -16(%r12),%xmm3
  256. pand -64(%rdx),%xmm0
  257. pand -48(%rdx),%xmm1
  258. por %xmm0,%xmm4
  259. pand -32(%rdx),%xmm2
  260. por %xmm1,%xmm5
  261. pand -16(%rdx),%xmm3
  262. por %xmm2,%xmm4
  263. por %xmm3,%xmm5
  264. movdqa 0(%r12),%xmm0
  265. movdqa 16(%r12),%xmm1
  266. movdqa 32(%r12),%xmm2
  267. movdqa 48(%r12),%xmm3
  268. pand 0(%rdx),%xmm0
  269. pand 16(%rdx),%xmm1
  270. por %xmm0,%xmm4
  271. pand 32(%rdx),%xmm2
  272. por %xmm1,%xmm5
  273. pand 48(%rdx),%xmm3
  274. por %xmm2,%xmm4
  275. por %xmm3,%xmm5
  276. movdqa 64(%r12),%xmm0
  277. movdqa 80(%r12),%xmm1
  278. movdqa 96(%r12),%xmm2
  279. movdqa 112(%r12),%xmm3
  280. pand 64(%rdx),%xmm0
  281. pand 80(%rdx),%xmm1
  282. por %xmm0,%xmm4
  283. pand 96(%rdx),%xmm2
  284. por %xmm1,%xmm5
  285. pand 112(%rdx),%xmm3
  286. por %xmm2,%xmm4
  287. por %xmm3,%xmm5
  288. por %xmm5,%xmm4
  289. pshufd $0x4e,%xmm4,%xmm0
  290. por %xmm4,%xmm0
  291. leaq 256(%r12),%r12
  292. movq (%rsi),%rax
  293. .byte 102,72,15,126,195
  294. xorq %r15,%r15
  295. movq %r8,%rbp
  296. movq (%rsp),%r10
  297. mulq %rbx
  298. addq %rax,%r10
  299. movq (%rcx),%rax
  300. adcq $0,%rdx
  301. imulq %r10,%rbp
  302. movq %rdx,%r11
  303. mulq %rbp
  304. addq %rax,%r10
  305. movq 8(%rsi),%rax
  306. adcq $0,%rdx
  307. movq 8(%rsp),%r10
  308. movq %rdx,%r13
  309. leaq 1(%r15),%r15
  310. jmp .Linner_enter
  311. .align 16
  312. .Linner:
  313. addq %rax,%r13
  314. movq (%rsi,%r15,8),%rax
  315. adcq $0,%rdx
  316. addq %r10,%r13
  317. movq (%rsp,%r15,8),%r10
  318. adcq $0,%rdx
  319. movq %r13,-16(%rsp,%r15,8)
  320. movq %rdx,%r13
  321. .Linner_enter:
  322. mulq %rbx
  323. addq %rax,%r11
  324. movq (%rcx,%r15,8),%rax
  325. adcq $0,%rdx
  326. addq %r11,%r10
  327. movq %rdx,%r11
  328. adcq $0,%r11
  329. leaq 1(%r15),%r15
  330. mulq %rbp
  331. cmpq %r9,%r15
  332. jne .Linner
  333. addq %rax,%r13
  334. adcq $0,%rdx
  335. addq %r10,%r13
  336. movq (%rsp,%r9,8),%r10
  337. adcq $0,%rdx
  338. movq %r13,-16(%rsp,%r9,8)
  339. movq %rdx,%r13
  340. xorq %rdx,%rdx
  341. addq %r11,%r13
  342. adcq $0,%rdx
  343. addq %r10,%r13
  344. adcq $0,%rdx
  345. movq %r13,-8(%rsp,%r9,8)
  346. movq %rdx,(%rsp,%r9,8)
  347. leaq 1(%r14),%r14
  348. cmpq %r9,%r14
  349. jb .Louter
  350. xorq %r14,%r14
  351. movq (%rsp),%rax
  352. leaq (%rsp),%rsi
  353. movq %r9,%r15
  354. jmp .Lsub
  355. .align 16
  356. .Lsub: sbbq (%rcx,%r14,8),%rax
  357. movq %rax,(%rdi,%r14,8)
  358. movq 8(%rsi,%r14,8),%rax
  359. leaq 1(%r14),%r14
  360. decq %r15
  361. jnz .Lsub
  362. sbbq $0,%rax
  363. movq $-1,%rbx
  364. xorq %rax,%rbx
  365. xorq %r14,%r14
  366. movq %r9,%r15
  367. .Lcopy:
  368. movq (%rdi,%r14,8),%rcx
  369. movq (%rsp,%r14,8),%rdx
  370. andq %rbx,%rcx
  371. andq %rax,%rdx
  372. movq %r14,(%rsp,%r14,8)
  373. orq %rcx,%rdx
  374. movq %rdx,(%rdi,%r14,8)
  375. leaq 1(%r14),%r14
  376. subq $1,%r15
  377. jnz .Lcopy
  378. movq 8(%rsp,%r9,8),%rsi
  379. .cfi_def_cfa %rsi,8
  380. movq $1,%rax
  381. movq -48(%rsi),%r15
  382. .cfi_restore %r15
  383. movq -40(%rsi),%r14
  384. .cfi_restore %r14
  385. movq -32(%rsi),%r13
  386. .cfi_restore %r13
  387. movq -24(%rsi),%r12
  388. .cfi_restore %r12
  389. movq -16(%rsi),%rbp
  390. .cfi_restore %rbp
  391. movq -8(%rsi),%rbx
  392. .cfi_restore %rbx
  393. leaq (%rsi),%rsp
  394. .cfi_def_cfa_register %rsp
  395. .Lmul_epilogue:
  396. .byte 0xf3,0xc3
  397. .cfi_endproc
  398. .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
  399. .type bn_mul4x_mont_gather5,@function
  400. .align 32
  401. bn_mul4x_mont_gather5:
  402. .cfi_startproc
  403. .byte 0x67
  404. movq %rsp,%rax
  405. .cfi_def_cfa_register %rax
  406. .Lmul4x_enter:
  407. andl $0x80108,%r11d
  408. cmpl $0x80108,%r11d
  409. je .Lmulx4x_enter
  410. pushq %rbx
  411. .cfi_offset %rbx,-16
  412. pushq %rbp
  413. .cfi_offset %rbp,-24
  414. pushq %r12
  415. .cfi_offset %r12,-32
  416. pushq %r13
  417. .cfi_offset %r13,-40
  418. pushq %r14
  419. .cfi_offset %r14,-48
  420. pushq %r15
  421. .cfi_offset %r15,-56
  422. .Lmul4x_prologue:
  423. .byte 0x67
  424. shll $3,%r9d
  425. leaq (%r9,%r9,2),%r10
  426. negq %r9
  427. leaq -320(%rsp,%r9,2),%r11
  428. movq %rsp,%rbp
  429. subq %rdi,%r11
  430. andq $4095,%r11
  431. cmpq %r11,%r10
  432. jb .Lmul4xsp_alt
  433. subq %r11,%rbp
  434. leaq -320(%rbp,%r9,2),%rbp
  435. jmp .Lmul4xsp_done
  436. .align 32
  437. .Lmul4xsp_alt:
  438. leaq 4096-320(,%r9,2),%r10
  439. leaq -320(%rbp,%r9,2),%rbp
  440. subq %r10,%r11
  441. movq $0,%r10
  442. cmovcq %r10,%r11
  443. subq %r11,%rbp
  444. .Lmul4xsp_done:
  445. andq $-64,%rbp
  446. movq %rsp,%r11
  447. subq %rbp,%r11
  448. andq $-4096,%r11
  449. leaq (%r11,%rbp,1),%rsp
  450. movq (%rsp),%r10
  451. cmpq %rbp,%rsp
  452. ja .Lmul4x_page_walk
  453. jmp .Lmul4x_page_walk_done
  454. .Lmul4x_page_walk:
  455. leaq -4096(%rsp),%rsp
  456. movq (%rsp),%r10
  457. cmpq %rbp,%rsp
  458. ja .Lmul4x_page_walk
  459. .Lmul4x_page_walk_done:
  460. negq %r9
  461. movq %rax,40(%rsp)
  462. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
  463. .Lmul4x_body:
  464. call mul4x_internal
  465. movq 40(%rsp),%rsi
  466. .cfi_def_cfa %rsi,8
  467. movq $1,%rax
  468. movq -48(%rsi),%r15
  469. .cfi_restore %r15
  470. movq -40(%rsi),%r14
  471. .cfi_restore %r14
  472. movq -32(%rsi),%r13
  473. .cfi_restore %r13
  474. movq -24(%rsi),%r12
  475. .cfi_restore %r12
  476. movq -16(%rsi),%rbp
  477. .cfi_restore %rbp
  478. movq -8(%rsi),%rbx
  479. .cfi_restore %rbx
  480. leaq (%rsi),%rsp
  481. .cfi_def_cfa_register %rsp
  482. .Lmul4x_epilogue:
  483. .byte 0xf3,0xc3
  484. .cfi_endproc
  485. .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
  486. .type mul4x_internal,@function
  487. .align 32
  488. mul4x_internal:
  489. .cfi_startproc
  490. shlq $5,%r9
  491. movd 8(%rax),%xmm5
  492. leaq .Linc(%rip),%rax
  493. leaq 128(%rdx,%r9,1),%r13
  494. shrq $5,%r9
  495. movdqa 0(%rax),%xmm0
  496. movdqa 16(%rax),%xmm1
  497. leaq 88-112(%rsp,%r9,1),%r10
  498. leaq 128(%rdx),%r12
  499. pshufd $0,%xmm5,%xmm5
  500. movdqa %xmm1,%xmm4
  501. .byte 0x67,0x67
  502. movdqa %xmm1,%xmm2
  503. paddd %xmm0,%xmm1
  504. pcmpeqd %xmm5,%xmm0
  505. .byte 0x67
  506. movdqa %xmm4,%xmm3
  507. paddd %xmm1,%xmm2
  508. pcmpeqd %xmm5,%xmm1
  509. movdqa %xmm0,112(%r10)
  510. movdqa %xmm4,%xmm0
  511. paddd %xmm2,%xmm3
  512. pcmpeqd %xmm5,%xmm2
  513. movdqa %xmm1,128(%r10)
  514. movdqa %xmm4,%xmm1
  515. paddd %xmm3,%xmm0
  516. pcmpeqd %xmm5,%xmm3
  517. movdqa %xmm2,144(%r10)
  518. movdqa %xmm4,%xmm2
  519. paddd %xmm0,%xmm1
  520. pcmpeqd %xmm5,%xmm0
  521. movdqa %xmm3,160(%r10)
  522. movdqa %xmm4,%xmm3
  523. paddd %xmm1,%xmm2
  524. pcmpeqd %xmm5,%xmm1
  525. movdqa %xmm0,176(%r10)
  526. movdqa %xmm4,%xmm0
  527. paddd %xmm2,%xmm3
  528. pcmpeqd %xmm5,%xmm2
  529. movdqa %xmm1,192(%r10)
  530. movdqa %xmm4,%xmm1
  531. paddd %xmm3,%xmm0
  532. pcmpeqd %xmm5,%xmm3
  533. movdqa %xmm2,208(%r10)
  534. movdqa %xmm4,%xmm2
  535. paddd %xmm0,%xmm1
  536. pcmpeqd %xmm5,%xmm0
  537. movdqa %xmm3,224(%r10)
  538. movdqa %xmm4,%xmm3
  539. paddd %xmm1,%xmm2
  540. pcmpeqd %xmm5,%xmm1
  541. movdqa %xmm0,240(%r10)
  542. movdqa %xmm4,%xmm0
  543. paddd %xmm2,%xmm3
  544. pcmpeqd %xmm5,%xmm2
  545. movdqa %xmm1,256(%r10)
  546. movdqa %xmm4,%xmm1
  547. paddd %xmm3,%xmm0
  548. pcmpeqd %xmm5,%xmm3
  549. movdqa %xmm2,272(%r10)
  550. movdqa %xmm4,%xmm2
  551. paddd %xmm0,%xmm1
  552. pcmpeqd %xmm5,%xmm0
  553. movdqa %xmm3,288(%r10)
  554. movdqa %xmm4,%xmm3
  555. paddd %xmm1,%xmm2
  556. pcmpeqd %xmm5,%xmm1
  557. movdqa %xmm0,304(%r10)
  558. paddd %xmm2,%xmm3
  559. .byte 0x67
  560. pcmpeqd %xmm5,%xmm2
  561. movdqa %xmm1,320(%r10)
  562. pcmpeqd %xmm5,%xmm3
  563. movdqa %xmm2,336(%r10)
  564. pand 64(%r12),%xmm0
  565. pand 80(%r12),%xmm1
  566. pand 96(%r12),%xmm2
  567. movdqa %xmm3,352(%r10)
  568. pand 112(%r12),%xmm3
  569. por %xmm2,%xmm0
  570. por %xmm3,%xmm1
  571. movdqa -128(%r12),%xmm4
  572. movdqa -112(%r12),%xmm5
  573. movdqa -96(%r12),%xmm2
  574. pand 112(%r10),%xmm4
  575. movdqa -80(%r12),%xmm3
  576. pand 128(%r10),%xmm5
  577. por %xmm4,%xmm0
  578. pand 144(%r10),%xmm2
  579. por %xmm5,%xmm1
  580. pand 160(%r10),%xmm3
  581. por %xmm2,%xmm0
  582. por %xmm3,%xmm1
  583. movdqa -64(%r12),%xmm4
  584. movdqa -48(%r12),%xmm5
  585. movdqa -32(%r12),%xmm2
  586. pand 176(%r10),%xmm4
  587. movdqa -16(%r12),%xmm3
  588. pand 192(%r10),%xmm5
  589. por %xmm4,%xmm0
  590. pand 208(%r10),%xmm2
  591. por %xmm5,%xmm1
  592. pand 224(%r10),%xmm3
  593. por %xmm2,%xmm0
  594. por %xmm3,%xmm1
  595. movdqa 0(%r12),%xmm4
  596. movdqa 16(%r12),%xmm5
  597. movdqa 32(%r12),%xmm2
  598. pand 240(%r10),%xmm4
  599. movdqa 48(%r12),%xmm3
  600. pand 256(%r10),%xmm5
  601. por %xmm4,%xmm0
  602. pand 272(%r10),%xmm2
  603. por %xmm5,%xmm1
  604. pand 288(%r10),%xmm3
  605. por %xmm2,%xmm0
  606. por %xmm3,%xmm1
  607. por %xmm1,%xmm0
  608. pshufd $0x4e,%xmm0,%xmm1
  609. por %xmm1,%xmm0
  610. leaq 256(%r12),%r12
  611. .byte 102,72,15,126,195
  612. movq %r13,16+8(%rsp)
  613. movq %rdi,56+8(%rsp)
  614. movq (%r8),%r8
  615. movq (%rsi),%rax
  616. leaq (%rsi,%r9,1),%rsi
  617. negq %r9
  618. movq %r8,%rbp
  619. mulq %rbx
  620. movq %rax,%r10
  621. movq (%rcx),%rax
  622. imulq %r10,%rbp
  623. leaq 64+8(%rsp),%r14
  624. movq %rdx,%r11
  625. mulq %rbp
  626. addq %rax,%r10
  627. movq 8(%rsi,%r9,1),%rax
  628. adcq $0,%rdx
  629. movq %rdx,%rdi
  630. mulq %rbx
  631. addq %rax,%r11
  632. movq 8(%rcx),%rax
  633. adcq $0,%rdx
  634. movq %rdx,%r10
  635. mulq %rbp
  636. addq %rax,%rdi
  637. movq 16(%rsi,%r9,1),%rax
  638. adcq $0,%rdx
  639. addq %r11,%rdi
  640. leaq 32(%r9),%r15
  641. leaq 32(%rcx),%rcx
  642. adcq $0,%rdx
  643. movq %rdi,(%r14)
  644. movq %rdx,%r13
  645. jmp .L1st4x
  646. .align 32
  647. .L1st4x:
  648. mulq %rbx
  649. addq %rax,%r10
  650. movq -16(%rcx),%rax
  651. leaq 32(%r14),%r14
  652. adcq $0,%rdx
  653. movq %rdx,%r11
  654. mulq %rbp
  655. addq %rax,%r13
  656. movq -8(%rsi,%r15,1),%rax
  657. adcq $0,%rdx
  658. addq %r10,%r13
  659. adcq $0,%rdx
  660. movq %r13,-24(%r14)
  661. movq %rdx,%rdi
  662. mulq %rbx
  663. addq %rax,%r11
  664. movq -8(%rcx),%rax
  665. adcq $0,%rdx
  666. movq %rdx,%r10
  667. mulq %rbp
  668. addq %rax,%rdi
  669. movq (%rsi,%r15,1),%rax
  670. adcq $0,%rdx
  671. addq %r11,%rdi
  672. adcq $0,%rdx
  673. movq %rdi,-16(%r14)
  674. movq %rdx,%r13
  675. mulq %rbx
  676. addq %rax,%r10
  677. movq 0(%rcx),%rax
  678. adcq $0,%rdx
  679. movq %rdx,%r11
  680. mulq %rbp
  681. addq %rax,%r13
  682. movq 8(%rsi,%r15,1),%rax
  683. adcq $0,%rdx
  684. addq %r10,%r13
  685. adcq $0,%rdx
  686. movq %r13,-8(%r14)
  687. movq %rdx,%rdi
  688. mulq %rbx
  689. addq %rax,%r11
  690. movq 8(%rcx),%rax
  691. adcq $0,%rdx
  692. movq %rdx,%r10
  693. mulq %rbp
  694. addq %rax,%rdi
  695. movq 16(%rsi,%r15,1),%rax
  696. adcq $0,%rdx
  697. addq %r11,%rdi
  698. leaq 32(%rcx),%rcx
  699. adcq $0,%rdx
  700. movq %rdi,(%r14)
  701. movq %rdx,%r13
  702. addq $32,%r15
  703. jnz .L1st4x
  704. mulq %rbx
  705. addq %rax,%r10
  706. movq -16(%rcx),%rax
  707. leaq 32(%r14),%r14
  708. adcq $0,%rdx
  709. movq %rdx,%r11
  710. mulq %rbp
  711. addq %rax,%r13
  712. movq -8(%rsi),%rax
  713. adcq $0,%rdx
  714. addq %r10,%r13
  715. adcq $0,%rdx
  716. movq %r13,-24(%r14)
  717. movq %rdx,%rdi
  718. mulq %rbx
  719. addq %rax,%r11
  720. movq -8(%rcx),%rax
  721. adcq $0,%rdx
  722. movq %rdx,%r10
  723. mulq %rbp
  724. addq %rax,%rdi
  725. movq (%rsi,%r9,1),%rax
  726. adcq $0,%rdx
  727. addq %r11,%rdi
  728. adcq $0,%rdx
  729. movq %rdi,-16(%r14)
  730. movq %rdx,%r13
  731. leaq (%rcx,%r9,1),%rcx
  732. xorq %rdi,%rdi
  733. addq %r10,%r13
  734. adcq $0,%rdi
  735. movq %r13,-8(%r14)
  736. jmp .Louter4x
  737. .align 32
  738. .Louter4x:
  739. leaq 16+128(%r14),%rdx
  740. pxor %xmm4,%xmm4
  741. pxor %xmm5,%xmm5
  742. movdqa -128(%r12),%xmm0
  743. movdqa -112(%r12),%xmm1
  744. movdqa -96(%r12),%xmm2
  745. movdqa -80(%r12),%xmm3
  746. pand -128(%rdx),%xmm0
  747. pand -112(%rdx),%xmm1
  748. por %xmm0,%xmm4
  749. pand -96(%rdx),%xmm2
  750. por %xmm1,%xmm5
  751. pand -80(%rdx),%xmm3
  752. por %xmm2,%xmm4
  753. por %xmm3,%xmm5
  754. movdqa -64(%r12),%xmm0
  755. movdqa -48(%r12),%xmm1
  756. movdqa -32(%r12),%xmm2
  757. movdqa -16(%r12),%xmm3
  758. pand -64(%rdx),%xmm0
  759. pand -48(%rdx),%xmm1
  760. por %xmm0,%xmm4
  761. pand -32(%rdx),%xmm2
  762. por %xmm1,%xmm5
  763. pand -16(%rdx),%xmm3
  764. por %xmm2,%xmm4
  765. por %xmm3,%xmm5
  766. movdqa 0(%r12),%xmm0
  767. movdqa 16(%r12),%xmm1
  768. movdqa 32(%r12),%xmm2
  769. movdqa 48(%r12),%xmm3
  770. pand 0(%rdx),%xmm0
  771. pand 16(%rdx),%xmm1
  772. por %xmm0,%xmm4
  773. pand 32(%rdx),%xmm2
  774. por %xmm1,%xmm5
  775. pand 48(%rdx),%xmm3
  776. por %xmm2,%xmm4
  777. por %xmm3,%xmm5
  778. movdqa 64(%r12),%xmm0
  779. movdqa 80(%r12),%xmm1
  780. movdqa 96(%r12),%xmm2
  781. movdqa 112(%r12),%xmm3
  782. pand 64(%rdx),%xmm0
  783. pand 80(%rdx),%xmm1
  784. por %xmm0,%xmm4
  785. pand 96(%rdx),%xmm2
  786. por %xmm1,%xmm5
  787. pand 112(%rdx),%xmm3
  788. por %xmm2,%xmm4
  789. por %xmm3,%xmm5
  790. por %xmm5,%xmm4
  791. pshufd $0x4e,%xmm4,%xmm0
  792. por %xmm4,%xmm0
  793. leaq 256(%r12),%r12
  794. .byte 102,72,15,126,195
  795. movq (%r14,%r9,1),%r10
  796. movq %r8,%rbp
  797. mulq %rbx
  798. addq %rax,%r10
  799. movq (%rcx),%rax
  800. adcq $0,%rdx
  801. imulq %r10,%rbp
  802. movq %rdx,%r11
  803. movq %rdi,(%r14)
  804. leaq (%r14,%r9,1),%r14
  805. mulq %rbp
  806. addq %rax,%r10
  807. movq 8(%rsi,%r9,1),%rax
  808. adcq $0,%rdx
  809. movq %rdx,%rdi
  810. mulq %rbx
  811. addq %rax,%r11
  812. movq 8(%rcx),%rax
  813. adcq $0,%rdx
  814. addq 8(%r14),%r11
  815. adcq $0,%rdx
  816. movq %rdx,%r10
  817. mulq %rbp
  818. addq %rax,%rdi
  819. movq 16(%rsi,%r9,1),%rax
  820. adcq $0,%rdx
  821. addq %r11,%rdi
  822. leaq 32(%r9),%r15
  823. leaq 32(%rcx),%rcx
  824. adcq $0,%rdx
  825. movq %rdx,%r13
  826. jmp .Linner4x
  827. .align 32
  828. .Linner4x:
  829. mulq %rbx
  830. addq %rax,%r10
  831. movq -16(%rcx),%rax
  832. adcq $0,%rdx
  833. addq 16(%r14),%r10
  834. leaq 32(%r14),%r14
  835. adcq $0,%rdx
  836. movq %rdx,%r11
  837. mulq %rbp
  838. addq %rax,%r13
  839. movq -8(%rsi,%r15,1),%rax
  840. adcq $0,%rdx
  841. addq %r10,%r13
  842. adcq $0,%rdx
  843. movq %rdi,-32(%r14)
  844. movq %rdx,%rdi
  845. mulq %rbx
  846. addq %rax,%r11
  847. movq -8(%rcx),%rax
  848. adcq $0,%rdx
  849. addq -8(%r14),%r11
  850. adcq $0,%rdx
  851. movq %rdx,%r10
  852. mulq %rbp
  853. addq %rax,%rdi
  854. movq (%rsi,%r15,1),%rax
  855. adcq $0,%rdx
  856. addq %r11,%rdi
  857. adcq $0,%rdx
  858. movq %r13,-24(%r14)
  859. movq %rdx,%r13
  860. mulq %rbx
  861. addq %rax,%r10
  862. movq 0(%rcx),%rax
  863. adcq $0,%rdx
  864. addq (%r14),%r10
  865. adcq $0,%rdx
  866. movq %rdx,%r11
  867. mulq %rbp
  868. addq %rax,%r13
  869. movq 8(%rsi,%r15,1),%rax
  870. adcq $0,%rdx
  871. addq %r10,%r13
  872. adcq $0,%rdx
  873. movq %rdi,-16(%r14)
  874. movq %rdx,%rdi
  875. mulq %rbx
  876. addq %rax,%r11
  877. movq 8(%rcx),%rax
  878. adcq $0,%rdx
  879. addq 8(%r14),%r11
  880. adcq $0,%rdx
  881. movq %rdx,%r10
  882. mulq %rbp
  883. addq %rax,%rdi
  884. movq 16(%rsi,%r15,1),%rax
  885. adcq $0,%rdx
  886. addq %r11,%rdi
  887. leaq 32(%rcx),%rcx
  888. adcq $0,%rdx
  889. movq %r13,-8(%r14)
  890. movq %rdx,%r13
  891. addq $32,%r15
  892. jnz .Linner4x
  893. mulq %rbx
  894. addq %rax,%r10
  895. movq -16(%rcx),%rax
  896. adcq $0,%rdx
  897. addq 16(%r14),%r10
  898. leaq 32(%r14),%r14
  899. adcq $0,%rdx
  900. movq %rdx,%r11
  901. mulq %rbp
  902. addq %rax,%r13
  903. movq -8(%rsi),%rax
  904. adcq $0,%rdx
  905. addq %r10,%r13
  906. adcq $0,%rdx
  907. movq %rdi,-32(%r14)
  908. movq %rdx,%rdi
  909. mulq %rbx
  910. addq %rax,%r11
  911. movq %rbp,%rax
  912. movq -8(%rcx),%rbp
  913. adcq $0,%rdx
  914. addq -8(%r14),%r11
  915. adcq $0,%rdx
  916. movq %rdx,%r10
  917. mulq %rbp
  918. addq %rax,%rdi
  919. movq (%rsi,%r9,1),%rax
  920. adcq $0,%rdx
  921. addq %r11,%rdi
  922. adcq $0,%rdx
  923. movq %r13,-24(%r14)
  924. movq %rdx,%r13
  925. movq %rdi,-16(%r14)
  926. leaq (%rcx,%r9,1),%rcx
  927. xorq %rdi,%rdi
  928. addq %r10,%r13
  929. adcq $0,%rdi
  930. addq (%r14),%r13
  931. adcq $0,%rdi
  932. movq %r13,-8(%r14)
  933. cmpq 16+8(%rsp),%r12
  934. jb .Louter4x
  935. xorq %rax,%rax
  936. subq %r13,%rbp
  937. adcq %r15,%r15
  938. orq %r15,%rdi
  939. subq %rdi,%rax
  940. leaq (%r14,%r9,1),%rbx
  941. movq (%rcx),%r12
  942. leaq (%rcx),%rbp
  943. movq %r9,%rcx
  944. sarq $3+2,%rcx
  945. movq 56+8(%rsp),%rdi
  946. decq %r12
  947. xorq %r10,%r10
  948. movq 8(%rbp),%r13
  949. movq 16(%rbp),%r14
  950. movq 24(%rbp),%r15
  951. jmp .Lsqr4x_sub_entry
  952. .cfi_endproc
  953. .size mul4x_internal,.-mul4x_internal
/*
 * bn_power5(rp=%rdi, ap=%rsi, num=%rdx..., np=%rcx, n0=%r8, tbl/pwr args)
 * Raises the input to the 32nd power modulo np (five back-to-back
 * Montgomery squarings) and then Montgomery-multiplies by a value
 * gathered from the precomputed table — the inner step of
 * BN_mod_exp_mont_consttime's fixed-window ladder.
 * SysV AMD64; all six callee-saved GPRs are preserved via push/pop.
 */
954. .globl bn_power5
955. .hidden bn_power5
956. .type bn_power5,@function
957. .align 32
958. bn_power5:
959. .cfi_startproc
960. movq %rsp,%rax
961. .cfi_def_cfa_register %rax
/* CPU feature dispatch: word 8 of OPENSSL_ia32cap_P holds the leaf-7 EBX
 * flags; mask 0x80108 = ADX|BMI2|BMI1.  If all present, take the
 * MULX/ADX implementation instead. */
962. leaq OPENSSL_ia32cap_P(%rip),%r11
963. movl 8(%r11),%r11d
964. andl $0x80108,%r11d
965. cmpl $0x80108,%r11d
966. je .Lpowerx5_enter
967. pushq %rbx
968. .cfi_offset %rbx,-16
969. pushq %rbp
970. .cfi_offset %rbp,-24
971. pushq %r12
972. .cfi_offset %r12,-32
973. pushq %r13
974. .cfi_offset %r13,-40
975. pushq %r14
976. .cfi_offset %r14,-48
977. pushq %r15
978. .cfi_offset %r15,-56
979. .Lpower5_prologue:
/* r9 = num*8 (byte length); r10 = 3*num*8 = size threshold used below
 * when choosing a stack frame that does not alias rp modulo 4 KB
 * (avoids cache-bank/page conflicts between the frame and the output). */
980. shll $3,%r9d
981. leal (%r9,%r9,2),%r10d
982. negq %r9
983. movq (%r8),%r8
984. leaq -320(%rsp,%r9,2),%r11
985. movq %rsp,%rbp
986. subq %rdi,%r11
987. andq $4095,%r11
988. cmpq %r11,%r10
989. jb .Lpwr_sp_alt
990. subq %r11,%rbp
991. leaq -320(%rbp,%r9,2),%rbp
992. jmp .Lpwr_sp_done
993. .align 32
994. .Lpwr_sp_alt:
/* Alternate frame placement when the preferred offset would collide. */
995. leaq 4096-320(,%r9,2),%r10
996. leaq -320(%rbp,%r9,2),%rbp
997. subq %r10,%r11
998. movq $0,%r10
999. cmovcq %r10,%r11
1000. subq %r11,%rbp
1001. .Lpwr_sp_done:
1002. andq $-64,%rbp
1003. movq %rsp,%r11
1004. subq %rbp,%r11
1005. andq $-4096,%r11
1006. leaq (%r11,%rbp,1),%rsp
/* Touch the new stack one page at a time so guard pages are committed
 * (required on systems with stack-probe semantics). */
1007. movq (%rsp),%r10
1008. cmpq %rbp,%rsp
1009. ja .Lpwr_page_walk
1010. jmp .Lpwr_page_walk_done
1011. .Lpwr_page_walk:
1012. leaq -4096(%rsp),%rsp
1013. movq (%rsp),%r10
1014. cmpq %rbp,%rsp
1015. ja .Lpwr_page_walk
1016. .Lpwr_page_walk_done:
1017. movq %r9,%r10
1018. negq %r9
/* Frame slots: 32(%rsp) = n0, 40(%rsp) = saved original %rsp. */
1019. movq %r8,32(%rsp)
1020. movq %rax,40(%rsp)
1021. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1022. .Lpower5_body:
/* Stash arguments in XMM so the internal helpers can clobber GPRs:
 * movq %rdi,%xmm1 / %rcx,%xmm2 / %r10,%xmm3 / %rdx,%xmm4. */
1023. .byte 102,72,15,110,207
1024. .byte 102,72,15,110,209
1025. .byte 102,73,15,110,218
1026. .byte 102,72,15,110,226
/* Five squaring+conditional-subtract rounds: x -> x^2 mod n, i.e. x^32. */
1027. call __bn_sqr8x_internal
1028. call __bn_post4x_internal
1029. call __bn_sqr8x_internal
1030. call __bn_post4x_internal
1031. call __bn_sqr8x_internal
1032. call __bn_post4x_internal
1033. call __bn_sqr8x_internal
1034. call __bn_post4x_internal
1035. call __bn_sqr8x_internal
1036. call __bn_post4x_internal
/* Recover stashed args (movq %xmm2,%rcx / %xmm4,%rdx) and do the final
 * Montgomery multiplication by the gathered table entry. */
1037. .byte 102,72,15,126,209
1038. .byte 102,72,15,126,226
1039. movq %rsi,%rdi
1040. movq 40(%rsp),%rax
1041. leaq 32(%rsp),%r8
1042. call mul4x_internal
/* Epilogue: restore callee-saved registers from the saved frame and
 * return 1 (success), matching the other bn_* entry points. */
1043. movq 40(%rsp),%rsi
1044. .cfi_def_cfa %rsi,8
1045. movq $1,%rax
1046. movq -48(%rsi),%r15
1047. .cfi_restore %r15
1048. movq -40(%rsi),%r14
1049. .cfi_restore %r14
1050. movq -32(%rsi),%r13
1051. .cfi_restore %r13
1052. movq -24(%rsi),%r12
1053. .cfi_restore %r12
1054. movq -16(%rsi),%rbp
1055. .cfi_restore %rbp
1056. movq -8(%rsi),%rbx
1057. .cfi_restore %rbx
1058. leaq (%rsi),%rsp
1059. .cfi_def_cfa_register %rsp
1060. .Lpower5_epilogue:
1061. .byte 0xf3,0xc3
1062. .cfi_endproc
1063. .size bn_power5,.-bn_power5
/*
 * bn_sqr8x_internal / __bn_sqr8x_internal
 * Schoolbook squaring core used by the power5/from_mont paths.
 * In (by convention of the callers in this file): %rsi = a + num,
 * %r9 = -num*8, %rbp/%r10 derived index, frame at 48+8(%rsp).
 * Phase 1 (.Lsqr4x_1st/.Lsqr4x_outer/.Lsqr4x_inner) accumulates the
 * off-diagonal products a[i]*a[j], i<j, into the 2*num-word buffer.
 * Phase 2 (.Lsqr4x_shift_n_add) doubles that half-product by shifting
 * left one bit while adding the diagonal squares a[i]^2 via mulq %rax,
 * carrying through %r15 as a saved borrow/carry (neg/adc/sbb chain).
 * Falls through into __bn_sqr8x_reduction.  Order of every add/adc is
 * part of the carry chain — do not reorder.
 */
1064. .globl bn_sqr8x_internal
1065. .hidden bn_sqr8x_internal
1066. .hidden bn_sqr8x_internal
1067. .type bn_sqr8x_internal,@function
1068. .align 32
1069. bn_sqr8x_internal:
1070. __bn_sqr8x_internal:
1071. .cfi_startproc
/* First pass: products with the two top words; primes r10..r13 partial
 * sums before entering the unrolled 4-way loop. */
1072. leaq 32(%r10),%rbp
1073. leaq (%rsi,%r9,1),%rsi
1074. movq %r9,%rcx
1075. movq -32(%rsi,%rbp,1),%r14
1076. leaq 48+8(%rsp,%r9,2),%rdi
1077. movq -24(%rsi,%rbp,1),%rax
1078. leaq -32(%rdi,%rbp,1),%rdi
1079. movq -16(%rsi,%rbp,1),%rbx
1080. movq %rax,%r15
1081. mulq %r14
1082. movq %rax,%r10
1083. movq %rbx,%rax
1084. movq %rdx,%r11
1085. movq %r10,-24(%rdi,%rbp,1)
1086. mulq %r14
1087. addq %rax,%r11
1088. movq %rbx,%rax
1089. adcq $0,%rdx
1090. movq %r11,-16(%rdi,%rbp,1)
1091. movq %rdx,%r10
1092. movq -8(%rsi,%rbp,1),%rbx
1093. mulq %r15
1094. movq %rax,%r12
1095. movq %rbx,%rax
1096. movq %rdx,%r13
1097. leaq (%rbp),%rcx
1098. mulq %r14
1099. addq %rax,%r10
1100. movq %rbx,%rax
1101. movq %rdx,%r11
1102. adcq $0,%r11
1103. addq %r12,%r10
1104. adcq $0,%r11
1105. movq %r10,-8(%rdi,%rcx,1)
1106. jmp .Lsqr4x_1st
1107. .align 32
/* Innermost first-pass loop, unrolled 4x: multiplies the running pair
 * (%r14,%r15) by successive a[j] and stores one result word per step.
 * %rcx counts up toward zero in 32-byte strides. */
1108. .Lsqr4x_1st:
1109. movq (%rsi,%rcx,1),%rbx
1110. mulq %r15
1111. addq %rax,%r13
1112. movq %rbx,%rax
1113. movq %rdx,%r12
1114. adcq $0,%r12
1115. mulq %r14
1116. addq %rax,%r11
1117. movq %rbx,%rax
1118. movq 8(%rsi,%rcx,1),%rbx
1119. movq %rdx,%r10
1120. adcq $0,%r10
1121. addq %r13,%r11
1122. adcq $0,%r10
1123. mulq %r15
1124. addq %rax,%r12
1125. movq %rbx,%rax
1126. movq %r11,(%rdi,%rcx,1)
1127. movq %rdx,%r13
1128. adcq $0,%r13
1129. mulq %r14
1130. addq %rax,%r10
1131. movq %rbx,%rax
1132. movq 16(%rsi,%rcx,1),%rbx
1133. movq %rdx,%r11
1134. adcq $0,%r11
1135. addq %r12,%r10
1136. adcq $0,%r11
1137. mulq %r15
1138. addq %rax,%r13
1139. movq %rbx,%rax
1140. movq %r10,8(%rdi,%rcx,1)
1141. movq %rdx,%r12
1142. adcq $0,%r12
1143. mulq %r14
1144. addq %rax,%r11
1145. movq %rbx,%rax
1146. movq 24(%rsi,%rcx,1),%rbx
1147. movq %rdx,%r10
1148. adcq $0,%r10
1149. addq %r13,%r11
1150. adcq $0,%r10
1151. mulq %r15
1152. addq %rax,%r12
1153. movq %rbx,%rax
1154. movq %r11,16(%rdi,%rcx,1)
1155. movq %rdx,%r13
1156. adcq $0,%r13
1157. leaq 32(%rcx),%rcx
1158. mulq %r14
1159. addq %rax,%r10
1160. movq %rbx,%rax
1161. movq %rdx,%r11
1162. adcq $0,%r11
1163. addq %r12,%r10
1164. adcq $0,%r11
1165. movq %r10,-8(%rdi,%rcx,1)
1166. cmpq $0,%rcx
1167. jne .Lsqr4x_1st
/* Flush the final partial product of the first pass. */
1168. mulq %r15
1169. addq %rax,%r13
1170. leaq 16(%rbp),%rbp
1171. adcq $0,%rdx
1172. addq %r11,%r13
1173. adcq $0,%rdx
1174. movq %r13,(%rdi)
1175. movq %rdx,%r12
1176. movq %rdx,8(%rdi)
1177. jmp .Lsqr4x_outer
1178. .align 32
/* Outer loop: each iteration takes the next word pair and accumulates
 * its cross products INTO the partial result already in the buffer
 * (note the addq from (%rdi,...) operands, unlike the first pass). */
1179. .Lsqr4x_outer:
1180. movq -32(%rsi,%rbp,1),%r14
1181. leaq 48+8(%rsp,%r9,2),%rdi
1182. movq -24(%rsi,%rbp,1),%rax
1183. leaq -32(%rdi,%rbp,1),%rdi
1184. movq -16(%rsi,%rbp,1),%rbx
1185. movq %rax,%r15
1186. mulq %r14
1187. movq -24(%rdi,%rbp,1),%r10
1188. addq %rax,%r10
1189. movq %rbx,%rax
1190. adcq $0,%rdx
1191. movq %r10,-24(%rdi,%rbp,1)
1192. movq %rdx,%r11
1193. mulq %r14
1194. addq %rax,%r11
1195. movq %rbx,%rax
1196. adcq $0,%rdx
1197. addq -16(%rdi,%rbp,1),%r11
1198. movq %rdx,%r10
1199. adcq $0,%r10
1200. movq %r11,-16(%rdi,%rbp,1)
1201. xorq %r12,%r12
1202. movq -8(%rsi,%rbp,1),%rbx
1203. mulq %r15
1204. addq %rax,%r12
1205. movq %rbx,%rax
1206. adcq $0,%rdx
1207. addq -8(%rdi,%rbp,1),%r12
1208. movq %rdx,%r13
1209. adcq $0,%r13
1210. mulq %r14
1211. addq %rax,%r10
1212. movq %rbx,%rax
1213. adcq $0,%rdx
1214. addq %r12,%r10
1215. movq %rdx,%r11
1216. adcq $0,%r11
1217. movq %r10,-8(%rdi,%rbp,1)
1218. leaq (%rbp),%rcx
1219. jmp .Lsqr4x_inner
1220. .align 32
/* Inner accumulation loop (2 result words per iteration). */
1221. .Lsqr4x_inner:
1222. movq (%rsi,%rcx,1),%rbx
1223. mulq %r15
1224. addq %rax,%r13
1225. movq %rbx,%rax
1226. movq %rdx,%r12
1227. adcq $0,%r12
1228. addq (%rdi,%rcx,1),%r13
1229. adcq $0,%r12
/* 0x67 is a deliberate addressing-size prefix byte used as padding for
 * instruction alignment — generated output, keep as-is. */
1230. .byte 0x67
1231. mulq %r14
1232. addq %rax,%r11
1233. movq %rbx,%rax
1234. movq 8(%rsi,%rcx,1),%rbx
1235. movq %rdx,%r10
1236. adcq $0,%r10
1237. addq %r13,%r11
1238. adcq $0,%r10
1239. mulq %r15
1240. addq %rax,%r12
1241. movq %r11,(%rdi,%rcx,1)
1242. movq %rbx,%rax
1243. movq %rdx,%r13
1244. adcq $0,%r13
1245. addq 8(%rdi,%rcx,1),%r12
1246. leaq 16(%rcx),%rcx
1247. adcq $0,%r13
1248. mulq %r14
1249. addq %rax,%r10
1250. movq %rbx,%rax
1251. adcq $0,%rdx
1252. addq %r12,%r10
1253. movq %rdx,%r11
1254. adcq $0,%r11
1255. movq %r10,-8(%rdi,%rcx,1)
1256. cmpq $0,%rcx
1257. jne .Lsqr4x_inner
1258. .byte 0x67
1259. mulq %r15
1260. addq %rax,%r13
1261. adcq $0,%rdx
1262. addq %r11,%r13
1263. adcq $0,%rdx
1264. movq %r13,(%rdi)
1265. movq %rdx,%r12
1266. movq %rdx,8(%rdi)
1267. addq $16,%rbp
1268. jnz .Lsqr4x_outer
/* Tail: cross products among the last four words (no loop needed). */
1269. movq -32(%rsi),%r14
1270. leaq 48+8(%rsp,%r9,2),%rdi
1271. movq -24(%rsi),%rax
1272. leaq -32(%rdi,%rbp,1),%rdi
1273. movq -16(%rsi),%rbx
1274. movq %rax,%r15
1275. mulq %r14
1276. addq %rax,%r10
1277. movq %rbx,%rax
1278. movq %rdx,%r11
1279. adcq $0,%r11
1280. mulq %r14
1281. addq %rax,%r11
1282. movq %rbx,%rax
1283. movq %r10,-24(%rdi)
1284. movq %rdx,%r10
1285. adcq $0,%r10
1286. addq %r13,%r11
1287. movq -8(%rsi),%rbx
1288. adcq $0,%r10
1289. mulq %r15
1290. addq %rax,%r12
1291. movq %rbx,%rax
1292. movq %r11,-16(%rdi)
1293. movq %rdx,%r13
1294. adcq $0,%r13
1295. mulq %r14
1296. addq %rax,%r10
1297. movq %rbx,%rax
1298. movq %rdx,%r11
1299. adcq $0,%r11
1300. addq %r12,%r10
1301. adcq $0,%r11
1302. movq %r10,-8(%rdi)
1303. mulq %r15
1304. addq %rax,%r13
1305. movq -16(%rsi),%rax
1306. adcq $0,%rdx
1307. addq %r11,%r13
1308. adcq $0,%rdx
1309. movq %r13,(%rdi)
1310. movq %rdx,%r12
1311. movq %rdx,8(%rdi)
1312. mulq %rbx
/* Phase 2 setup: rewind %rbp to the start of a[], zero the shift carry
 * registers (%r14/%r15) and write the topmost product words. */
1313. addq $16,%rbp
1314. xorq %r14,%r14
1315. subq %r9,%rbp
1316. xorq %r15,%r15
1317. addq %r12,%rax
1318. adcq $0,%rdx
1319. movq %rax,8(%rdi)
1320. movq %rdx,16(%rdi)
1321. movq %r15,24(%rdi)
1322. movq -16(%rsi,%rbp,1),%rax
1323. leaq 48+8(%rsp),%rdi
1324. xorq %r10,%r10
1325. movq 8(%rdi),%r11
/* Prologue iteration of shift-n-add: double two buffer words
 * (lea reg,(%r14,%r10,2) = 2*word + saved top bit) and add a[i]^2;
 * the carry is parked in %r15 via sbb and replayed with neg/adc. */
1326. leaq (%r14,%r10,2),%r12
1327. shrq $63,%r10
1328. leaq (%rcx,%r11,2),%r13
1329. shrq $63,%r11
1330. orq %r10,%r13
1331. movq 16(%rdi),%r10
1332. movq %r11,%r14
1333. mulq %rax
1334. negq %r15
1335. movq 24(%rdi),%r11
1336. adcq %rax,%r12
1337. movq -8(%rsi,%rbp,1),%rax
1338. movq %r12,(%rdi)
1339. adcq %rdx,%r13
1340. leaq (%r14,%r10,2),%rbx
1341. movq %r13,8(%rdi)
1342. sbbq %r15,%r15
1343. shrq $63,%r10
1344. leaq (%rcx,%r11,2),%r8
1345. shrq $63,%r11
1346. orq %r10,%r8
1347. movq 32(%rdi),%r10
1348. movq %r11,%r14
1349. mulq %rax
1350. negq %r15
1351. movq 40(%rdi),%r11
1352. adcq %rax,%rbx
1353. movq 0(%rsi,%rbp,1),%rax
1354. movq %rbx,16(%rdi)
1355. adcq %rdx,%r8
1356. leaq 16(%rbp),%rbp
1357. movq %r8,24(%rdi)
1358. sbbq %r15,%r15
1359. leaq 64(%rdi),%rdi
1360. jmp .Lsqr4x_shift_n_add
1361. .align 32
/* Main doubling loop: 8 result words (64 bytes of %rdi) per iteration,
 * four square-and-shift groups; structure mirrors the prologue above. */
1362. .Lsqr4x_shift_n_add:
1363. leaq (%r14,%r10,2),%r12
1364. shrq $63,%r10
1365. leaq (%rcx,%r11,2),%r13
1366. shrq $63,%r11
1367. orq %r10,%r13
1368. movq -16(%rdi),%r10
1369. movq %r11,%r14
1370. mulq %rax
1371. negq %r15
1372. movq -8(%rdi),%r11
1373. adcq %rax,%r12
1374. movq -8(%rsi,%rbp,1),%rax
1375. movq %r12,-32(%rdi)
1376. adcq %rdx,%r13
1377. leaq (%r14,%r10,2),%rbx
1378. movq %r13,-24(%rdi)
1379. sbbq %r15,%r15
1380. shrq $63,%r10
1381. leaq (%rcx,%r11,2),%r8
1382. shrq $63,%r11
1383. orq %r10,%r8
1384. movq 0(%rdi),%r10
1385. movq %r11,%r14
1386. mulq %rax
1387. negq %r15
1388. movq 8(%rdi),%r11
1389. adcq %rax,%rbx
1390. movq 0(%rsi,%rbp,1),%rax
1391. movq %rbx,-16(%rdi)
1392. adcq %rdx,%r8
1393. leaq (%r14,%r10,2),%r12
1394. movq %r8,-8(%rdi)
1395. sbbq %r15,%r15
1396. shrq $63,%r10
1397. leaq (%rcx,%r11,2),%r13
1398. shrq $63,%r11
1399. orq %r10,%r13
1400. movq 16(%rdi),%r10
1401. movq %r11,%r14
1402. mulq %rax
1403. negq %r15
1404. movq 24(%rdi),%r11
1405. adcq %rax,%r12
1406. movq 8(%rsi,%rbp,1),%rax
1407. movq %r12,0(%rdi)
1408. adcq %rdx,%r13
1409. leaq (%r14,%r10,2),%rbx
1410. movq %r13,8(%rdi)
1411. sbbq %r15,%r15
1412. shrq $63,%r10
1413. leaq (%rcx,%r11,2),%r8
1414. shrq $63,%r11
1415. orq %r10,%r8
1416. movq 32(%rdi),%r10
1417. movq %r11,%r14
1418. mulq %rax
1419. negq %r15
1420. movq 40(%rdi),%r11
1421. adcq %rax,%rbx
1422. movq 16(%rsi,%rbp,1),%rax
1423. movq %rbx,16(%rdi)
1424. adcq %rdx,%r8
1425. movq %r8,24(%rdi)
1426. sbbq %r15,%r15
1427. leaq 64(%rdi),%rdi
1428. addq $32,%rbp
1429. jnz .Lsqr4x_shift_n_add
/* Epilogue: last two squares, no further buffer words to shift in. */
1430. leaq (%r14,%r10,2),%r12
1431. .byte 0x67
1432. shrq $63,%r10
1433. leaq (%rcx,%r11,2),%r13
1434. shrq $63,%r11
1435. orq %r10,%r13
1436. movq -16(%rdi),%r10
1437. movq %r11,%r14
1438. mulq %rax
1439. negq %r15
1440. movq -8(%rdi),%r11
1441. adcq %rax,%r12
1442. movq -8(%rsi),%rax
1443. movq %r12,-32(%rdi)
1444. adcq %rdx,%r13
1445. leaq (%r14,%r10,2),%rbx
1446. movq %r13,-24(%rdi)
1447. sbbq %r15,%r15
1448. shrq $63,%r10
1449. leaq (%rcx,%r11,2),%r8
1450. shrq $63,%r11
1451. orq %r10,%r8
1452. mulq %rax
1453. negq %r15
1454. adcq %rax,%rbx
1455. adcq %rdx,%r8
1456. movq %rbx,-16(%rdi)
1457. movq %r8,-8(%rdi)
/* movq %xmm2,%rbp — recover the modulus pointer stashed by the caller,
 * then fall straight through into __bn_sqr8x_reduction. */
1458. .byte 102,72,15,126,213
/*
 * __bn_sqr8x_reduction — Montgomery reduction of the 2*num-word square
 * sitting in the stack buffer, processed 8 limbs per outer iteration.
 * In: %rbp = modulus n, %r9 = num*8, %rdi/%rdx frame pointers,
 * 32+8(%rsp) = n0 (Montgomery constant -n^-1 mod 2^64).
 * Each .L8x_reduce round computes m = word * n0 and adds m*n so the low
 * word cancels; .L8x_tail then propagates m*n across the upper half.
 * Reached by fall-through from bn_sqr8x_internal and by direct call.
 */
1459. __bn_sqr8x_reduction:
1460. xorq %rax,%rax
1461. leaq (%r9,%rbp,1),%rcx
1462. leaq 48+8(%rsp,%r9,2),%rdx
/* 0+8(%rsp) = end of modulus, 8+8(%rsp) = end of product buffer. */
1463. movq %rcx,0+8(%rsp)
1464. leaq 48+8(%rsp,%r9,1),%rdi
1465. movq %rdx,8+8(%rsp)
1466. negq %r9
1467. jmp .L8x_reduction_loop
1468. .align 32
/* Outer loop: load the next 8 limbs of the product into r8..r15. */
1469. .L8x_reduction_loop:
1470. leaq (%rdi,%r9,1),%rdi
1471. .byte 0x66
1472. movq 0(%rdi),%rbx
1473. movq 8(%rdi),%r9
1474. movq 16(%rdi),%r10
1475. movq 24(%rdi),%r11
1476. movq 32(%rdi),%r12
1477. movq 40(%rdi),%r13
1478. movq 48(%rdi),%r14
1479. movq 56(%rdi),%r15
1480. movq %rax,(%rdx)
1481. leaq 64(%rdi),%rdi
1482. .byte 0x67
1483. movq %rbx,%r8
/* rbx = m0 = low_word * n0; 8 reduction rounds follow (%ecx counter). */
1484. imulq 32+8(%rsp),%rbx
1485. movq 0(%rbp),%rax
1486. movl $8,%ecx
1487. jmp .L8x_reduce
1488. .align 32
/* One round: add m*n[0..7]; the computed m values are saved on the
 * stack (48-8+8(%rsp,%rcx,8)) for reuse in the tail pass. */
1489. .L8x_reduce:
1490. mulq %rbx
1491. movq 8(%rbp),%rax
1492. negq %r8
1493. movq %rdx,%r8
1494. adcq $0,%r8
1495. mulq %rbx
1496. addq %rax,%r9
1497. movq 16(%rbp),%rax
1498. adcq $0,%rdx
1499. addq %r9,%r8
1500. movq %rbx,48-8+8(%rsp,%rcx,8)
1501. movq %rdx,%r9
1502. adcq $0,%r9
1503. mulq %rbx
1504. addq %rax,%r10
1505. movq 24(%rbp),%rax
1506. adcq $0,%rdx
1507. addq %r10,%r9
1508. movq 32+8(%rsp),%rsi
1509. movq %rdx,%r10
1510. adcq $0,%r10
1511. mulq %rbx
1512. addq %rax,%r11
1513. movq 32(%rbp),%rax
1514. adcq $0,%rdx
/* Next round's m is computed early (rsi = n0 * new low word) to hide
 * the imul latency behind the remaining multiplies. */
1515. imulq %r8,%rsi
1516. addq %r11,%r10
1517. movq %rdx,%r11
1518. adcq $0,%r11
1519. mulq %rbx
1520. addq %rax,%r12
1521. movq 40(%rbp),%rax
1522. adcq $0,%rdx
1523. addq %r12,%r11
1524. movq %rdx,%r12
1525. adcq $0,%r12
1526. mulq %rbx
1527. addq %rax,%r13
1528. movq 48(%rbp),%rax
1529. adcq $0,%rdx
1530. addq %r13,%r12
1531. movq %rdx,%r13
1532. adcq $0,%r13
1533. mulq %rbx
1534. addq %rax,%r14
1535. movq 56(%rbp),%rax
1536. adcq $0,%rdx
1537. addq %r14,%r13
1538. movq %rdx,%r14
1539. adcq $0,%r14
1540. mulq %rbx
1541. movq %rsi,%rbx
1542. addq %rax,%r15
1543. movq 0(%rbp),%rax
1544. adcq $0,%rdx
1545. addq %r15,%r14
1546. movq %rdx,%r15
1547. adcq $0,%r15
1548. decl %ecx
1549. jnz .L8x_reduce
1550. leaq 64(%rbp),%rbp
1551. xorq %rax,%rax
1552. movq 8+8(%rsp),%rdx
1553. cmpq 0+8(%rsp),%rbp
1554. jae .L8x_no_tail
/* Tail: fold the 8 saved m values across the remaining modulus words,
 * accumulating into the next 8 product limbs.  %rsi carries the borrow
 * between passes (sbb/neg idiom). */
1555. .byte 0x66
1556. addq 0(%rdi),%r8
1557. adcq 8(%rdi),%r9
1558. adcq 16(%rdi),%r10
1559. adcq 24(%rdi),%r11
1560. adcq 32(%rdi),%r12
1561. adcq 40(%rdi),%r13
1562. adcq 48(%rdi),%r14
1563. adcq 56(%rdi),%r15
1564. sbbq %rsi,%rsi
1565. movq 48+56+8(%rsp),%rbx
1566. movl $8,%ecx
1567. movq 0(%rbp),%rax
1568. jmp .L8x_tail
1569. .align 32
1570. .L8x_tail:
1571. mulq %rbx
1572. addq %rax,%r8
1573. movq 8(%rbp),%rax
1574. movq %r8,(%rdi)
1575. movq %rdx,%r8
1576. adcq $0,%r8
1577. mulq %rbx
1578. addq %rax,%r9
1579. movq 16(%rbp),%rax
1580. adcq $0,%rdx
1581. addq %r9,%r8
1582. leaq 8(%rdi),%rdi
1583. movq %rdx,%r9
1584. adcq $0,%r9
1585. mulq %rbx
1586. addq %rax,%r10
1587. movq 24(%rbp),%rax
1588. adcq $0,%rdx
1589. addq %r10,%r9
1590. movq %rdx,%r10
1591. adcq $0,%r10
1592. mulq %rbx
1593. addq %rax,%r11
1594. movq 32(%rbp),%rax
1595. adcq $0,%rdx
1596. addq %r11,%r10
1597. movq %rdx,%r11
1598. adcq $0,%r11
1599. mulq %rbx
1600. addq %rax,%r12
1601. movq 40(%rbp),%rax
1602. adcq $0,%rdx
1603. addq %r12,%r11
1604. movq %rdx,%r12
1605. adcq $0,%r12
1606. mulq %rbx
1607. addq %rax,%r13
1608. movq 48(%rbp),%rax
1609. adcq $0,%rdx
1610. addq %r13,%r12
1611. movq %rdx,%r13
1612. adcq $0,%r13
1613. mulq %rbx
1614. addq %rax,%r14
1615. movq 56(%rbp),%rax
1616. adcq $0,%rdx
1617. addq %r14,%r13
1618. movq %rdx,%r14
1619. adcq $0,%r14
1620. mulq %rbx
/* Fetch the next saved m value for the following tail round. */
1621. movq 48-16+8(%rsp,%rcx,8),%rbx
1622. addq %rax,%r15
1623. adcq $0,%rdx
1624. addq %r15,%r14
1625. movq 0(%rbp),%rax
1626. movq %rdx,%r15
1627. adcq $0,%r15
1628. decl %ecx
1629. jnz .L8x_tail
1630. leaq 64(%rbp),%rbp
1631. movq 8+8(%rsp),%rdx
1632. cmpq 0+8(%rsp),%rbp
1633. jae .L8x_tail_done
/* More modulus remains: replay the borrow and continue the tail. */
1634. movq 48+56+8(%rsp),%rbx
1635. negq %rsi
1636. movq 0(%rbp),%rax
1637. adcq 0(%rdi),%r8
1638. adcq 8(%rdi),%r9
1639. adcq 16(%rdi),%r10
1640. adcq 24(%rdi),%r11
1641. adcq 32(%rdi),%r12
1642. adcq 40(%rdi),%r13
1643. adcq 48(%rdi),%r14
1644. adcq 56(%rdi),%r15
1645. sbbq %rsi,%rsi
1646. movl $8,%ecx
1647. jmp .L8x_tail
1648. .align 32
/* Fold the top carry word saved at (%rdx) into the result; %rax ends
 * up as the final top-bit carry. */
1649. .L8x_tail_done:
1650. xorq %rax,%rax
1651. addq (%rdx),%r8
1652. adcq $0,%r9
1653. adcq $0,%r10
1654. adcq $0,%r11
1655. adcq $0,%r12
1656. adcq $0,%r13
1657. adcq $0,%r14
1658. adcq $0,%r15
1659. adcq $0,%rax
1660. negq %rsi
1661. .L8x_no_tail:
1662. adcq 0(%rdi),%r8
1663. adcq 8(%rdi),%r9
1664. adcq 16(%rdi),%r10
1665. adcq 24(%rdi),%r11
1666. adcq 32(%rdi),%r12
1667. adcq 40(%rdi),%r13
1668. adcq 48(%rdi),%r14
1669. adcq 56(%rdi),%r15
1670. adcq $0,%rax
1671. movq -8(%rbp),%rcx
1672. xorq %rsi,%rsi
/* movq %xmm2,%rbp — restore modulus pointer for the next outer round. */
1673. .byte 102,72,15,126,213
1674. movq %r8,0(%rdi)
1675. movq %r9,8(%rdi)
/* movq %xmm3,%r9 — restore -num*8. */
1676. .byte 102,73,15,126,217
1677. movq %r10,16(%rdi)
1678. movq %r11,24(%rdi)
1679. movq %r12,32(%rdi)
1680. movq %r13,40(%rdi)
1681. movq %r14,48(%rdi)
1682. movq %r15,56(%rdi)
1683. leaq 64(%rdi),%rdi
1684. cmpq %rdx,%rdi
1685. jb .L8x_reduction_loop
1686. .byte 0xf3,0xc3
1687. .cfi_endproc
1688. .size bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal — final conditional subtraction after reduction.
 * In: %rbp = modulus n, %rdi/%r9 locate the reduced value, %rax = carry
 * mask from the reduction (negated below to 0 or all-ones).
 * Computes result - (mask & n) in constant time: each modulus word is
 * NOT-ed and AND-ed with the mask, then added with carry — i.e. either
 * subtracts n or adds 0, with identical memory access pattern either
 * way (no secret-dependent branch).  Processes 4 words per iteration.
 */
1689. .type __bn_post4x_internal,@function
1690. .align 32
1691. __bn_post4x_internal:
1692. .cfi_startproc
1693. movq 0(%rbp),%r12
1694. leaq (%rdi,%r9,1),%rbx
1695. movq %r9,%rcx
/* movq %xmm1,%rdi — restore destination pointer rp. */
1696. .byte 102,72,15,126,207
1697. negq %rax
/* movq %xmm1,%rsi. */
1698. .byte 102,72,15,126,206
/* rcx = -(num*8)/32 = negative count of 4-word groups. */
1699. sarq $3+2,%rcx
1700. decq %r12
1701. xorq %r10,%r10
1702. movq 8(%rbp),%r13
1703. movq 16(%rbp),%r14
1704. movq 24(%rbp),%r15
1705. jmp .Lsqr4x_sub_entry
1706. .align 16
1707. .Lsqr4x_sub:
1708. movq 0(%rbp),%r12
1709. movq 8(%rbp),%r13
1710. movq 16(%rbp),%r14
1711. movq 24(%rbp),%r15
1712. .Lsqr4x_sub_entry:
1713. leaq 32(%rbp),%rbp
/* ~n & mask: with mask=-1 this is ~n, and x + ~n + 1 = x - n;
 * with mask=0 it adds 0 plus the incoming carry. */
1714. notq %r12
1715. notq %r13
1716. notq %r14
1717. notq %r15
1718. andq %rax,%r12
1719. andq %rax,%r13
1720. andq %rax,%r14
1721. andq %rax,%r15
/* negq %r10 reloads CF from the borrow saved by the previous group. */
1722. negq %r10
1723. adcq 0(%rbx),%r12
1724. adcq 8(%rbx),%r13
1725. adcq 16(%rbx),%r14
1726. adcq 24(%rbx),%r15
1727. movq %r12,0(%rdi)
1728. leaq 32(%rbx),%rbx
1729. movq %r13,8(%rdi)
1730. sbbq %r10,%r10
1731. movq %r14,16(%rdi)
1732. movq %r15,24(%rdi)
1733. leaq 32(%rdi),%rdi
1734. incq %rcx
1735. jnz .Lsqr4x_sub
/* Leave %r10 = num*8 and %r9 = -num*8 for the caller's next round. */
1736. movq %r9,%r10
1737. negq %r9
1738. .byte 0xf3,0xc3
1739. .cfi_endproc
1740. .size __bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery — public wrapper converting out of Montgomery form.
 * The optimized path requires num to be a multiple of 8: if %r9d (num)
 * has any of its low 3 bits set, return 0 so the caller falls back to
 * the generic C route; otherwise tail-jump into bn_from_mont8x.
 */
1741. .globl bn_from_montgomery
1742. .hidden bn_from_montgomery
1743. .type bn_from_montgomery,@function
1744. .align 32
1745. bn_from_montgomery:
1746. .cfi_startproc
1747. testl $7,%r9d
1748. jz bn_from_mont8x
/* num not divisible by 8: signal "not handled" with a 0 return. */
1749. xorl %eax,%eax
1750. .byte 0xf3,0xc3
1751. .cfi_endproc
1752. .size bn_from_montgomery,.-bn_from_montgomery
/*
 * bn_from_mont8x — convert a value out of Montgomery representation:
 * effectively a Montgomery multiplication by 1 followed by one
 * reduction pass.  Copies the input into the scratch frame with the
 * upper half zeroed (.Lmul_by_1), runs __bn_sqr[x]8x_reduction plus the
 * conditional subtraction (MULX/ADX variant chosen via cpuid), wipes
 * the scratch buffer, restores callee-saved registers and returns 1.
 * Prologue/stack logic is identical in shape to bn_power5 above.
 */
1753. .type bn_from_mont8x,@function
1754. .align 32
1755. bn_from_mont8x:
1756. .cfi_startproc
1757. .byte 0x67
1758. movq %rsp,%rax
1759. .cfi_def_cfa_register %rax
1760. pushq %rbx
1761. .cfi_offset %rbx,-16
1762. pushq %rbp
1763. .cfi_offset %rbp,-24
1764. pushq %r12
1765. .cfi_offset %r12,-32
1766. pushq %r13
1767. .cfi_offset %r13,-40
1768. pushq %r14
1769. .cfi_offset %r14,-48
1770. pushq %r15
1771. .cfi_offset %r15,-56
1772. .Lfrom_prologue:
/* Pick a frame below %rsp that does not alias rp modulo the page size;
 * same scheme as .Lpower5_prologue. */
1773. shll $3,%r9d
1774. leaq (%r9,%r9,2),%r10
1775. negq %r9
1776. movq (%r8),%r8
1777. leaq -320(%rsp,%r9,2),%r11
1778. movq %rsp,%rbp
1779. subq %rdi,%r11
1780. andq $4095,%r11
1781. cmpq %r11,%r10
1782. jb .Lfrom_sp_alt
1783. subq %r11,%rbp
1784. leaq -320(%rbp,%r9,2),%rbp
1785. jmp .Lfrom_sp_done
1786. .align 32
1787. .Lfrom_sp_alt:
1788. leaq 4096-320(,%r9,2),%r10
1789. leaq -320(%rbp,%r9,2),%rbp
1790. subq %r10,%r11
1791. movq $0,%r10
1792. cmovcq %r10,%r11
1793. subq %r11,%rbp
1794. .Lfrom_sp_done:
1795. andq $-64,%rbp
1796. movq %rsp,%r11
1797. subq %rbp,%r11
1798. andq $-4096,%r11
1799. leaq (%r11,%rbp,1),%rsp
/* Page-walk the newly claimed stack so guard pages get committed. */
1800. movq (%rsp),%r10
1801. cmpq %rbp,%rsp
1802. ja .Lfrom_page_walk
1803. jmp .Lfrom_page_walk_done
1804. .Lfrom_page_walk:
1805. leaq -4096(%rsp),%rsp
1806. movq (%rsp),%r10
1807. cmpq %rbp,%rsp
1808. ja .Lfrom_page_walk
1809. .Lfrom_page_walk_done:
1810. movq %r9,%r10
1811. negq %r9
/* 32(%rsp) = n0, 40(%rsp) = original %rsp for the epilogue. */
1812. movq %r8,32(%rsp)
1813. movq %rax,40(%rsp)
1814. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1815. .Lfrom_body:
1816. movq %r9,%r11
1817. leaq 48(%rsp),%rax
1818. pxor %xmm0,%xmm0
1819. jmp .Lmul_by_1
1820. .align 32
/* Copy 64 bytes of input per iteration into the low half of the frame
 * while zeroing the mirrored upper half (so the buffer holds a*1 as a
 * 2n-word product, ready for one reduction pass). */
1821. .Lmul_by_1:
1822. movdqu (%rsi),%xmm1
1823. movdqu 16(%rsi),%xmm2
1824. movdqu 32(%rsi),%xmm3
1825. movdqa %xmm0,(%rax,%r9,1)
1826. movdqu 48(%rsi),%xmm4
1827. movdqa %xmm0,16(%rax,%r9,1)
/* leaq 64(%rsi),%rsi — encoded as raw bytes by the generator. */
1828. .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
1829. movdqa %xmm1,(%rax)
1830. movdqa %xmm0,32(%rax,%r9,1)
1831. movdqa %xmm2,16(%rax)
1832. movdqa %xmm0,48(%rax,%r9,1)
1833. movdqa %xmm3,32(%rax)
1834. movdqa %xmm4,48(%rax)
1835. leaq 64(%rax),%rax
1836. subq $64,%r11
1837. jnz .Lmul_by_1
/* Stash rp/np/num in XMM (movq %rdi,%xmm1 / %rcx,%xmm2 / %r10,%xmm3)
 * for the reduction helpers, then dispatch on ADX|BMI2|BMI1 support. */
1838. .byte 102,72,15,110,207
1839. .byte 102,72,15,110,209
1840. .byte 0x67
1841. movq %rcx,%rbp
1842. .byte 102,73,15,110,218
1843. leaq OPENSSL_ia32cap_P(%rip),%r11
1844. movl 8(%r11),%r11d
1845. andl $0x80108,%r11d
1846. cmpl $0x80108,%r11d
1847. jne .Lfrom_mont_nox
/* MULX/ADX path. */
1848. leaq (%rax,%r9,1),%rdi
1849. call __bn_sqrx8x_reduction
1850. call __bn_postx4x_internal
1851. pxor %xmm0,%xmm0
1852. leaq 48(%rsp),%rax
1853. jmp .Lfrom_mont_zero
1854. .align 32
/* Legacy MUL/ADC path. */
1855. .Lfrom_mont_nox:
1856. call __bn_sqr8x_reduction
1857. call __bn_post4x_internal
1858. pxor %xmm0,%xmm0
1859. leaq 48(%rsp),%rax
1860. jmp .Lfrom_mont_zero
1861. .align 32
/* Scrub the scratch frame (it held secret intermediates). */
1862. .Lfrom_mont_zero:
1863. movq 40(%rsp),%rsi
1864. .cfi_def_cfa %rsi,8
1865. movdqa %xmm0,0(%rax)
1866. movdqa %xmm0,16(%rax)
1867. movdqa %xmm0,32(%rax)
1868. movdqa %xmm0,48(%rax)
1869. leaq 64(%rax),%rax
1870. subq $32,%r9
1871. jnz .Lfrom_mont_zero
/* Return 1 and restore callee-saved registers. */
1872. movq $1,%rax
1873. movq -48(%rsi),%r15
1874. .cfi_restore %r15
1875. movq -40(%rsi),%r14
1876. .cfi_restore %r14
1877. movq -32(%rsi),%r13
1878. .cfi_restore %r13
1879. movq -24(%rsi),%r12
1880. .cfi_restore %r12
1881. movq -16(%rsi),%rbp
1882. .cfi_restore %rbp
1883. movq -8(%rsi),%rbx
1884. .cfi_restore %rbx
1885. leaq (%rsi),%rsp
1886. .cfi_def_cfa_register %rsp
1887. .Lfrom_epilogue:
1888. .byte 0xf3,0xc3
1889. .cfi_endproc
1890. .size bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5 — MULX/ADX (BMI2) variant of the Montgomery
 * multiply-with-gather entry point.  Pure wrapper: saves callee-saved
 * registers, carves out a page-aligned scratch frame that avoids
 * aliasing rp modulo 4 KB, page-walks it, stores n0 and the saved %rsp,
 * then delegates all arithmetic to mulx4x_internal.  Returns 1.
 */
1891. .type bn_mulx4x_mont_gather5,@function
1892. .align 32
1893. bn_mulx4x_mont_gather5:
1894. .cfi_startproc
1895. movq %rsp,%rax
1896. .cfi_def_cfa_register %rax
1897. .Lmulx4x_enter:
1898. pushq %rbx
1899. .cfi_offset %rbx,-16
1900. pushq %rbp
1901. .cfi_offset %rbp,-24
1902. pushq %r12
1903. .cfi_offset %r12,-32
1904. pushq %r13
1905. .cfi_offset %r13,-40
1906. pushq %r14
1907. .cfi_offset %r14,-48
1908. pushq %r15
1909. .cfi_offset %r15,-56
1910. .Lmulx4x_prologue:
/* Frame placement: same anti-aliasing scheme as the other wrappers. */
1911. shll $3,%r9d
1912. leaq (%r9,%r9,2),%r10
1913. negq %r9
1914. movq (%r8),%r8
1915. leaq -320(%rsp,%r9,2),%r11
1916. movq %rsp,%rbp
1917. subq %rdi,%r11
1918. andq $4095,%r11
1919. cmpq %r11,%r10
1920. jb .Lmulx4xsp_alt
1921. subq %r11,%rbp
1922. leaq -320(%rbp,%r9,2),%rbp
1923. jmp .Lmulx4xsp_done
1924. .Lmulx4xsp_alt:
1925. leaq 4096-320(,%r9,2),%r10
1926. leaq -320(%rbp,%r9,2),%rbp
1927. subq %r10,%r11
1928. movq $0,%r10
1929. cmovcq %r10,%r11
1930. subq %r11,%rbp
1931. .Lmulx4xsp_done:
1932. andq $-64,%rbp
1933. movq %rsp,%r11
1934. subq %rbp,%r11
1935. andq $-4096,%r11
1936. leaq (%r11,%rbp,1),%rsp
/* Touch each new page so guard pages are committed before use. */
1937. movq (%rsp),%r10
1938. cmpq %rbp,%rsp
1939. ja .Lmulx4x_page_walk
1940. jmp .Lmulx4x_page_walk_done
1941. .Lmulx4x_page_walk:
1942. leaq -4096(%rsp),%rsp
1943. movq (%rsp),%r10
1944. cmpq %rbp,%rsp
1945. ja .Lmulx4x_page_walk
1946. .Lmulx4x_page_walk_done:
/* 32(%rsp) = n0, 40(%rsp) = original %rsp for unwinding/epilogue. */
1947. movq %r8,32(%rsp)
1948. movq %rax,40(%rsp)
1949. .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1950. .Lmulx4x_body:
1951. call mulx4x_internal
/* Restore callee-saved registers and return 1. */
1952. movq 40(%rsp),%rsi
1953. .cfi_def_cfa %rsi,8
1954. movq $1,%rax
1955. movq -48(%rsi),%r15
1956. .cfi_restore %r15
1957. movq -40(%rsi),%r14
1958. .cfi_restore %r14
1959. movq -32(%rsi),%r13
1960. .cfi_restore %r13
1961. movq -24(%rsi),%r12
1962. .cfi_restore %r12
1963. movq -16(%rsi),%rbp
1964. .cfi_restore %rbp
1965. movq -8(%rsi),%rbx
1966. .cfi_restore %rbx
1967. leaq (%rsi),%rsp
1968. .cfi_def_cfa_register %rsp
1969. .Lmulx4x_epilogue:
1970. .byte 0xf3,0xc3
1971. .cfi_endproc
1972. .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
#-----------------------------------------------------------------------
# mulx4x_internal — Montgomery multiplication core for the BMI2/ADX path,
# processing 4 limbs of a[] and n[] per inner iteration using MULX with
# the dual ADCX/ADOX carry chains.
#
# Register/stack contract (as visible in this body; confirm against the
# callers bn_mulx4x_mont_gather5 / bn_powerx5):
#   %rsi        = a[] (multiplicand)
#   %rdx        = scattered power table; the b[] multiplier words are
#                 selected from it below (256-byte stride per entry)
#   %rcx        = n[] (modulus)
#   %r9         = num*8, the operand size in bytes
#   8(%rax)     = gather index used to pick the table entry
#   %rdi        = result pointer, stashed at 56+8(%rsp)
#   32+8(%rsp)  = presumably n0 = -n^{-1} mod 2^64 (Montgomery constant;
#                 used via imulq below — TODO confirm against caller)
#
# The table entry is selected in constant time: pcmpeqd builds sixteen
# equality masks against the broadcast index, then pand/por folds all
# candidate table lines together, so no secret-dependent address is
# ever formed.  Tail jumps into .Lsqrx4x_sub_entry (shared conditional-
# subtraction code in __bn_postx4x_internal).
#-----------------------------------------------------------------------
.type mulx4x_internal,@function
.align 32
mulx4x_internal:
.cfi_startproc
movq %r9,8(%rsp)                        # save num*8
movq %r9,%r10
negq %r9
shlq $5,%r9
negq %r10
leaq 128(%rdx,%r9,1),%r13               # end-of-table sentinel
shrq $5+5,%r9
movd 8(%rax),%xmm5                      # gather index
subq $1,%r9
leaq .Linc(%rip),%rax                   # {0,0,1,1}/{2,2,2,2} increment vectors
movq %r13,16+8(%rsp)
movq %r9,24+8(%rsp)                     # outer-loop trip count
movq %rdi,56+8(%rsp)                    # save result pointer
# --- build 16 comparison masks (index == 0..31, two lanes per xmm) ---
movdqa 0(%rax),%xmm0
movdqa 16(%rax),%xmm1
leaq 88-112(%rsp,%r10,1),%r10           # mask scratch area on stack
leaq 128(%rdx),%rdi                     # biased table pointer
pshufd $0,%xmm5,%xmm5                   # broadcast index to all dwords
movdqa %xmm1,%xmm4
.byte 0x67                              # addr-size prefix: padding/alignment tweak
movdqa %xmm1,%xmm2
.byte 0x67
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,112(%r10)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,128(%r10)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,144(%r10)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,160(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,176(%r10)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,192(%r10)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,208(%r10)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,224(%r10)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,240(%r10)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,256(%r10)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,272(%r10)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,288(%r10)
movdqa %xmm4,%xmm3
.byte 0x67
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,304(%r10)
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,320(%r10)
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,336(%r10)
# --- constant-time gather of b[0]: AND every table line with its mask,
# --- OR everything together; exactly one mask is all-ones ---
pand 64(%rdi),%xmm0
pand 80(%rdi),%xmm1
pand 96(%rdi),%xmm2
movdqa %xmm3,352(%r10)
pand 112(%rdi),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -128(%rdi),%xmm4
movdqa -112(%rdi),%xmm5
movdqa -96(%rdi),%xmm2
pand 112(%r10),%xmm4
movdqa -80(%rdi),%xmm3
pand 128(%r10),%xmm5
por %xmm4,%xmm0
pand 144(%r10),%xmm2
por %xmm5,%xmm1
pand 160(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa -64(%rdi),%xmm4
movdqa -48(%rdi),%xmm5
movdqa -32(%rdi),%xmm2
pand 176(%r10),%xmm4
movdqa -16(%rdi),%xmm3
pand 192(%r10),%xmm5
por %xmm4,%xmm0
pand 208(%r10),%xmm2
por %xmm5,%xmm1
pand 224(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
movdqa 0(%rdi),%xmm4
movdqa 16(%rdi),%xmm5
movdqa 32(%rdi),%xmm2
pand 240(%r10),%xmm4
movdqa 48(%rdi),%xmm3
pand 256(%r10),%xmm5
por %xmm4,%xmm0
pand 272(%r10),%xmm2
por %xmm5,%xmm1
pand 288(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
pxor %xmm1,%xmm0
pshufd $0x4e,%xmm0,%xmm1                # fold high qword onto low qword
por %xmm1,%xmm0
leaq 256(%rdi),%rdi                     # advance to next table column
.byte 102,72,15,126,194                 # movq %xmm0,%rdx — gathered b[0]
# --- first 4-limb column: t = a[0..3]*b[0], start Montgomery reduction ---
leaq 64+32+8(%rsp),%rbx                 # tp (temporary product) pointer
movq %rdx,%r9                           # keep b[0]
mulxq 0(%rsi),%r8,%rax
mulxq 8(%rsi),%r11,%r12
addq %rax,%r11
mulxq 16(%rsi),%rax,%r13
adcq %rax,%r12
adcq $0,%r13
mulxq 24(%rsi),%rax,%r14
movq %r8,%r15
imulq 32+8(%rsp),%r8                    # m = t[0]*n0 (reduction multiplier)
xorq %rbp,%rbp                          # rbp = 0, also clears CF/OF
movq %r8,%rdx
movq %rdi,8+8(%rsp)
leaq 32(%rsi),%rsi
adcxq %rax,%r13
adcxq %rbp,%r14
mulxq 0(%rcx),%rax,%r10                 # + m*n[0..3]
adcxq %rax,%r15                         # t[0]+m*n[0] -> discarded (known 0)
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 16(%rcx),%rax,%r12
movq 24+8(%rsp),%rdi                    # inner-loop counter
movq %r10,-32(%rbx)
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                           # restore b[0]
movq %r11,-24(%rbx)
adcxq %rax,%r12
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r12,-16(%rbx)
jmp .Lmulx4x_1st
.align 32
# First pass over a[]: tp = a*b[0] + m*n, 4 limbs per iteration.
.Lmulx4x_1st:
adcxq %rbp,%r15
mulxq 0(%rsi),%r10,%rax
adcxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67                         # padding prefixes (decode-alignment tweak)
movq %r8,%rdx                           # switch to reduction multiplier m
adcxq %rax,%r13
adcxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
movq %r11,-32(%rbx)
adoxq %r15,%r13
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                           # back to b[0]
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
leaq 32(%rcx),%rcx
movq %r13,-16(%rbx)
decq %rdi
jnz .Lmulx4x_1st
movq 8(%rsp),%rax                       # num*8
adcq %rbp,%r15
leaq (%rsi,%rax,1),%rsi                 # rewind a[]
addq %r15,%r14
movq 8+8(%rsp),%rdi                     # table pointer
adcq %rbp,%rbp                          # top-most carry
movq %r14,-8(%rbx)
jmp .Lmulx4x_outer
.align 32
# Outer loop: gather next b[i] (same constant-time mask scheme, masks
# reloaded from the stack scratch area), then tp += a*b[i] + m*n.
.Lmulx4x_outer:
leaq 16-256(%rbx),%r10                  # rebase mask area relative to tp
pxor %xmm4,%xmm4
.byte 0x67,0x67
pxor %xmm5,%xmm5
movdqa -128(%rdi),%xmm0
movdqa -112(%rdi),%xmm1
movdqa -96(%rdi),%xmm2
pand 256(%r10),%xmm0
movdqa -80(%rdi),%xmm3
pand 272(%r10),%xmm1
por %xmm0,%xmm4
pand 288(%r10),%xmm2
por %xmm1,%xmm5
pand 304(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%rdi),%xmm0
movdqa -48(%rdi),%xmm1
movdqa -32(%rdi),%xmm2
pand 320(%r10),%xmm0
movdqa -16(%rdi),%xmm3
pand 336(%r10),%xmm1
por %xmm0,%xmm4
pand 352(%r10),%xmm2
por %xmm1,%xmm5
pand 368(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%rdi),%xmm0
movdqa 16(%rdi),%xmm1
movdqa 32(%rdi),%xmm2
pand 384(%r10),%xmm0
movdqa 48(%rdi),%xmm3
pand 400(%r10),%xmm1
por %xmm0,%xmm4
pand 416(%r10),%xmm2
por %xmm1,%xmm5
pand 432(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%rdi),%xmm0
movdqa 80(%rdi),%xmm1
movdqa 96(%rdi),%xmm2
pand 448(%r10),%xmm0
movdqa 112(%rdi),%xmm3
pand 464(%r10),%xmm1
por %xmm0,%xmm4
pand 480(%r10),%xmm2
por %xmm1,%xmm5
pand 496(%r10),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
pshufd $0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
leaq 256(%rdi),%rdi
.byte 102,72,15,126,194                 # movq %xmm0,%rdx — gathered b[i]
movq %rbp,(%rbx)                        # store previous top carry
leaq 32(%rbx,%rax,1),%rbx               # rewind tp
mulxq 0(%rsi),%r8,%r11
xorq %rbp,%rbp                          # rbp = 0, clear CF/OF
movq %rdx,%r9
mulxq 8(%rsi),%r14,%r12
adoxq -32(%rbx),%r8                     # accumulate previous tp
adcxq %r14,%r11
mulxq 16(%rsi),%r15,%r13
adoxq -24(%rbx),%r11
adcxq %r15,%r12
mulxq 24(%rsi),%rdx,%r14
adoxq -16(%rbx),%r12
adcxq %rdx,%r13
leaq (%rcx,%rax,1),%rcx                 # rewind n[]
leaq 32(%rsi),%rsi
adoxq -8(%rbx),%r13
adcxq %rbp,%r14
adoxq %rbp,%r14
movq %r8,%r15
imulq 32+8(%rsp),%r8                    # m = t[0]*n0
movq %r8,%rdx
xorq %rbp,%rbp
movq %rdi,8+8(%rsp)
mulxq 0(%rcx),%rax,%r10
adcxq %rax,%r15                         # low limb cancels
adoxq %r11,%r10
mulxq 8(%rcx),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
mulxq 16(%rcx),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx
movq 24+8(%rsp),%rdi                    # inner-loop counter
movq %r10,-32(%rbx)
adcxq %rax,%r12
movq %r11,-24(%rbx)
adoxq %rbp,%r15
movq %r12,-16(%rbx)
leaq 32(%rcx),%rcx
jmp .Lmulx4x_inner
.align 32
# Inner loop: tp = tp + a*b[i] + m*n, 4 limbs per iteration.
.Lmulx4x_inner:
mulxq 0(%rsi),%r10,%rax
adcxq %rbp,%r15
adoxq %r14,%r10
mulxq 8(%rsi),%r11,%r14
adcxq 0(%rbx),%r10
adoxq %rax,%r11
mulxq 16(%rsi),%r12,%rax
adcxq 8(%rbx),%r11
adoxq %r14,%r12
mulxq 24(%rsi),%r13,%r14
movq %r8,%rdx                           # switch to m
adcxq 16(%rbx),%r12
adoxq %rax,%r13
adcxq 24(%rbx),%r13
adoxq %rbp,%r14
leaq 32(%rsi),%rsi
leaq 32(%rbx),%rbx
adcxq %rbp,%r14
adoxq %r15,%r10
mulxq 0(%rcx),%rax,%r15
adcxq %rax,%r10
adoxq %r15,%r11
mulxq 8(%rcx),%rax,%r15
adcxq %rax,%r11
adoxq %r15,%r12
mulxq 16(%rcx),%rax,%r15
movq %r10,-40(%rbx)
adcxq %rax,%r12
adoxq %r15,%r13
movq %r11,-32(%rbx)
mulxq 24(%rcx),%rax,%r15
movq %r9,%rdx                           # back to b[i]
leaq 32(%rcx),%rcx
movq %r12,-24(%rbx)
adcxq %rax,%r13
adoxq %rbp,%r15
movq %r13,-16(%rbx)
decq %rdi
jnz .Lmulx4x_inner
movq 0+8(%rsp),%rax                     # num*8
adcq %rbp,%r15
subq 0(%rbx),%rdi                       # rdi is 0 here; sets CF from tp top
movq 8+8(%rsp),%rdi                     # table pointer
movq 16+8(%rsp),%r10                    # end-of-table sentinel
adcq %r15,%r14
leaq (%rsi,%rax,1),%rsi                 # rewind a[]
adcq %rbp,%rbp                          # collect top carry
movq %r14,-8(%rbx)
cmpq %r10,%rdi
jb .Lmulx4x_outer
# --- all b[] words consumed: set up final conditional subtraction,
# --- then fall into the shared .Lsqrx4x_sub_entry tail ---
movq -8(%rcx),%r10                      # top limb of n[]
movq %rbp,%r8                           # top carry
movq (%rcx,%rax,1),%r12
leaq (%rcx,%rax,1),%rbp                 # rewind n[]
movq %rax,%rcx
leaq (%rbx,%rax,1),%rdi                 # rewind tp
xorl %eax,%eax
xorq %r15,%r15
subq %r14,%r10                          # borrow if top limb of tp > n's
adcq %r15,%r15
orq %r15,%r8
sarq $3+2,%rcx                          # loop counter = -num/4 (32B chunks)
subq %r8,%rax                           # rax = 0 or -1: subtract-n mask
movq 56+8(%rsp),%rdx                    # result pointer
decq %r12                               # prime borrow chain
movq 8(%rbp),%r13
xorq %r8,%r8
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqrx4x_sub_entry
.cfi_endproc
.size mulx4x_internal,.-mulx4x_internal
#-----------------------------------------------------------------------
# bn_powerx5 — BMI2/ADX variant of bn_power5: performs five consecutive
# Montgomery squarings (each __bn_sqrx8x_internal + __bn_postx4x_internal
# pair) followed by one Montgomery multiplication via mulx4x_internal —
# i.e. one window step of fixed-window exponentiation (raise to the 32nd
# power, then multiply by the gathered table entry; semantics inferred
# from the call sequence — confirm against bn_exp.c caller).
#
# Arguments follow the SysV convention of its sibling bn_power5:
#   %rdi=rp, %rsi=ap, %rdx=table, %rcx=np, %r8=&n0, %r9=num (words).
# All callee-saved GPRs are pushed; a safely-aliased scratch frame is
# carved below %rsp with a page-walk probe so guard pages are touched
# in order.
#-----------------------------------------------------------------------
.type bn_powerx5,@function
.align 32
bn_powerx5:
.cfi_startproc
movq %rsp,%rax                          # keep original rsp for unwinding
.cfi_def_cfa_register %rax
.Lpowerx5_enter:
pushq %rbx
.cfi_offset %rbx,-16
pushq %rbp
.cfi_offset %rbp,-24
pushq %r12
.cfi_offset %r12,-32
pushq %r13
.cfi_offset %r13,-40
pushq %r14
.cfi_offset %r14,-48
pushq %r15
.cfi_offset %r15,-56
.Lpowerx5_prologue:
shll $3,%r9d                            # num *= 8 (bytes)
leaq (%r9,%r9,2),%r10                   # 3*num
negq %r9
movq (%r8),%r8                          # load n0 value
# --- choose a frame position that does not alias rp modulo 4096 ---
leaq -320(%rsp,%r9,2),%r11
movq %rsp,%rbp
subq %rdi,%r11
andq $4095,%r11
cmpq %r11,%r10
jb .Lpwrx_sp_alt
subq %r11,%rbp
leaq -320(%rbp,%r9,2),%rbp              # frame = rsp - offset - 2*num - 320
jmp .Lpwrx_sp_done
.align 32
.Lpwrx_sp_alt:
leaq 4096-320(,%r9,2),%r10
leaq -320(%rbp,%r9,2),%rbp
subq %r10,%r11
movq $0,%r10
cmovcq %r10,%r11
subq %r11,%rbp
.Lpwrx_sp_done:
andq $-64,%rbp                          # 64-byte align the frame
movq %rsp,%r11
subq %rbp,%r11
andq $-4096,%r11
leaq (%r11,%rbp,1),%rsp
# --- probe each page down to the new rsp (stack-guard safety) ---
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lpwrx_page_walk
jmp .Lpwrx_page_walk_done
.Lpwrx_page_walk:
leaq -4096(%rsp),%rsp
movq (%rsp),%r10
cmpq %rbp,%rsp
ja .Lpwrx_page_walk
.Lpwrx_page_walk_done:
movq %r9,%r10
negq %r9                                # r9 = num*8 again
pxor %xmm0,%xmm0
# stash pointers in xmm regs so they survive the internal calls:
.byte 102,72,15,110,207                 # movq %rdi,%xmm1 (rp)
.byte 102,72,15,110,209                 # movq %rcx,%xmm2 (np)
.byte 102,73,15,110,218                 # movq %r10,%xmm3
.byte 102,72,15,110,226                 # movq %rdx,%xmm4 (table)
movq %r8,32(%rsp)                       # n0
movq %rax,40(%rsp)                      # saved original rsp
.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:
# five squarings ...
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal
call __bn_postx4x_internal
# ... then one multiplication by the gathered power
movq %r10,%r9
movq %rsi,%rdi
.byte 102,72,15,126,209                 # movq %xmm2,%rcx (np)
.byte 102,72,15,126,226                 # movq %xmm4,%rdx (table)
movq 40(%rsp),%rax
call mulx4x_internal
movq 40(%rsp),%rsi                      # original rsp
.cfi_def_cfa %rsi,8
movq $1,%rax                            # return 1 (success)
movq -48(%rsi),%r15
.cfi_restore %r15
movq -40(%rsi),%r14
.cfi_restore %r14
movq -32(%rsi),%r13
.cfi_restore %r13
movq -24(%rsi),%r12
.cfi_restore %r12
movq -16(%rsi),%rbp
.cfi_restore %rbp
movq -8(%rsi),%rbx
.cfi_restore %rbx
leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lpowerx5_epilogue:
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size bn_powerx5,.-bn_powerx5
#-----------------------------------------------------------------------
# bn_sqrx8x_internal / __bn_sqrx8x_internal — MULX/ADCX/ADOX squaring
# core, 8 limbs per stride, followed by __bn_sqrx8x_reduction which
# performs the Montgomery reduction of the double-width square.
#
# Contract visible in this body (set up by bn_powerx5 / the mont5 glue):
#   %rsi = a[], %r9 = num*8, %rbp = a-end, result/temporary vector lives
#   at 48+8(%rsp); 32+8(%rsp) holds n0, xmm3 holds num*8 and xmm2 the
#   modulus pointer (reloaded below via hand-encoded movq xmm->gpr).
# Strategy: compute all cross products a[i]*a[j] (i<j), double them via
# the shift-and-add pass (.Lsqrx4x_shift_n_add) while adding the squares
# a[i]^2 on the diagonal, then reduce 8 limbs per iteration.
#-----------------------------------------------------------------------
.globl bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.type bn_sqrx8x_internal,@function
.align 32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
.cfi_startproc
leaq 48+8(%rsp),%rdi                    # tp (temporary result)
leaq (%rsi,%r9,1),%rbp                  # end of a[]
movq %r9,0+8(%rsp)
movq %rbp,8+8(%rsp)
jmp .Lsqr8x_zero_start
.align 32
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00  # multi-byte nop padding
# Zero the 2*num-sized temporary area, 128 bytes per iteration.
.Lsqrx8x_zero:
.byte 0x3e                              # DS prefix (padding/branch hint)
movdqa %xmm0,0(%rdi)
movdqa %xmm0,16(%rdi)
movdqa %xmm0,32(%rdi)
movdqa %xmm0,48(%rdi)
.Lsqr8x_zero_start:
movdqa %xmm0,64(%rdi)
movdqa %xmm0,80(%rdi)
movdqa %xmm0,96(%rdi)
movdqa %xmm0,112(%rdi)
leaq 128(%rdi),%rdi
subq $64,%r9
jnz .Lsqrx8x_zero
movq 0(%rsi),%rdx                       # a[0] is the first multiplier
xorq %r10,%r10
xorq %r11,%r11
xorq %r12,%r12
xorq %r13,%r13
xorq %r14,%r14
xorq %r15,%r15
leaq 48+8(%rsp),%rdi
xorq %rbp,%rbp                          # constant zero for carry mop-up
jmp .Lsqrx8x_outer_loop
.align 32
# Cross products of the current 8-limb window: a[0]*a[1..7], then
# a[1]*a[2..7], ... accumulating into tp with dual carry chains.
.Lsqrx8x_outer_loop:
mulxq 8(%rsi),%r8,%rax
adcxq %r9,%r8
adoxq %rax,%r10
mulxq 16(%rsi),%r9,%rax
adcxq %r10,%r9
adoxq %rax,%r11
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00  # mulxq 24(%rsi),%r10,%rax (hand-encoded)
adcxq %r11,%r10
adoxq %rax,%r12
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00  # mulxq 32(%rsi),%r11,%rax (hand-encoded)
adcxq %r12,%r11
adoxq %rax,%r13
mulxq 40(%rsi),%r12,%rax
adcxq %r13,%r12
adoxq %rax,%r14
mulxq 48(%rsi),%r13,%rax
adcxq %r14,%r13
adoxq %r15,%rax
mulxq 56(%rsi),%r14,%r15
movq 8(%rsi),%rdx                       # next multiplier a[1]
adcxq %rax,%r14
adoxq %rbp,%r15
adcq 64(%rdi),%r15
movq %r8,8(%rdi)
movq %r9,16(%rdi)
sbbq %rcx,%rcx                          # capture top carry
xorq %rbp,%rbp                          # reset CF/OF
mulxq 16(%rsi),%r8,%rbx
mulxq 24(%rsi),%r9,%rax
adcxq %r10,%r8
adoxq %rbx,%r9
mulxq 32(%rsi),%r10,%rbx
adcxq %r11,%r9
adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00  # mulxq 40(%rsi),%r11,%rax
adcxq %r12,%r10
adoxq %rbx,%r11
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00  # mulxq 48(%rsi),%r12,%rbx
adcxq %r13,%r11
adoxq %r14,%r12
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00  # mulxq 56(%rsi),%r13,%r14
movq 16(%rsi),%rdx                      # next multiplier a[2]
adcxq %rax,%r12
adoxq %rbx,%r13
adcxq %r15,%r13
adoxq %rbp,%r14
adcxq %rbp,%r14
movq %r8,24(%rdi)
movq %r9,32(%rdi)
mulxq 24(%rsi),%r8,%rbx
mulxq 32(%rsi),%r9,%rax
adcxq %r10,%r8
adoxq %rbx,%r9
mulxq 40(%rsi),%r10,%rbx
adcxq %r11,%r9
adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00  # mulxq 48(%rsi),%r11,%rax
adcxq %r12,%r10
adoxq %r13,%r11
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00  # mulxq 56(%rsi),%r12,%r13
.byte 0x3e
movq 24(%rsi),%rdx                      # next multiplier a[3]
adcxq %rbx,%r11
adoxq %rax,%r12
adcxq %r14,%r12
movq %r8,40(%rdi)
movq %r9,48(%rdi)
mulxq 32(%rsi),%r8,%rax
adoxq %rbp,%r13
adcxq %rbp,%r13
mulxq 40(%rsi),%r9,%rbx
adcxq %r10,%r8
adoxq %rax,%r9
mulxq 48(%rsi),%r10,%rax
adcxq %r11,%r9
adoxq %r12,%r10
mulxq 56(%rsi),%r11,%r12
movq 32(%rsi),%rdx                      # a[4]
movq 40(%rsi),%r14                      # preload a[5]
adcxq %rbx,%r10
adoxq %rax,%r11
movq 48(%rsi),%r15                      # preload a[6]
adcxq %r13,%r11
adoxq %rbp,%r12
adcxq %rbp,%r12
movq %r8,56(%rdi)
movq %r9,64(%rdi)
# remaining cross products of the window, operands now in registers
mulxq %r14,%r9,%rax                     # a[4]*a[5]
movq 56(%rsi),%r8                       # preload a[7]
adcxq %r10,%r9
mulxq %r15,%r10,%rbx                    # a[4]*a[6]
adoxq %rax,%r10
adcxq %r11,%r10
mulxq %r8,%r11,%rax                     # a[4]*a[7]
movq %r14,%rdx
adoxq %rbx,%r11
adcxq %r12,%r11
adcxq %rbp,%rax
mulxq %r15,%r14,%rbx                    # a[5]*a[6]
mulxq %r8,%r12,%r13                     # a[5]*a[7]
movq %r15,%rdx
leaq 64(%rsi),%rsi                      # advance window
adcxq %r14,%r11
adoxq %rbx,%r12
adcxq %rax,%r12
adoxq %rbp,%r13
.byte 0x67,0x67                         # padding prefixes
mulxq %r8,%r8,%r14                      # a[6]*a[7]
adcxq %r8,%r13
adcxq %rbp,%r14
cmpq 8+8(%rsp),%rsi                     # reached end of a[]?
je .Lsqrx8x_outer_break
negq %rcx                               # restore carry from sbb above
movq $-8,%rcx
movq %rbp,%r15
movq 64(%rdi),%r8
# fold previously stored tp limbs into the running window
adcxq 72(%rdi),%r9
adcxq 80(%rdi),%r10
adcxq 88(%rdi),%r11
adcq 96(%rdi),%r12
adcq 104(%rdi),%r13
adcq 112(%rdi),%r14
adcq 120(%rdi),%r15
leaq (%rsi),%rbp                        # second operand pointer
leaq 128(%rdi),%rdi
sbbq %rax,%rax                          # save carry
movq -64(%rsi),%rdx
movq %rax,16+8(%rsp)
movq %rdi,24+8(%rsp)
xorl %eax,%eax                          # clear CF/OF
jmp .Lsqrx8x_loop
.align 32
# tp += a[window] * a[j..], one 8-limb row per pass of this loop.
.Lsqrx8x_loop:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00  # mulxq 32(%rbp),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rbp),%rax,%r14
movq %rbx,(%rdi,%rcx,8)                 # store completed limb
movl $0,%ebx
adcxq %rax,%r13
adoxq %r15,%r14
.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00  # mulxq 56(%rbp),%rax,%r15
movq 8(%rsi,%rcx,8),%rdx                # next multiplier
adcxq %rax,%r14
adoxq %rbx,%r15
adcxq %rbx,%r15
.byte 0x67
incq %rcx                               # rcx counts -8..0
jnz .Lsqrx8x_loop
leaq 64(%rbp),%rbp
movq $-8,%rcx
cmpq 8+8(%rsp),%rbp                     # exhausted second operand?
je .Lsqrx8x_break
subq 16+8(%rsp),%rbx                    # restore saved carry
.byte 0x66                              # padding prefix
movq -64(%rsi),%rdx
adcxq 0(%rdi),%r8
adcxq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
.byte 0x67
sbbq %rax,%rax                          # save carry again
xorl %ebx,%ebx
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_loop
.align 32
# Row finished: propagate final carries into tp and either restart
# the outer loop (next 8-limb window) or copy the tail forward.
.Lsqrx8x_break:
xorq %rbp,%rbp
subq 16+8(%rsp),%rbx                    # restore saved carry
adcxq %rbp,%r8
movq 24+8(%rsp),%rcx                    # saved tp position
adcxq %rbp,%r9
movq 0(%rsi),%rdx
adcq $0,%r10
movq %r8,0(%rdi)
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
cmpq %rcx,%rdi
je .Lsqrx8x_outer_loop
movq %r9,8(%rdi)
movq 8(%rcx),%r9
movq %r10,16(%rdi)
movq 16(%rcx),%r10
movq %r11,24(%rdi)
movq 24(%rcx),%r11
movq %r12,32(%rdi)
movq 32(%rcx),%r12
movq %r13,40(%rdi)
movq 40(%rcx),%r13
movq %r14,48(%rdi)
movq 48(%rcx),%r14
movq %r15,56(%rdi)
movq 56(%rcx),%r15
movq %rcx,%rdi
jmp .Lsqrx8x_outer_loop
.align 32
# All cross products done: store the top window, then double everything
# and add the diagonal squares a[i]^2.
.Lsqrx8x_outer_break:
movq %r9,72(%rdi)
.byte 102,72,15,126,217                 # movq %xmm3,%rcx — num*8
movq %r10,80(%rdi)
movq %r11,88(%rdi)
movq %r12,96(%rdi)
movq %r13,104(%rdi)
movq %r14,112(%rdi)
leaq 48+8(%rsp),%rdi                    # back to start of tp
movq (%rsi,%rcx,1),%rdx                 # a[0]
movq 8(%rdi),%r11
xorq %r10,%r10
movq 0+8(%rsp),%r9                      # num*8
adoxq %r11,%r11                         # OF chain doubles the cross terms
movq 16(%rdi),%r12
movq 24(%rdi),%r13
.align 32
# tp[2i..2i+1] = 2*tp[...] + a[i]^2 with carry in CF, doubling in OF.
.Lsqrx4x_shift_n_add:
mulxq %rdx,%rax,%rbx                    # a[i]^2
adoxq %r12,%r12
adcxq %r10,%rax
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00       # movq 8(%rsi,%rcx,1),%rdx
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00            # movq 32(%rdi),%r10
adoxq %r13,%r13
adcxq %r11,%rbx
movq 40(%rdi),%r11
movq %rax,0(%rdi)
movq %rbx,8(%rdi)
mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
movq 16(%rsi,%rcx,1),%rdx
movq 48(%rdi),%r12
adoxq %r11,%r11
adcxq %r13,%rbx
movq 56(%rdi),%r13
movq %rax,16(%rdi)
movq %rbx,24(%rdi)
mulxq %rdx,%rax,%rbx
adoxq %r12,%r12
adcxq %r10,%rax
movq 24(%rsi,%rcx,1),%rdx
leaq 32(%rcx),%rcx
movq 64(%rdi),%r10
adoxq %r13,%r13
adcxq %r11,%rbx
movq 72(%rdi),%r11
movq %rax,32(%rdi)
movq %rbx,40(%rdi)
mulxq %rdx,%rax,%rbx
adoxq %r10,%r10
adcxq %r12,%rax
jrcxz .Lsqrx4x_shift_n_add_break        # rcx reached 0: last square done
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00       # movq 0(%rsi,%rcx,1),%rdx
adoxq %r11,%r11
adcxq %r13,%rbx
movq 80(%rdi),%r12
movq 88(%rdi),%r13
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi
nop
jmp .Lsqrx4x_shift_n_add
.align 32
.Lsqrx4x_shift_n_add_break:
adcxq %r13,%rbx
movq %rax,48(%rdi)
movq %rbx,56(%rdi)
leaq 64(%rdi),%rdi                      # rdi now points past the square
.byte 102,72,15,126,213                 # movq %xmm2,%rbp — modulus pointer
#-----------------------------------------------------------------------
# __bn_sqrx8x_reduction — Montgomery-reduce the 2*num-limb square in
# place, 8 limbs per iteration.  %rbp = n[], 32+8(%rsp) = n0.
#-----------------------------------------------------------------------
__bn_sqrx8x_reduction:
xorl %eax,%eax                          # top carry accumulator
movq 32+8(%rsp),%rbx                    # n0
movq 48+8(%rsp),%rdx                    # tp[0]
leaq -64(%rbp,%r9,1),%rcx               # end of n[]
movq %rcx,0+8(%rsp)
movq %rdi,8+8(%rsp)
leaq 48+8(%rsp),%rdi
jmp .Lsqrx8x_reduction_loop
.align 32
.Lsqrx8x_reduction_loop:
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq 24(%rdi),%r11
movq 32(%rdi),%r12
movq %rdx,%r8
imulq %rbx,%rdx                         # m = tp[0]*n0
movq 40(%rdi),%r13
movq 48(%rdi),%r14
movq 56(%rdi),%r15
movq %rax,24+8(%rsp)                    # save top carry
leaq 64(%rdi),%rdi
xorq %rsi,%rsi                          # constant zero; clears CF/OF
movq $-8,%rcx
jmp .Lsqrx8x_reduce
.align 32
# One reduction step: tp[0..7] += m*n[0..7]; each pass also derives the
# next m and stashes it at 64+48+8(%rsp) for the tail loop.
.Lsqrx8x_reduce:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rbx,%rax                         # low limb cancels
adoxq %r9,%r8
mulxq 8(%rbp),%rbx,%r9
adcxq %rbx,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rbx,%r10
adcxq %rbx,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rbx,%r11
adcxq %rbx,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00  # mulxq 32(%rbp),%rbx,%r12
movq %rdx,%rax
movq %r8,%rdx
adcxq %rbx,%r11
adoxq %r13,%r12
mulxq 32+8(%rsp),%rbx,%rdx              # next m = tp*n0
movq %rax,%rdx
movq %rax,64+48+8(%rsp,%rcx,8)          # record m for the tail pass
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
movq %rbx,%rdx                          # switch to next m
adcxq %rax,%r14
adoxq %rsi,%r15
adcxq %rsi,%r15
.byte 0x67,0x67,0x67
incq %rcx
jnz .Lsqrx8x_reduce
movq %rsi,%rax
cmpq 0+8(%rsp),%rbp                     # end of n[]?
jae .Lsqrx8x_no_tail
movq 48+8(%rsp),%rdx
addq 0(%rdi),%r8
leaq 64(%rbp),%rbp
movq $-8,%rcx
adcxq 8(%rdi),%r9
adcxq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
sbbq %rax,%rax                          # save carry
xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail
.align 32
# Tail: multiply the recorded m values against the remaining n[] limbs.
.Lsqrx8x_tail:
movq %r8,%rbx
mulxq 0(%rbp),%rax,%r8
adcxq %rax,%rbx
adoxq %r9,%r8
mulxq 8(%rbp),%rax,%r9
adcxq %rax,%r8
adoxq %r10,%r9
mulxq 16(%rbp),%rax,%r10
adcxq %rax,%r9
adoxq %r11,%r10
mulxq 24(%rbp),%rax,%r11
adcxq %rax,%r10
adoxq %r12,%r11
.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00  # mulxq 32(%rbp),%rax,%r12
adcxq %rax,%r11
adoxq %r13,%r12
mulxq 40(%rbp),%rax,%r13
adcxq %rax,%r12
adoxq %r14,%r13
mulxq 48(%rbp),%rax,%r14
adcxq %rax,%r13
adoxq %r15,%r14
mulxq 56(%rbp),%rax,%r15
movq 72+48+8(%rsp,%rcx,8),%rdx          # next recorded m
adcxq %rax,%r14
adoxq %rsi,%r15
movq %rbx,(%rdi,%rcx,8)
movq %r8,%rbx
adcxq %rsi,%r15
incq %rcx
jnz .Lsqrx8x_tail
cmpq 0+8(%rsp),%rbp                     # end of n[]?
jae .Lsqrx8x_tail_done
subq 16+8(%rsp),%rsi                    # restore saved carry
movq 48+8(%rsp),%rdx
leaq 64(%rbp),%rbp
adcq 0(%rdi),%r8
adcq 8(%rdi),%r9
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
leaq 64(%rdi),%rdi
sbbq %rax,%rax
subq $8,%rcx
xorq %rsi,%rsi
movq %rax,16+8(%rsp)
jmp .Lsqrx8x_tail
.align 32
.Lsqrx8x_tail_done:
xorq %rax,%rax
addq 24+8(%rsp),%r8                     # fold in saved top carry
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
adcq $0,%r12
adcq $0,%r13
adcq $0,%r14
adcq $0,%r15
adcq $0,%rax
subq 16+8(%rsp),%rsi                    # restore carry once more
.Lsqrx8x_no_tail:
adcq 0(%rdi),%r8
.byte 102,72,15,126,217                 # movq %xmm3,%rcx — num*8
adcq 8(%rdi),%r9
movq 56(%rbp),%rsi                      # top limb of n[] (flags untouched)
.byte 102,72,15,126,213                 # movq %xmm2,%rbp — rewind n[]
adcq 16(%rdi),%r10
adcq 24(%rdi),%r11
adcq 32(%rdi),%r12
adcq 40(%rdi),%r13
adcq 48(%rdi),%r14
adcq 56(%rdi),%r15
adcq $0,%rax                            # rax = top-most carry
movq 32+8(%rsp),%rbx                    # n0
movq 64(%rdi,%rcx,1),%rdx               # next tp[0]
movq %r8,0(%rdi)                        # store reduced limbs
leaq 64(%rdi),%r8
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
movq %r12,32(%rdi)
movq %r13,40(%rdi)
movq %r14,48(%rdi)
movq %r15,56(%rdi)
leaq 64(%rdi,%rcx,1),%rdi
cmpq 8+8(%rsp),%r8                      # whole vector reduced?
jb .Lsqrx8x_reduction_loop
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
#-----------------------------------------------------------------------
# __bn_postx4x_internal — post-reduction conditional subtraction for the
# x (MULX/ADX) path: computes out = tp - (mask & n), 4 limbs per pass,
# where mask (%rax, all-ones or zero) says whether tp >= n.  The
# subtraction is performed unconditionally with an ANDN-masked n[], so
# it is branch-free with respect to the secret comparison result.
# In: %rbp=n[], %rcx=num*8, %rax=carry from reduction, xmm1/xmm2 hold
# the output and modulus pointers (reloaded via hand-encoded movq).
# .Lsqrx4x_sub_entry is also the shared tail of mulx4x_internal.
#-----------------------------------------------------------------------
.align 32
.type __bn_postx4x_internal,@function
__bn_postx4x_internal:
.cfi_startproc
movq 0(%rbp),%r12
movq %rcx,%r10                          # keep num*8
movq %rcx,%r9
negq %rax                               # carry -> 0/-1 subtract mask
sarq $3+2,%rcx                          # loop counter = -num/4
.byte 102,72,15,126,202                 # movq %xmm1,%rdx — output pointer
.byte 102,72,15,126,206                 # movq %xmm1,%rsi — (same pointer)
decq %r12                               # prime the borrow chain
movq 8(%rbp),%r13
xorq %r8,%r8
movq 16(%rbp),%r14
movq 24(%rbp),%r15
jmp .Lsqrx4x_sub_entry
.align 16
.Lsqrx4x_sub:
movq 0(%rbp),%r12
movq 8(%rbp),%r13
movq 16(%rbp),%r14
movq 24(%rbp),%r15
.Lsqrx4x_sub_entry:
andnq %rax,%r12,%r12                    # r12 = ~r12 & mask  (masked -n limbs)
leaq 32(%rbp),%rbp
andnq %rax,%r13,%r13
andnq %rax,%r14,%r14
andnq %rax,%r15,%r15
negq %r8                                # reload borrow into CF
adcq 0(%rdi),%r12                       # tp + (~n & mask) + carry == tp - n when masked
adcq 8(%rdi),%r13
adcq 16(%rdi),%r14
adcq 24(%rdi),%r15
movq %r12,0(%rdx)
leaq 32(%rdi),%rdi
movq %r13,8(%rdx)
sbbq %r8,%r8                            # save borrow for next pass
movq %r14,16(%rdx)
movq %r15,24(%rdx)
leaq 32(%rdx),%rdx
incq %rcx
jnz .Lsqrx4x_sub
negq %r9                                # restore num*8 (sign flipped on entry)
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size __bn_postx4x_internal,.-__bn_postx4x_internal
#-----------------------------------------------------------------------
# bn_scatter5 — void bn_scatter5(const BN_ULONG *inp, size_t num,
#                                void *tbl, size_t idx);
# Writes num 64-bit words from inp (%rdi) into table slot idx (%rcx) of
# tbl (%rdx), one word every 256 bytes — the interleaved layout that
# bn_gather5 and mulx4x_internal read back with constant-time masks.
# No-op when num == 0.
#-----------------------------------------------------------------------
.globl bn_scatter5
.hidden bn_scatter5
.type bn_scatter5,@function
.align 16
bn_scatter5:
.cfi_startproc
cmpl $0,%esi
jz .Lscatter_epilogue                   # num == 0: nothing to do
leaq (%rdx,%rcx,8),%rdx                 # &tbl[idx] (8-byte column offset)
.Lscatter:
movq (%rdi),%rax
leaq 8(%rdi),%rdi
movq %rax,(%rdx)
leaq 256(%rdx),%rdx                     # stride: 32 slots * 8 bytes
subl $1,%esi
jnz .Lscatter
.Lscatter_epilogue:
.byte 0xf3,0xc3                         # rep ret
.cfi_endproc
.size bn_scatter5,.-bn_scatter5
#-----------------------------------------------------------------------
# bn_gather5 — void bn_gather5(BN_ULONG *out, size_t num,
#                              void *tbl, size_t idx);
# Reads num 64-bit words of table entry idx back out of the scattered
# layout written by bn_scatter5.  The entry is selected in constant
# time: pcmpeqd builds 16 equality masks against the broadcast index
# (stored in a 264-byte aligned stack area), then every 16-byte table
# line is ANDed with its mask and ORed together — no secret-dependent
# address is ever formed.  First two instructions are hand-encoded so
# the Win64 SEH prologue matches (see .LSEH_begin/.LSEH_end markers).
#-----------------------------------------------------------------------
.globl bn_gather5
.hidden bn_gather5
.type bn_gather5,@function
.align 32
bn_gather5:
.cfi_startproc
.LSEH_begin_bn_gather5:
.byte 0x4c,0x8d,0x14,0x24               # leaq (%rsp),%r10 — save incoming rsp
.cfi_def_cfa_register %r10
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00  # subq $0x108,%rsp — mask scratch area
leaq .Linc(%rip),%rax
andq $-16,%rsp                          # align for movdqa stores
movd %ecx,%xmm5                         # idx
movdqa 0(%rax),%xmm0                    # {0,0,1,1}
movdqa 16(%rax),%xmm1                   # {2,2,2,2}
leaq 128(%rdx),%r11                     # biased table pointer
leaq 128(%rsp),%rax                     # biased mask-area pointer
pshufd $0,%xmm5,%xmm5                   # broadcast idx
movdqa %xmm1,%xmm4
movdqa %xmm1,%xmm2
# --- generate the 16 masks: (idx == 0,1), (idx == 2,3), ... (30,31) ---
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-128(%rax)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-112(%rax)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-96(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-80(%rax)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,-64(%rax)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,-48(%rax)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,-32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,-16(%rax)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,0(%rax)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,16(%rax)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,32(%rax)
movdqa %xmm4,%xmm2
paddd %xmm0,%xmm1
pcmpeqd %xmm5,%xmm0
movdqa %xmm3,48(%rax)
movdqa %xmm4,%xmm3
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1
movdqa %xmm0,64(%rax)
movdqa %xmm4,%xmm0
paddd %xmm2,%xmm3
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,80(%rax)
movdqa %xmm4,%xmm1
paddd %xmm3,%xmm0
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,96(%rax)
movdqa %xmm4,%xmm2
movdqa %xmm3,112(%rax)
jmp .Lgather
.align 32
# One output word per pass: AND all 32 table lines with their masks,
# OR together, fold the two qwords, store, advance 256 bytes.
.Lgather:
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
movdqa -128(%r11),%xmm0
movdqa -112(%r11),%xmm1
movdqa -96(%r11),%xmm2
pand -128(%rax),%xmm0
movdqa -80(%r11),%xmm3
pand -112(%rax),%xmm1
por %xmm0,%xmm4
pand -96(%rax),%xmm2
por %xmm1,%xmm5
pand -80(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa -64(%r11),%xmm0
movdqa -48(%r11),%xmm1
movdqa -32(%r11),%xmm2
pand -64(%rax),%xmm0
movdqa -16(%r11),%xmm3
pand -48(%rax),%xmm1
por %xmm0,%xmm4
pand -32(%rax),%xmm2
por %xmm1,%xmm5
pand -16(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 0(%r11),%xmm0
movdqa 16(%r11),%xmm1
movdqa 32(%r11),%xmm2
pand 0(%rax),%xmm0
movdqa 48(%r11),%xmm3
pand 16(%rax),%xmm1
por %xmm0,%xmm4
pand 32(%rax),%xmm2
por %xmm1,%xmm5
pand 48(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqa 64(%r11),%xmm0
movdqa 80(%r11),%xmm1
movdqa 96(%r11),%xmm2
pand 64(%rax),%xmm0
movdqa 112(%r11),%xmm3
pand 80(%rax),%xmm1
por %xmm0,%xmm4
pand 96(%rax),%xmm2
por %xmm1,%xmm5
pand 112(%rax),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
por %xmm5,%xmm4
leaq 256(%r11),%r11                     # next row of the scattered table
pshufd $0x4e,%xmm4,%xmm0                # fold high qword onto low
por %xmm4,%xmm0
movq %xmm0,(%rdi)                       # emit one gathered word
leaq 8(%rdi),%rdi
subl $1,%esi
jnz .Lgather
leaq (%r10),%rsp                        # restore caller's rsp
.cfi_def_cfa_register %rsp
.byte 0xf3,0xc3                         # rep ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size bn_gather5,.-bn_gather5
# .Linc: dword-pair increment vectors used by the mask generators above —
# first line seeds lane indices {0,1}, second is the per-step increment {2,2}.
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
# ASCII banner: "Montgomery Multiplication with scatter/gather for x86_64,
# CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
  3199. #endif
  3200. .section .note.GNU-stack,"",@progbits