// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text
.globl bn_mul_mont
.hidden bn_mul_mont
.type bn_mul_mont,%function
.align 5
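// bn_mul_mont(rp, ap, bp, np, &n0, num) computes the Montgomery
// product rp = ap*bp / 2^(64*num) mod np; x0=rp, x1=ap, x2=bp,
// x3=np, x4=&n0 (n0 = -np[0]^-1 mod 2^64), x5=num in 64-bit words.
// num divisible by 8 is dispatched to the squaring-capable 8x path,
// num divisible by 4 to the 4x path, and everything else falls
// through to the word-by-word loop at .Lmul_mont.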
bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
b.eq __bn_mul4x_mont
.Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldr x9,[x2],#8 // bp[0]
sub x22,sp,x5,lsl#3
ldp x7,x8,[x1],#16 // ap[0..1]
lsl x5,x5,#3
ldr x4,[x4] // *n0
and x22,x22,#-16 // ABI says so
ldp x13,x14,[x3],#16 // np[0..1]
mul x6,x7,x9 // ap[0]*bp[0]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
mul x10,x8,x9 // ap[1]*bp[0]
umulh x11,x8,x9
mul x15,x6,x4 // "tp[0]"*n0
mov sp,x22 // alloca
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6 // discarded
// (*) Concerning the removal of the first multiplication and
// addition instructions: the outcome of the first addition is
// guaranteed to be zero, which leaves two computationally
// significant outcomes: it either carries or it doesn't. So when
// does it carry, and is there an alternative way to deduce it? If
// you follow the operations, you can observe that the condition
// for carry is quite simple: x6 being non-zero. The carry can
// therefore be computed by subtracting 1 from x6, which is what
// the next instruction does.
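// In C terms, the two elided instructions behave like this rough
// sketch (variable names are illustrative, not from this file):
//
//   uint64_t m1 = t0*n0;        // x15, with n0 == -np[0]^-1 mod 2^64
//   uint64_t lo = np0*m1;       // removed mul: lo == (uint64_t)(0 - t0)
//   int carry = lo + t0 < t0;   // removed adds: sum wraps to 0,
//                               // so carry == (t0 != 0)
//
// "subs xzr,x6,#1" below borrows exactly when x6 == 0, so it leaves
// the carry flag equal to (x6 != 0), as required.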
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
adc x13,x13,xzr
cbz x21,.L1st_skip
.L1st:
ldr x8,[x1],#8
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
ldr x14,[x3],#8
adds x12,x16,x13
mul x10,x8,x9 // ap[j]*bp[0]
adc x13,x17,xzr
umulh x11,x8,x9
adds x12,x12,x6
mul x16,x14,x15 // np[j]*m1
adc x13,x13,xzr
umulh x17,x14,x15
str x12,[x22],#8 // tp[j-1]
cbnz x21,.L1st
.L1st_skip:
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adc x13,x17,xzr
adds x12,x12,x6
sub x20,x5,#8 // i=num-1
adcs x13,x13,x7
adc x19,xzr,xzr // upmost overflow bit
stp x12,x13,[x22]
.Louter:
ldr x9,[x2],#8 // bp[i]
ldp x7,x8,[x1],#16
ldr x23,[sp] // tp[0]
add x22,sp,#8
mul x6,x7,x9 // ap[0]*bp[i]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
ldp x13,x14,[x3],#16
mul x10,x8,x9 // ap[1]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x15,x6,x4
sub x20,x20,#8 // i--
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
cbz x21,.Linner_skip
.Linner:
ldr x8,[x1],#8
adc x13,x13,xzr
ldr x23,[x22],#8 // tp[j]
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
adds x12,x16,x13
ldr x14,[x3],#8
adc x13,x17,xzr
mul x10,x8,x9 // ap[j]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x16,x14,x15 // np[j]*m1
adds x12,x12,x6
umulh x17,x14,x15
str x12,[x22,#-16] // tp[j-1]
cbnz x21,.Linner
.Linner_skip:
ldr x23,[x22],#8 // tp[j]
adc x13,x13,xzr
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adcs x13,x17,x19
adc x19,xzr,xzr
adds x6,x6,x23
adc x7,x7,xzr
adds x12,x12,x6
adcs x13,x13,x7
adc x19,x19,xzr // upmost overflow bit
stp x12,x13,[x22,#-16]
cbnz x20,.Louter
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus, check whether
// the subtraction borrowed, and conditionally copy back the
// original value.
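// In C terms, the tail below behaves like this rough sketch (sbb()
// is a hypothetical subtract-with-borrow helper, names illustrative):
//
//   borrow = 0;
//   for (j = 0; j < num; j++)
//       rp[j] = sbb(tp[j], np[j], &borrow);  // rp = tp - np
//   borrow = sbb(ovf, 0, &borrow);           // include upmost overflow bit
//   for (j = 0; j < num; j++)
//       rp[j] = borrow ? tp[j] : rp[j];      // done branchless below, via csel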
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x14,[x3],#8 // np[0]
subs x21,x5,#8 // j=num-1 and clear borrow
mov x1,x0
.Lsub:
sbcs x8,x23,x14 // tp[j]-np[j]
ldr x23,[x22],#8
sub x21,x21,#8 // j--
ldr x14,[x3],#8
str x8,[x1],#8 // rp[j]=tp[j]-np[j]
cbnz x21,.Lsub
sbcs x8,x23,x14
sbcs x19,x19,xzr // did it borrow?
str x8,[x1],#8 // rp[num-1]
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x8,[x0],#8 // rp[0]
sub x5,x5,#8 // num--
nop
.Lcond_copy:
sub x5,x5,#8 // num--
csel x14,x23,x8,lo // did it borrow?
ldr x23,[x22],#8
ldr x8,[x0],#8
str xzr,[x22,#-16] // wipe tp
str x14,[x0,#-16]
cbnz x5,.Lcond_copy
csel x14,x23,x8,lo
str xzr,[x22,#-8] // wipe tp
str x14,[x0,#-8]
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
ret
.size bn_mul_mont,.-bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
// only from bn_mul_mont which has already signed the return address.
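// Squaring (ap == bp) gets the dedicated code below; a genuine
// multiplication falls through to __bn_mul4x_mont.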
cmp x1,x2
b.ne __bn_mul4x_mont
.Lsqr8x_mont:
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x3,[sp,#96] // offload rp and np
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
ldp x12,x13,[x1,#8*6]
sub x2,sp,x5,lsl#4
lsl x5,x5,#3
ldr x4,[x4] // *n0
mov sp,x2 // alloca
sub x27,x5,#8*8
b .Lsqr8x_zero_start
.Lsqr8x_zero:
sub x27,x27,#8*8
stp xzr,xzr,[x2,#8*0]
stp xzr,xzr,[x2,#8*2]
stp xzr,xzr,[x2,#8*4]
stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
stp xzr,xzr,[x2,#8*8]
stp xzr,xzr,[x2,#8*10]
stp xzr,xzr,[x2,#8*12]
stp xzr,xzr,[x2,#8*14]
add x2,x2,#8*16
cbnz x27,.Lsqr8x_zero
add x3,x1,x5
add x1,x1,#8*8
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
mov x23,xzr
mov x24,xzr
mov x25,xzr
mov x26,xzr
mov x2,sp
str x4,[x29,#112] // offload n0
// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
// a[7]a[6] (vii)
mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
mul x15,x8,x6
mul x16,x9,x6
mul x17,x10,x6
adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
mul x14,x11,x6
adcs x21,x21,x15
mul x15,x12,x6
adcs x22,x22,x16
mul x16,x13,x6
adcs x23,x23,x17
umulh x17,x7,x6 // hi(a[1..7]*a[0])
adcs x24,x24,x14
umulh x14,x8,x6
adcs x25,x25,x15
umulh x15,x9,x6
adcs x26,x26,x16
umulh x16,x10,x6
stp x19,x20,[x2],#8*2 // t[0..1]
adc x19,xzr,xzr // t[8]
adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
umulh x17,x11,x6
adcs x22,x22,x14
umulh x14,x12,x6
adcs x23,x23,x15
umulh x15,x13,x6
adcs x24,x24,x16
mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
adcs x25,x25,x17
mul x17,x9,x7
adcs x26,x26,x14
mul x14,x10,x7
adc x19,x19,x15
mul x15,x11,x7
adds x22,x22,x16
mul x16,x12,x7
adcs x23,x23,x17
mul x17,x13,x7
adcs x24,x24,x14
umulh x14,x8,x7 // hi(a[2..7]*a[1])
adcs x25,x25,x15
umulh x15,x9,x7
adcs x26,x26,x16
umulh x16,x10,x7
adcs x19,x19,x17
umulh x17,x11,x7
stp x21,x22,[x2],#8*2 // t[2..3]
adc x20,xzr,xzr // t[9]
adds x23,x23,x14
umulh x14,x12,x7
adcs x24,x24,x15
umulh x15,x13,x7
adcs x25,x25,x16
mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
adcs x26,x26,x17
mul x17,x10,x8
adcs x19,x19,x14
mul x14,x11,x8
adc x20,x20,x15
mul x15,x12,x8
adds x24,x24,x16
mul x16,x13,x8
adcs x25,x25,x17
umulh x17,x9,x8 // hi(a[3..7]*a[2])
adcs x26,x26,x14
umulh x14,x10,x8
adcs x19,x19,x15
umulh x15,x11,x8
adcs x20,x20,x16
umulh x16,x12,x8
stp x23,x24,[x2],#8*2 // t[4..5]
adc x21,xzr,xzr // t[10]
adds x25,x25,x17
umulh x17,x13,x8
adcs x26,x26,x14
mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
adcs x19,x19,x15
mul x15,x11,x9
adcs x20,x20,x16
mul x16,x12,x9
adc x21,x21,x17
mul x17,x13,x9
adds x26,x26,x14
umulh x14,x10,x9 // hi(a[4..7]*a[3])
adcs x19,x19,x15
umulh x15,x11,x9
adcs x20,x20,x16
umulh x16,x12,x9
adcs x21,x21,x17
umulh x17,x13,x9
stp x25,x26,[x2],#8*2 // t[6..7]
adc x22,xzr,xzr // t[11]
adds x19,x19,x14
mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
adcs x20,x20,x15
mul x15,x12,x10
adcs x21,x21,x16
mul x16,x13,x10
adc x22,x22,x17
umulh x17,x11,x10 // hi(a[5..7]*a[4])
adds x20,x20,x14
umulh x14,x12,x10
adcs x21,x21,x15
umulh x15,x13,x10
adcs x22,x22,x16
mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
adc x23,xzr,xzr // t[12]
adds x21,x21,x17
mul x17,x13,x11
adcs x22,x22,x14
umulh x14,x12,x11 // hi(a[6..7]*a[5])
adc x23,x23,x15
umulh x15,x13,x11
adds x22,x22,x16
mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
adcs x23,x23,x17
umulh x17,x13,x12 // hi(a[7]*a[6])
adc x24,xzr,xzr // t[13]
adds x23,x23,x14
sub x27,x3,x1 // done yet?
adc x24,x24,x15
adds x24,x24,x16
sub x14,x3,x5 // rewound ap
adc x25,xzr,xzr // t[14]
add x25,x25,x17
cbz x27,.Lsqr8x_outer_break
mov x4,x6
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x0,x1
adcs x26,xzr,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved below
mov x27,#-8*8
// a[8]a[0]
// a[9]a[0]
// a[a]a[0]
// a[b]a[0]
// a[c]a[0]
// a[d]a[0]
// a[e]a[0]
// a[f]a[0]
// a[8]a[1]
// a[f]a[1]........................
// a[8]a[2]
// a[f]a[2]........................
// a[8]a[3]
// a[f]a[3]........................
// a[8]a[4]
// a[f]a[4]........................
// a[8]a[5]
// a[f]a[5]........................
// a[8]a[6]
// a[f]a[6]........................
// a[8]a[7]
// a[f]a[7]........................
.Lsqr8x_mul:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,.Lsqr8x_mul
// note that carry flag is guaranteed
// to be zero at this point
cmp x1,x3 // done yet?
b.eq .Lsqr8x_break
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
ldr x4,[x0,#-8*8]
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b .Lsqr8x_mul
.align 4
.Lsqr8x_break:
ldp x6,x7,[x0,#8*0]
add x1,x0,#8*8
ldp x8,x9,[x0,#8*2]
sub x14,x3,x1 // is it last iteration?
ldp x10,x11,[x0,#8*4]
sub x15,x2,x14
ldp x12,x13,[x0,#8*6]
cbz x14,.Lsqr8x_outer_loop
stp x19,x20,[x2,#8*0]
ldp x19,x20,[x15,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x15,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x15,#8*4]
stp x25,x26,[x2,#8*6]
mov x2,x15
ldp x25,x26,[x15,#8*6]
b .Lsqr8x_outer_loop
.align 4
.Lsqr8x_outer_break:
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
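// This is the usual squaring decomposition (a sketch, with 64-bit
// words a[i]):
//
//   a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(64*2*i)
//
// The accumulated cross products are doubled one bit at a time with
// extr, which acts as a 1-bit left shift across adjacent words, while
// the a[i]*a[i] diagonal terms are folded in.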
ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
ldp x15,x16,[sp,#8*1]
ldp x11,x13,[x14,#8*2]
add x1,x14,#8*4
ldp x17,x14,[sp,#8*3]
stp x19,x20,[x2,#8*0]
mul x19,x7,x7
stp x21,x22,[x2,#8*2]
umulh x7,x7,x7
stp x23,x24,[x2,#8*4]
mul x8,x9,x9
stp x25,x26,[x2,#8*6]
mov x2,sp
umulh x9,x9,x9
adds x20,x7,x15,lsl#1
extr x15,x16,x15,#63
sub x27,x5,#8*4
.Lsqr4x_shift_n_add:
adcs x21,x8,x15
extr x16,x17,x16,#63
sub x27,x27,#8*4
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
ldp x7,x9,[x1],#8*2
umulh x11,x11,x11
mul x12,x13,x13
umulh x13,x13,x13
extr x17,x14,x17,#63
stp x19,x20,[x2,#8*0]
adcs x23,x10,x17
extr x14,x15,x14,#63
stp x21,x22,[x2,#8*2]
adcs x24,x11,x14
ldp x17,x14,[x2,#8*7]
extr x15,x16,x15,#63
adcs x25,x12,x15
extr x16,x17,x16,#63
adcs x26,x13,x16
ldp x15,x16,[x2,#8*9]
mul x6,x7,x7
ldp x11,x13,[x1],#8*2
umulh x7,x7,x7
mul x8,x9,x9
umulh x9,x9,x9
stp x23,x24,[x2,#8*4]
extr x17,x14,x17,#63
stp x25,x26,[x2,#8*6]
add x2,x2,#8*8
adcs x19,x6,x17
extr x14,x15,x14,#63
adcs x20,x7,x14
ldp x17,x14,[x2,#8*3]
extr x15,x16,x15,#63
cbnz x27,.Lsqr4x_shift_n_add
ldp x1,x4,[x29,#104] // pull np and n0
adcs x21,x8,x15
extr x16,x17,x16,#63
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
umulh x11,x11,x11
stp x19,x20,[x2,#8*0]
mul x12,x13,x13
umulh x13,x13,x13
stp x21,x22,[x2,#8*2]
extr x17,x14,x17,#63
adcs x23,x10,x17
extr x14,x15,x14,#63
ldp x19,x20,[sp,#8*0]
adcs x24,x11,x14
extr x15,x16,x15,#63
ldp x6,x7,[x1,#8*0]
adcs x25,x12,x15
extr x16,xzr,x16,#63
ldp x8,x9,[x1,#8*2]
adc x26,x13,x16
ldp x10,x11,[x1,#8*4]
// Reduce by 512 bits per iteration
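// Each .Lsqr8x_reduction pass below is one Montgomery reduction step
// (a sketch): with n0 == -n^-1 mod 2^64, the multiplier m = t[0]*n0
// mod 2^64 makes t + m*n divisible by 2^64, so the lowest word drops
// off and the window slides up one word; eight such steps retire
// 512 bits of t against n[0..7].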
mul x28,x4,x19 // t[0]*n0
ldp x12,x13,[x1,#8*6]
add x3,x1,x5
ldp x21,x22,[sp,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[sp,#8*4]
stp x25,x26,[x2,#8*6]
ldp x25,x26,[sp,#8*6]
add x1,x1,#8*8
mov x30,xzr // initial top-most carry
mov x2,sp
mov x27,#8
.Lsqr8x_reduction:
// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
mul x15,x7,x28
sub x27,x27,#1
mul x16,x8,x28
str x28,[x2],#8 // put aside t[0]*n0 for tail processing
mul x17,x9,x28
// (*) adds xzr,x19,x14
subs xzr,x19,#1 // (*)
mul x14,x10,x28
adcs x19,x20,x15
mul x15,x11,x28
adcs x20,x21,x16
mul x16,x12,x28
adcs x21,x22,x17
mul x17,x13,x28
adcs x22,x23,x14
umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
adcs x23,x24,x15
umulh x15,x7,x28
adcs x24,x25,x16
umulh x16,x8,x28
adcs x25,x26,x17
umulh x17,x9,x28
adc x26,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x28
adcs x20,x20,x15
umulh x15,x11,x28
adcs x21,x21,x16
umulh x16,x12,x28
adcs x22,x22,x17
umulh x17,x13,x28
mul x28,x4,x19 // next t[0]*n0
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adc x26,x26,x17
cbnz x27,.Lsqr8x_reduction
ldp x14,x15,[x2,#8*0]
ldp x16,x17,[x2,#8*2]
mov x0,x2
sub x27,x3,x1 // done yet?
adds x19,x19,x14
adcs x20,x20,x15
ldp x14,x15,[x2,#8*4]
adcs x21,x21,x16
adcs x22,x22,x17
ldp x16,x17,[x2,#8*6]
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adcs x26,x26,x17
//adc x28,xzr,xzr // moved below
cbz x27,.Lsqr8x8_post_condition
ldr x4,[x2,#-8*8]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
mov x27,#-8*8
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
.Lsqr8x_tail:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,.Lsqr8x_tail
// note that carry flag is guaranteed
// to be zero at this point
ldp x6,x7,[x2,#8*0]
sub x27,x3,x1 // done yet?
sub x16,x3,x5 // rewound np
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
cbz x27,.Lsqr8x_tail_break
ldr x4,[x0,#-8*8]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b .Lsqr8x_tail
.align 4
.Lsqr8x_tail_break:
ldr x4,[x29,#112] // pull n0
add x27,x2,#8*8 // end of current t[num] window
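// x30 holds the top-most carry as 0 or 1, and x30-1 borrows exactly
// when x30 == 0, so the subs below leaves the carry flag equal to x30.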
subs xzr,x30,#1 // "move" top-most carry to carry bit
adcs x14,x19,x6
adcs x15,x20,x7
ldp x19,x20,[x0,#8*0]
adcs x21,x21,x8
ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
adcs x22,x22,x9
ldp x8,x9,[x16,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x16,#8*4]
adcs x25,x25,x12
adcs x26,x26,x13
ldp x12,x13,[x16,#8*6]
add x1,x16,#8*8
adc x30,xzr,xzr // top-most carry
mul x28,x4,x19
stp x14,x15,[x2,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x0,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x0,#8*4]
cmp x27,x29 // did we hit the bottom?
stp x25,x26,[x2,#8*6]
mov x2,x0 // slide the window
ldp x25,x26,[x0,#8*6]
mov x27,#8
b.ne .Lsqr8x_reduction
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus, check whether
// the subtraction borrowed, and conditionally copy back the
// original value.
ldr x0,[x29,#96] // pull rp
add x2,x2,#8*8
subs x14,x19,x6
sbcs x15,x20,x7
sub x27,x5,#8*8
mov x3,x0 // x0 copy
.Lsqr8x_sub:
sbcs x16,x21,x8
ldp x6,x7,[x1,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x1,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x10,x11,[x1,#8*4]
sbcs x17,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
ldp x19,x20,[x2,#8*0]
sub x27,x27,#8*8
ldp x21,x22,[x2,#8*2]
ldp x23,x24,[x2,#8*4]
ldp x25,x26,[x2,#8*6]
add x2,x2,#8*8
stp x14,x15,[x0,#8*4]
sbcs x14,x19,x6
stp x16,x17,[x0,#8*6]
add x0,x0,#8*8
sbcs x15,x20,x7
cbnz x27,.Lsqr8x_sub
sbcs x16,x21,x8
mov x2,sp
add x1,sp,x5
ldp x6,x7,[x3,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x3,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x19,x20,[x1,#8*0]
sbcs x17,x26,x13
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
stp x14,x15,[x0,#8*4]
stp x16,x17,[x0,#8*6]
sub x27,x5,#8*4
.Lsqr4x_cond_copy:
sub x27,x27,#8*4
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
ldp x6,x7,[x3,#8*4]
ldp x19,x20,[x1,#8*4]
csel x16,x21,x8,lo
stp xzr,xzr,[x2,#8*2]
add x2,x2,#8*4
csel x17,x22,x9,lo
ldp x8,x9,[x3,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
add x3,x3,#8*4
stp xzr,xzr,[x1,#8*0]
stp xzr,xzr,[x1,#8*2]
cbnz x27,.Lsqr4x_cond_copy
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
stp xzr,xzr,[x2,#8*2]
csel x16,x21,x8,lo
csel x17,x22,x9,lo
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
b .Lsqr8x_done
.align 4
.Lsqr8x8_post_condition:
adc x28,xzr,xzr
ldr x30,[x29,#8] // pull return address
// x19-x26,x28 hold result, x6-x13 hold modulus
subs x6,x19,x6
ldr x1,[x29,#96] // pull rp
sbcs x7,x20,x7
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x8
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x9
stp xzr,xzr,[sp,#8*4]
sbcs x10,x23,x10
stp xzr,xzr,[sp,#8*6]
sbcs x11,x24,x11
stp xzr,xzr,[sp,#8*8]
sbcs x12,x25,x12
stp xzr,xzr,[sp,#8*10]
sbcs x13,x26,x13
stp xzr,xzr,[sp,#8*12]
sbcs x28,x28,xzr // did it borrow?
stp xzr,xzr,[sp,#8*14]
// x6-x13 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
csel x10,x23,x10,lo
csel x11,x24,x11,lo
stp x8,x9,[x1,#8*2]
csel x12,x25,x12,lo
csel x13,x26,x13,lo
stp x10,x11,[x1,#8*4]
stp x12,x13,[x1,#8*6]
.Lsqr8x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
// only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
// return address.
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub x26,sp,x5,lsl#3
lsl x5,x5,#3
ldr x4,[x4] // *n0
sub sp,x26,#8*4 // alloca
add x10,x2,x5
add x27,x1,x5
stp x0,x10,[x29,#96] // offload rp and &b[num]
ldr x24,[x2,#8*0] // b[0]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
ldp x14,x15,[x3,#8*0] // n[0..3]
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
mov x28,#0
mov x26,sp
.Loop_mul4x_1st_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[0])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[0])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
sub x10,x27,x1
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,.Loop_mul4x_1st_reduction
cbz x10,.Lmul4x4_post_condition
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldr x25,[sp] // a[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
.Loop_mul4x_1st_tail:
mul x10,x6,x24 // lo(a[4..7]*b[i])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[i])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
adcs x23,x23,x0
umulh x13,x17,x25
adc x0,xzr,xzr
ldr x25,[sp,x28] // next t[0]*n0
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,.Loop_mul4x_1st_tail
sub x11,x27,x5 // rewound x1
cbz x10,.Lmul4x_proceed
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b .Loop_mul4x_1st_tail
.align 5
.Lmul4x_proceed:
ldr x24,[x2,#8*4]! // *++b
adc x30,x0,xzr
ldp x6,x7,[x11,#8*0] // a[0..3]
sub x3,x3,x5 // rewind np
ldp x8,x9,[x11,#8*2]
add x1,x11,#8*4
stp x19,x20,[x26,#8*0] // result!!!
ldp x19,x20,[sp,#8*4] // t[0..3]
stp x21,x22,[x26,#8*2] // result!!!
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x3,#8*0] // n[0..3]
mov x26,sp
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
.align 4
.Loop_mul4x_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[4])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
// (*) mul x10,x14,x25
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,.Loop_mul4x_reduction
adc x0,x0,xzr
ldp x10,x11,[x26,#8*4] // t[4..7]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldr x25,[sp] // t[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
.align 4
.Loop_mul4x_tail:
mul x10,x6,x24 // lo(a[4..7]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[4])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
umulh x13,x17,x25
adcs x23,x23,x0
ldr x25,[sp,x28] // next a[0]*n0
adc x0,xzr,xzr
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,.Loop_mul4x_tail
sub x11,x3,x5 // rewound np?
adc x0,x0,xzr
cbz x10,.Loop_mul4x_break
ldp x10,x11,[x26,#8*4]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b .Loop_mul4x_tail
.align 4
.Loop_mul4x_break:
ldp x12,x13,[x29,#96] // pull rp and &b[num]
adds x19,x19,x30
add x2,x2,#8*4 // bp++
adcs x20,x20,xzr
sub x1,x1,x5 // rewind ap
adcs x21,x21,xzr
stp x19,x20,[x26,#8*0] // result!!!
adcs x22,x22,xzr
ldp x19,x20,[sp,#8*4] // t[0..3]
adc x30,x0,xzr
stp x21,x22,[x26,#8*2] // result!!!
cmp x2,x13 // done yet?
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x11,#8*0] // n[0..3]
ldp x16,x17,[x11,#8*2]
add x3,x11,#8*4
b.eq .Lmul4x_post
ldr x24,[x2]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
adds x1,x1,#8*4 // clear carry bit
mov x0,xzr
mov x26,sp
b .Loop_mul4x_reduction
.align 4
.Lmul4x_post:
// Final step. We check whether the result is larger than the
// modulus and, if it is, subtract the modulus. But comparison
// implies subtraction, so we subtract the modulus, check whether
// the subtraction borrowed, and conditionally copy back the
// original value.
mov x0,x12
mov x27,x12 // x0 copy
subs x10,x19,x14
add x26,sp,#8*8
sbcs x11,x20,x15
sub x28,x5,#8*4
.Lmul4x_sub:
sbcs x12,x21,x16
ldp x14,x15,[x3,#8*0]
sub x28,x28,#8*4
ldp x19,x20,[x26,#8*0]
sbcs x13,x22,x17
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
ldp x21,x22,[x26,#8*2]
add x26,x26,#8*4
stp x10,x11,[x0,#8*0]
sbcs x10,x19,x14
stp x12,x13,[x0,#8*2]
add x0,x0,#8*4
sbcs x11,x20,x15
cbnz x28,.Lmul4x_sub
sbcs x12,x21,x16
mov x26,sp
add x1,sp,#8*4
ldp x6,x7,[x27,#8*0]
sbcs x13,x22,x17
stp x10,x11,[x0,#8*0]
ldp x8,x9,[x27,#8*2]
stp x12,x13,[x0,#8*2]
ldp x19,x20,[x1,#8*0]
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
sub x28,x5,#8*4
.Lmul4x_cond_copy:
sub x28,x28,#8*4
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
ldp x6,x7,[x27,#8*4]
ldp x19,x20,[x1,#8*4]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*2]
add x26,x26,#8*4
csel x13,x22,x9,lo
ldp x8,x9,[x27,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
add x27,x27,#8*4
cbnz x28,.Lmul4x_cond_copy
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
stp xzr,xzr,[x26,#8*2]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*3]
csel x13,x22,x9,lo
stp xzr,xzr,[x26,#8*4]
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
b .Lmul4x_done
.align 4
.Lmul4x4_post_condition:
adc x0,x0,xzr
ldr x1,[x29,#96] // pull rp
// x19-x22,x0 hold result, x14-x17 hold modulus
subs x6,x19,x14
ldr x30,[x29,#8] // pull return address
sbcs x7,x20,x15
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x16
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x17
stp xzr,xzr,[sp,#8*4]
sbcs xzr,x0,xzr // did it borrow?
stp xzr,xzr,[sp,#8*6]
// x6-x9 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
stp x8,x9,[x1,#8*2]
.Lmul4x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
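// The .byte sequence above encodes the ASCII string
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>".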
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits