// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text
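// bn_mul_mont computes rp[] = ap[]*bp[]/2^(64*num) mod np[] (Montgomery
// multiplication). The register assignments below follow the usual
// OpenSSL convention for this routine, noted here for readability:
//   x0 = rp, x1 = ap, x2 = bp, x3 = np, x4 = &n0, x5 = num
// where n0 = -np[0]^-1 mod 2^64 is precomputed by the caller. Sizes
// divisible by 8 are dispatched to the squaring-capable 8x path, other
// sizes divisible by 4 to the 4x path, and everything else to the
// generic one-word-at-a-time loop below.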
.globl _bn_mul_mont
.private_extern _bn_mul_mont
.align 5
_bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
b.eq __bn_mul4x_mont
Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldr x9,[x2],#8 // bp[0]
sub x22,sp,x5,lsl#3
ldp x7,x8,[x1],#16 // ap[0..1]
lsl x5,x5,#3
ldr x4,[x4] // *n0
and x22,x22,#-16 // ABI says so
ldp x13,x14,[x3],#16 // np[0..1]
mul x6,x7,x9 // ap[0]*bp[0]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
mul x10,x8,x9 // ap[1]*bp[0]
umulh x11,x8,x9
mul x15,x6,x4 // "tp[0]"*n0
mov sp,x22 // alloca
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6 // discarded
// (*) On the removal of the first multiplication and addition: the
// outcome of the first addition is guaranteed to be zero, which leaves
// only two computationally significant outcomes: it either carries or
// it doesn't. So when does it carry? Following the operations, the
// condition turns out to be simple: x6 being non-zero. The carry can
// therefore be calculated by adding -1 to x6, which is what the next
// instruction does.
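// A minimal sketch of the trick in C (illustrative only, not part of
// the generated code): Montgomery reduction guarantees
// lo(np[0]*m1) == 0 - tp[0] (mod 2^64), so the discarded sum
// lo(np[0]*m1) + tp[0] is zero and carries exactly when tp[0] != 0:
//
//   uint64_t carry = (x6 != 0);  // subs xzr,x6,#1 sets C = (x6 >= 1),
//                                // the same predicate, which the adc
//                                // below folds into x13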
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
adc x13,x13,xzr
cbz x21,L1st_skip
L1st:
ldr x8,[x1],#8
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
ldr x14,[x3],#8
adds x12,x16,x13
mul x10,x8,x9 // ap[j]*bp[0]
adc x13,x17,xzr
umulh x11,x8,x9
adds x12,x12,x6
mul x16,x14,x15 // np[j]*m1
adc x13,x13,xzr
umulh x17,x14,x15
str x12,[x22],#8 // tp[j-1]
cbnz x21,L1st
L1st_skip:
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adc x13,x17,xzr
adds x12,x12,x6
sub x20,x5,#8 // i=num-1
adcs x13,x13,x7
adc x19,xzr,xzr // upmost overflow bit
stp x12,x13,[x22]
Louter:
ldr x9,[x2],#8 // bp[i]
ldp x7,x8,[x1],#16
ldr x23,[sp] // tp[0]
add x22,sp,#8
mul x6,x7,x9 // ap[0]*bp[i]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
ldp x13,x14,[x3],#16
mul x10,x8,x9 // ap[1]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x15,x6,x4
sub x20,x20,#8 // i--
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
cbz x21,Linner_skip
Linner:
ldr x8,[x1],#8
adc x13,x13,xzr
ldr x23,[x22],#8 // tp[j]
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
adds x12,x16,x13
ldr x14,[x3],#8
adc x13,x17,xzr
mul x10,x8,x9 // ap[j]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x16,x14,x15 // np[j]*m1
adds x12,x12,x6
umulh x17,x14,x15
str x12,[x22,#-16] // tp[j-1]
cbnz x21,Linner
Linner_skip:
ldr x23,[x22],#8 // tp[j]
adc x13,x13,xzr
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adcs x13,x17,x19
adc x19,xzr,xzr
adds x6,x6,x23
adc x7,x7,xzr
adds x12,x12,x6
adcs x13,x13,x7
adc x19,x19,xzr // upmost overflow bit
stp x12,x13,[x22,#-16]
cbnz x20,Louter
// Final step. If the result is not less than the modulus, the modulus
// must be subtracted. But comparison itself implies subtraction, so we
// subtract the modulus unconditionally, check whether it borrowed, and
// conditionally copy back the original value.
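// The same step as a C sketch (assuming 64-bit limbs; `borrow` stands
// for the final borrow out of the subtraction below):
//
//   for (i = 0; i < num; i++)
//     rp[i] = borrow ? tp[i] : rp[i];  // done with csel, not a branch,
//                                      // so the copy is constant-time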
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x14,[x3],#8 // np[0]
subs x21,x5,#8 // j=num-1 and clear borrow
mov x1,x0
Lsub:
sbcs x8,x23,x14 // tp[j]-np[j]
ldr x23,[x22],#8
sub x21,x21,#8 // j--
ldr x14,[x3],#8
str x8,[x1],#8 // rp[j]=tp[j]-np[j]
cbnz x21,Lsub
sbcs x8,x23,x14
sbcs x19,x19,xzr // did it borrow?
str x8,[x1],#8 // rp[num-1]
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x8,[x0],#8 // rp[0]
sub x5,x5,#8 // num--
nop
Lcond_copy:
sub x5,x5,#8 // num--
csel x14,x23,x8,lo // did it borrow?
ldr x23,[x22],#8
ldr x8,[x0],#8
str xzr,[x22,#-16] // wipe tp
str x14,[x0,#-16]
cbnz x5,Lcond_copy
csel x14,x23,x8,lo
str xzr,[x22,#-8] // wipe tp
str x14,[x0,#-8]
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
ret
.align 5
__bn_sqr8x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
// only from bn_mul_mont which has already signed the return address.
cmp x1,x2
b.ne __bn_mul4x_mont
Lsqr8x_mont:
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x3,[sp,#96] // offload rp and np
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
ldp x12,x13,[x1,#8*6]
sub x2,sp,x5,lsl#4
lsl x5,x5,#3
ldr x4,[x4] // *n0
mov sp,x2 // alloca
sub x27,x5,#8*8
b Lsqr8x_zero_start
Lsqr8x_zero:
sub x27,x27,#8*8
stp xzr,xzr,[x2,#8*0]
stp xzr,xzr,[x2,#8*2]
stp xzr,xzr,[x2,#8*4]
stp xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
stp xzr,xzr,[x2,#8*8]
stp xzr,xzr,[x2,#8*10]
stp xzr,xzr,[x2,#8*12]
stp xzr,xzr,[x2,#8*14]
add x2,x2,#8*16
cbnz x27,Lsqr8x_zero
add x3,x1,x5
add x1,x1,#8*8
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
mov x23,xzr
mov x24,xzr
mov x25,xzr
mov x26,xzr
mov x2,sp
str x4,[x29,#112] // offload n0
// Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
// a[7]a[6] (vii)
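// Each bracketed group above is one row of cross products a[j]*a[i],
// j > i; the rows are accumulated into the t[] window column by
// column, with carries chained through x19-x26.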
mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
mul x15,x8,x6
mul x16,x9,x6
mul x17,x10,x6
adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
mul x14,x11,x6
adcs x21,x21,x15
mul x15,x12,x6
adcs x22,x22,x16
mul x16,x13,x6
adcs x23,x23,x17
umulh x17,x7,x6 // hi(a[1..7]*a[0])
adcs x24,x24,x14
umulh x14,x8,x6
adcs x25,x25,x15
umulh x15,x9,x6
adcs x26,x26,x16
umulh x16,x10,x6
stp x19,x20,[x2],#8*2 // t[0..1]
adc x19,xzr,xzr // t[8]
adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
umulh x17,x11,x6
adcs x22,x22,x14
umulh x14,x12,x6
adcs x23,x23,x15
umulh x15,x13,x6
adcs x24,x24,x16
mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
adcs x25,x25,x17
mul x17,x9,x7
adcs x26,x26,x14
mul x14,x10,x7
adc x19,x19,x15
mul x15,x11,x7
adds x22,x22,x16
mul x16,x12,x7
adcs x23,x23,x17
mul x17,x13,x7
adcs x24,x24,x14
umulh x14,x8,x7 // hi(a[2..7]*a[1])
adcs x25,x25,x15
umulh x15,x9,x7
adcs x26,x26,x16
umulh x16,x10,x7
adcs x19,x19,x17
umulh x17,x11,x7
stp x21,x22,[x2],#8*2 // t[2..3]
adc x20,xzr,xzr // t[9]
adds x23,x23,x14
umulh x14,x12,x7
adcs x24,x24,x15
umulh x15,x13,x7
adcs x25,x25,x16
mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
adcs x26,x26,x17
mul x17,x10,x8
adcs x19,x19,x14
mul x14,x11,x8
adc x20,x20,x15
mul x15,x12,x8
adds x24,x24,x16
mul x16,x13,x8
adcs x25,x25,x17
umulh x17,x9,x8 // hi(a[3..7]*a[2])
adcs x26,x26,x14
umulh x14,x10,x8
adcs x19,x19,x15
umulh x15,x11,x8
adcs x20,x20,x16
umulh x16,x12,x8
stp x23,x24,[x2],#8*2 // t[4..5]
adc x21,xzr,xzr // t[10]
adds x25,x25,x17
umulh x17,x13,x8
adcs x26,x26,x14
mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
adcs x19,x19,x15
mul x15,x11,x9
adcs x20,x20,x16
mul x16,x12,x9
adc x21,x21,x17
mul x17,x13,x9
adds x26,x26,x14
umulh x14,x10,x9 // hi(a[4..7]*a[3])
adcs x19,x19,x15
umulh x15,x11,x9
adcs x20,x20,x16
umulh x16,x12,x9
adcs x21,x21,x17
umulh x17,x13,x9
stp x25,x26,[x2],#8*2 // t[6..7]
adc x22,xzr,xzr // t[11]
adds x19,x19,x14
mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
adcs x20,x20,x15
mul x15,x12,x10
adcs x21,x21,x16
mul x16,x13,x10
adc x22,x22,x17
umulh x17,x11,x10 // hi(a[5..7]*a[4])
adds x20,x20,x14
umulh x14,x12,x10
adcs x21,x21,x15
umulh x15,x13,x10
adcs x22,x22,x16
mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
adc x23,xzr,xzr // t[12]
adds x21,x21,x17
mul x17,x13,x11
adcs x22,x22,x14
umulh x14,x12,x11 // hi(a[6..7]*a[5])
adc x23,x23,x15
umulh x15,x13,x11
adds x22,x22,x16
mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
adcs x23,x23,x17
umulh x17,x13,x12 // hi(a[7]*a[6])
adc x24,xzr,xzr // t[13]
adds x23,x23,x14
sub x27,x3,x1 // done yet?
adc x24,x24,x15
adds x24,x24,x16
sub x14,x3,x5 // rewound ap
adc x25,xzr,xzr // t[14]
add x25,x25,x17
cbz x27,Lsqr8x_outer_break
mov x4,x6
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x0,x1
adcs x26,xzr,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved below
mov x27,#-8*8
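// Note on the commented-out adc above: the final carry of each
// Lsqr8x_mul iteration is instead picked up by the "adc x28,xzr,xzr"
// at the top of the next one (software pipelining, a.k.a. modulo
// scheduling), letting it issue alongside the first multiplies.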
// a[8]a[0]
// a[9]a[0]
// a[a]a[0]
// a[b]a[0]
// a[c]a[0]
// a[d]a[0]
// a[e]a[0]
// a[f]a[0]
// a[8]a[1]
// a[f]a[1]........................
// a[8]a[2]
// a[f]a[2]........................
// a[8]a[3]
// a[f]a[3]........................
// a[8]a[4]
// a[f]a[4]........................
// a[8]a[5]
// a[f]a[5]........................
// a[8]a[6]
// a[f]a[6]........................
// a[8]a[7]
// a[f]a[7]........................
Lsqr8x_mul:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,Lsqr8x_mul
// note that carry flag is guaranteed
// to be zero at this point
cmp x1,x3 // done yet?
b.eq Lsqr8x_break
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
ldr x4,[x0,#-8*8]
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b Lsqr8x_mul
.align 4
Lsqr8x_break:
ldp x6,x7,[x0,#8*0]
add x1,x0,#8*8
ldp x8,x9,[x0,#8*2]
sub x14,x3,x1 // is it last iteration?
ldp x10,x11,[x0,#8*4]
sub x15,x2,x14
ldp x12,x13,[x0,#8*6]
cbz x14,Lsqr8x_outer_loop
stp x19,x20,[x2,#8*0]
ldp x19,x20,[x15,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x15,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x15,#8*4]
stp x25,x26,[x2,#8*6]
mov x2,x15
ldp x25,x26,[x15,#8*6]
b Lsqr8x_outer_loop
.align 4
Lsqr8x_outer_break:
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
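// That is, using the identity (a sketch of the math, not extra code):
//   a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i)
// The off-diagonal sum accumulated above is doubled via extr (a 1-bit
// left shift across limbs) while the diagonal squares are added in.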
ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
ldp x15,x16,[sp,#8*1]
ldp x11,x13,[x14,#8*2]
add x1,x14,#8*4
ldp x17,x14,[sp,#8*3]
stp x19,x20,[x2,#8*0]
mul x19,x7,x7
stp x21,x22,[x2,#8*2]
umulh x7,x7,x7
stp x23,x24,[x2,#8*4]
mul x8,x9,x9
stp x25,x26,[x2,#8*6]
mov x2,sp
umulh x9,x9,x9
adds x20,x7,x15,lsl#1
extr x15,x16,x15,#63
sub x27,x5,#8*4
Lsqr4x_shift_n_add:
adcs x21,x8,x15
extr x16,x17,x16,#63
sub x27,x27,#8*4
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
ldp x7,x9,[x1],#8*2
umulh x11,x11,x11
mul x12,x13,x13
umulh x13,x13,x13
extr x17,x14,x17,#63
stp x19,x20,[x2,#8*0]
adcs x23,x10,x17
extr x14,x15,x14,#63
stp x21,x22,[x2,#8*2]
adcs x24,x11,x14
ldp x17,x14,[x2,#8*7]
extr x15,x16,x15,#63
adcs x25,x12,x15
extr x16,x17,x16,#63
adcs x26,x13,x16
ldp x15,x16,[x2,#8*9]
mul x6,x7,x7
ldp x11,x13,[x1],#8*2
umulh x7,x7,x7
mul x8,x9,x9
umulh x9,x9,x9
stp x23,x24,[x2,#8*4]
extr x17,x14,x17,#63
stp x25,x26,[x2,#8*6]
add x2,x2,#8*8
adcs x19,x6,x17
extr x14,x15,x14,#63
adcs x20,x7,x14
ldp x17,x14,[x2,#8*3]
extr x15,x16,x15,#63
cbnz x27,Lsqr4x_shift_n_add
ldp x1,x4,[x29,#104] // pull np and n0
adcs x21,x8,x15
extr x16,x17,x16,#63
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
umulh x11,x11,x11
stp x19,x20,[x2,#8*0]
mul x12,x13,x13
umulh x13,x13,x13
stp x21,x22,[x2,#8*2]
extr x17,x14,x17,#63
adcs x23,x10,x17
extr x14,x15,x14,#63
ldp x19,x20,[sp,#8*0]
adcs x24,x11,x14
extr x15,x16,x15,#63
ldp x6,x7,[x1,#8*0]
adcs x25,x12,x15
extr x16,xzr,x16,#63
ldp x8,x9,[x1,#8*2]
adc x26,x13,x16
ldp x10,x11,[x1,#8*4]
// Reduce by 512 bits per iteration
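// One reduction step in C (a sketch; n[] is the modulus, t[] the
// current window, n0 = -n[0]^-1 mod 2^64 as loaded above):
//
//   m = t[0] * n0;   // mod 2^64; lands in x28 below
//   t += m * n;      // forces t[0] to exactly zero
//   t >>= 64;        // drop the zero limb
//
// Eight such steps are fused per pass, clearing 512 bits at a time.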
mul x28,x4,x19 // t[0]*n0
ldp x12,x13,[x1,#8*6]
add x3,x1,x5
ldp x21,x22,[sp,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[sp,#8*4]
stp x25,x26,[x2,#8*6]
ldp x25,x26,[sp,#8*6]
add x1,x1,#8*8
mov x30,xzr // initial top-most carry
mov x2,sp
mov x27,#8
Lsqr8x_reduction:
// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
mul x15,x7,x28
sub x27,x27,#1
mul x16,x8,x28
str x28,[x2],#8 // put aside t[0]*n0 for tail processing
mul x17,x9,x28
// (*) adds xzr,x19,x14
subs xzr,x19,#1 // (*)
mul x14,x10,x28
adcs x19,x20,x15
mul x15,x11,x28
adcs x20,x21,x16
mul x16,x12,x28
adcs x21,x22,x17
mul x17,x13,x28
adcs x22,x23,x14
umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
adcs x23,x24,x15
umulh x15,x7,x28
adcs x24,x25,x16
umulh x16,x8,x28
adcs x25,x26,x17
umulh x17,x9,x28
adc x26,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x28
adcs x20,x20,x15
umulh x15,x11,x28
adcs x21,x21,x16
umulh x16,x12,x28
adcs x22,x22,x17
umulh x17,x13,x28
mul x28,x4,x19 // next t[0]*n0
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adc x26,x26,x17
cbnz x27,Lsqr8x_reduction
ldp x14,x15,[x2,#8*0]
ldp x16,x17,[x2,#8*2]
mov x0,x2
sub x27,x3,x1 // done yet?
adds x19,x19,x14
adcs x20,x20,x15
ldp x14,x15,[x2,#8*4]
adcs x21,x21,x16
adcs x22,x22,x17
ldp x16,x17,[x2,#8*6]
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adcs x26,x26,x17
//adc x28,xzr,xzr // moved below
cbz x27,Lsqr8x8_post_condition
ldr x4,[x2,#-8*8]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
mov x27,#-8*8
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
Lsqr8x_tail:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,Lsqr8x_tail
// note that carry flag is guaranteed
// to be zero at this point
ldp x6,x7,[x2,#8*0]
sub x27,x3,x1 // done yet?
sub x16,x3,x5 // rewound np
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
cbz x27,Lsqr8x_tail_break
ldr x4,[x0,#-8*8]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b Lsqr8x_tail
.align 4
Lsqr8x_tail_break:
ldr x4,[x29,#112] // pull n0
add x27,x2,#8*8 // end of current t[num] window
subs xzr,x30,#1 // "move" top-most carry to carry bit
adcs x14,x19,x6
adcs x15,x20,x7
ldp x19,x20,[x0,#8*0]
adcs x21,x21,x8
ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
adcs x22,x22,x9
ldp x8,x9,[x16,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x16,#8*4]
adcs x25,x25,x12
adcs x26,x26,x13
ldp x12,x13,[x16,#8*6]
add x1,x16,#8*8
adc x30,xzr,xzr // top-most carry
mul x28,x4,x19
stp x14,x15,[x2,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x0,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x0,#8*4]
cmp x27,x29 // did we hit the bottom?
stp x25,x26,[x2,#8*6]
mov x2,x0 // slide the window
ldp x25,x26,[x0,#8*6]
mov x27,#8
b.ne Lsqr8x_reduction
// Final step. If the result is not less than the modulus, the modulus
// must be subtracted. But comparison itself implies subtraction, so we
// subtract the modulus unconditionally, check whether it borrowed, and
// conditionally copy back the original value.
ldr x0,[x29,#96] // pull rp
add x2,x2,#8*8
subs x14,x19,x6
sbcs x15,x20,x7
sub x27,x5,#8*8
mov x3,x0 // x0 copy
Lsqr8x_sub:
sbcs x16,x21,x8
ldp x6,x7,[x1,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x1,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x10,x11,[x1,#8*4]
sbcs x17,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
ldp x19,x20,[x2,#8*0]
sub x27,x27,#8*8
ldp x21,x22,[x2,#8*2]
ldp x23,x24,[x2,#8*4]
ldp x25,x26,[x2,#8*6]
add x2,x2,#8*8
stp x14,x15,[x0,#8*4]
sbcs x14,x19,x6
stp x16,x17,[x0,#8*6]
add x0,x0,#8*8
sbcs x15,x20,x7
cbnz x27,Lsqr8x_sub
sbcs x16,x21,x8
mov x2,sp
add x1,sp,x5
ldp x6,x7,[x3,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x3,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x19,x20,[x1,#8*0]
sbcs x17,x26,x13
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
stp x14,x15,[x0,#8*4]
stp x16,x17,[x0,#8*6]
sub x27,x5,#8*4
Lsqr4x_cond_copy:
sub x27,x27,#8*4
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
ldp x6,x7,[x3,#8*4]
ldp x19,x20,[x1,#8*4]
csel x16,x21,x8,lo
stp xzr,xzr,[x2,#8*2]
add x2,x2,#8*4
csel x17,x22,x9,lo
ldp x8,x9,[x3,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
add x3,x3,#8*4
stp xzr,xzr,[x1,#8*0]
stp xzr,xzr,[x1,#8*2]
cbnz x27,Lsqr4x_cond_copy
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
stp xzr,xzr,[x2,#8*2]
csel x16,x21,x8,lo
csel x17,x22,x9,lo
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
b Lsqr8x_done
.align 4
Lsqr8x8_post_condition:
adc x28,xzr,xzr
ldr x30,[x29,#8] // pull return address
// x19-x26,x28 hold result, x6-x13 hold modulus
subs x6,x19,x6
ldr x1,[x29,#96] // pull rp
sbcs x7,x20,x7
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x8
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x9
stp xzr,xzr,[sp,#8*4]
sbcs x10,x23,x10
stp xzr,xzr,[sp,#8*6]
sbcs x11,x24,x11
stp xzr,xzr,[sp,#8*8]
sbcs x12,x25,x12
stp xzr,xzr,[sp,#8*10]
sbcs x13,x26,x13
stp xzr,xzr,[sp,#8*12]
sbcs x28,x28,xzr // did it borrow?
stp xzr,xzr,[sp,#8*14]
// x6-x13 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
csel x10,x23,x10,lo
csel x11,x24,x11,lo
stp x8,x9,[x1,#8*2]
csel x12,x25,x12,lo
csel x13,x26,x13,lo
stp x10,x11,[x1,#8*4]
stp x12,x13,[x1,#8*6]
Lsqr8x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.align 5
__bn_mul4x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
// return address.
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub x26,sp,x5,lsl#3
lsl x5,x5,#3
ldr x4,[x4] // *n0
sub sp,x26,#8*4 // alloca
add x10,x2,x5
add x27,x1,x5
stp x0,x10,[x29,#96] // offload rp and &b[num]
ldr x24,[x2,#8*0] // b[0]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
ldp x14,x15,[x3,#8*0] // n[0..3]
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
mov x28,#0
mov x26,sp
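// The 4x path interleaves multiplication and reduction: each iteration
// of the loop below multiplies a[0..3] by one b[] word, folds in one
// reduction step, and stashes that step's t[0]*n0 on the stack, so the
// tail loops that follow can replay the same reduction across the
// remaining limbs.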
Loop_mul4x_1st_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[0])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[0])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
sub x10,x27,x1
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_1st_reduction
cbz x10,Lmul4x4_post_condition
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldr x25,[sp] // a[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
Loop_mul4x_1st_tail:
mul x10,x6,x24 // lo(a[4..7]*b[i])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[i])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
adcs x23,x23,x0
umulh x13,x17,x25
adc x0,xzr,xzr
ldr x25,[sp,x28] // next t[0]*n0
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_1st_tail
sub x11,x27,x5 // rewound x1
cbz x10,Lmul4x_proceed
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b Loop_mul4x_1st_tail
.align 5
Lmul4x_proceed:
ldr x24,[x2,#8*4]! // *++b
adc x30,x0,xzr
ldp x6,x7,[x11,#8*0] // a[0..3]
sub x3,x3,x5 // rewind np
ldp x8,x9,[x11,#8*2]
add x1,x11,#8*4
stp x19,x20,[x26,#8*0] // result!!!
ldp x19,x20,[sp,#8*4] // t[0..3]
stp x21,x22,[x26,#8*2] // result!!!
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x3,#8*0] // n[0..3]
mov x26,sp
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
.align 4
Loop_mul4x_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[4])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
// (*) mul x10,x14,x25
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_reduction
adc x0,x0,xzr
ldp x10,x11,[x26,#8*4] // t[4..7]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldr x25,[sp] // t[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
.align 4
Loop_mul4x_tail:
mul x10,x6,x24 // lo(a[4..7]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[4])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
umulh x13,x17,x25
adcs x23,x23,x0
ldr x25,[sp,x28] // next a[0]*n0
adc x0,xzr,xzr
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_tail
sub x11,x3,x5 // rewound np?
adc x0,x0,xzr
cbz x10,Loop_mul4x_break
ldp x10,x11,[x26,#8*4]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b Loop_mul4x_tail
.align 4
Loop_mul4x_break:
ldp x12,x13,[x29,#96] // pull rp and &b[num]
adds x19,x19,x30
add x2,x2,#8*4 // bp++
adcs x20,x20,xzr
sub x1,x1,x5 // rewind ap
adcs x21,x21,xzr
stp x19,x20,[x26,#8*0] // result!!!
adcs x22,x22,xzr
ldp x19,x20,[sp,#8*4] // t[0..3]
adc x30,x0,xzr
stp x21,x22,[x26,#8*2] // result!!!
cmp x2,x13 // done yet?
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x11,#8*0] // n[0..3]
ldp x16,x17,[x11,#8*2]
add x3,x11,#8*4
b.eq Lmul4x_post
ldr x24,[x2]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
adds x1,x1,#8*4 // clear carry bit
mov x0,xzr
mov x26,sp
b Loop_mul4x_reduction
.align 4
Lmul4x_post:
// Final step. If the result is not less than the modulus, the modulus
// must be subtracted. But comparison itself implies subtraction, so we
// subtract the modulus unconditionally, check whether it borrowed, and
// conditionally copy back the original value.
mov x0,x12
mov x27,x12 // x0 copy
subs x10,x19,x14
add x26,sp,#8*8
sbcs x11,x20,x15
sub x28,x5,#8*4
Lmul4x_sub:
sbcs x12,x21,x16
ldp x14,x15,[x3,#8*0]
sub x28,x28,#8*4
ldp x19,x20,[x26,#8*0]
sbcs x13,x22,x17
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
ldp x21,x22,[x26,#8*2]
add x26,x26,#8*4
stp x10,x11,[x0,#8*0]
sbcs x10,x19,x14
stp x12,x13,[x0,#8*2]
add x0,x0,#8*4
sbcs x11,x20,x15
cbnz x28,Lmul4x_sub
sbcs x12,x21,x16
mov x26,sp
add x1,sp,#8*4
ldp x6,x7,[x27,#8*0]
sbcs x13,x22,x17
stp x10,x11,[x0,#8*0]
ldp x8,x9,[x27,#8*2]
stp x12,x13,[x0,#8*2]
ldp x19,x20,[x1,#8*0]
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
sub x28,x5,#8*4
Lmul4x_cond_copy:
sub x28,x28,#8*4
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
ldp x6,x7,[x27,#8*4]
ldp x19,x20,[x1,#8*4]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*2]
add x26,x26,#8*4
csel x13,x22,x9,lo
ldp x8,x9,[x27,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
add x27,x27,#8*4
cbnz x28,Lmul4x_cond_copy
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
stp xzr,xzr,[x26,#8*2]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*3]
csel x13,x22,x9,lo
stp xzr,xzr,[x26,#8*4]
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
b Lmul4x_done
.align 4
Lmul4x4_post_condition:
adc x0,x0,xzr
ldr x1,[x29,#96] // pull rp
// x19-x22,x0 hold result, x14-x17 hold modulus
subs x6,x19,x14
ldr x30,[x29,#8] // pull return address
sbcs x7,x20,x15
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x16
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x17
stp xzr,xzr,[sp,#8*4]
sbcs xzr,x0,xzr // did it borrow?
stp xzr,xzr,[sp,#8*6]
// x6-x9 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
stp x8,x9,[x1,#8*2]
Lmul4x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif // !OPENSSL_NO_ASM