// armv8-mont.S
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>
.text
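// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0, int num);
//
// Computes rp[] = ap[]*bp[]*2^(-64*num) mod np[], the Montgomery product.
// Arguments arrive in x0..x5 in the order above; *n0 is the precomputed
// -np[0]^(-1) mod 2^64. (Prototype per the OpenSSL/BoringSSL bn_mul_mont
// convention; noted here for readability.)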
.globl bn_mul_mont
.def bn_mul_mont
.type 32
.endef
.align 5
bn_mul_mont:
AARCH64_SIGN_LINK_REGISTER
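// Dispatch on num (x5): multiples of 8 take the 8x path (which squares
// when ap == bp and otherwise falls through to the 4x code), remaining
// multiples of 4 take the 4x path, and everything else uses the
// word-by-word loop below.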
tst x5,#7
b.eq __bn_sqr8x_mont
tst x5,#3
b.eq __bn_mul4x_mont
Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
ldr x9,[x2],#8 // bp[0]
sub x22,sp,x5,lsl#3
ldp x7,x8,[x1],#16 // ap[0..1]
lsl x5,x5,#3
ldr x4,[x4] // *n0
and x22,x22,#-16 // ABI says so
ldp x13,x14,[x3],#16 // np[0..1]
mul x6,x7,x9 // ap[0]*bp[0]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
mul x10,x8,x9 // ap[1]*bp[0]
umulh x11,x8,x9
mul x15,x6,x4 // "tp[0]"*n0
mov sp,x22 // alloca
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6 // discarded
// (*) On the removal of the first multiplication and addition
// instructions: the low word of the first addition is guaranteed
// to be zero, so its only computationally significant outcome
// is whether or not it carries. When does it carry? Following
// the operations shows that the condition is simple: x6 being
// non-zero. The carry can therefore be recovered by adding -1
// to x6, which is what the next instruction does.
subs xzr,x6,#1 // (*)
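// Concretely, m1 = tp[0]*n0 mod 2^64 is chosen so that
// tp[0] + np[0]*m1 == 0 (mod 2^64): the discarded low word is zero
// by construction, and the addition carries exactly when x6 != 0.
// "subs xzr,x6,#1" sets the C flag precisely when x6 >= 1,
// reproducing that carry without the mul/adds pair.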
umulh x17,x14,x15
adc x13,x13,xzr
cbz x21,L1st_skip
L1st:
ldr x8,[x1],#8
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
ldr x14,[x3],#8
adds x12,x16,x13
mul x10,x8,x9 // ap[j]*bp[0]
adc x13,x17,xzr
umulh x11,x8,x9
adds x12,x12,x6
mul x16,x14,x15 // np[j]*m1
adc x13,x13,xzr
umulh x17,x14,x15
str x12,[x22],#8 // tp[j-1]
cbnz x21,L1st
L1st_skip:
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adc x13,x17,xzr
adds x12,x12,x6
sub x20,x5,#8 // i=num-1
adcs x13,x13,x7
adc x19,xzr,xzr // upmost overflow bit
stp x12,x13,[x22]
Louter:
ldr x9,[x2],#8 // bp[i]
ldp x7,x8,[x1],#16
ldr x23,[sp] // tp[0]
add x22,sp,#8
mul x6,x7,x9 // ap[0]*bp[i]
sub x21,x5,#16 // j=num-2
umulh x7,x7,x9
ldp x13,x14,[x3],#16
mul x10,x8,x9 // ap[1]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x15,x6,x4
sub x20,x20,#8 // i--
// (*) mul x12,x13,x15 // np[0]*m1
umulh x13,x13,x15
mul x16,x14,x15 // np[1]*m1
// (*) adds x12,x12,x6
subs xzr,x6,#1 // (*)
umulh x17,x14,x15
cbz x21,Linner_skip
Linner:
ldr x8,[x1],#8
adc x13,x13,xzr
ldr x23,[x22],#8 // tp[j]
adds x6,x10,x7
sub x21,x21,#8 // j--
adc x7,x11,xzr
adds x12,x16,x13
ldr x14,[x3],#8
adc x13,x17,xzr
mul x10,x8,x9 // ap[j]*bp[i]
adds x6,x6,x23
umulh x11,x8,x9
adc x7,x7,xzr
mul x16,x14,x15 // np[j]*m1
adds x12,x12,x6
umulh x17,x14,x15
str x12,[x22,#-16] // tp[j-1]
cbnz x21,Linner
Linner_skip:
ldr x23,[x22],#8 // tp[j]
adc x13,x13,xzr
adds x6,x10,x7
sub x1,x1,x5 // rewind x1
adc x7,x11,xzr
adds x12,x16,x13
sub x3,x3,x5 // rewind x3
adcs x13,x17,x19
adc x19,xzr,xzr
adds x6,x6,x23
adc x7,x7,xzr
adds x12,x12,x6
adcs x13,x13,x7
adc x19,x19,xzr // upmost overflow bit
stp x12,x13,[x22,#-16]
cbnz x20,Louter
// Final step. We see if result is larger than modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract modulus, see if it borrowed,
// and conditionally copy original value.
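// In C terms (a reference sketch; sub_words/copy_words are
// illustrative names only, not functions in this file):
//   borrow = sub_words(rp, tp, np, num);   // rp = tp - np
//   copy_words(rp, borrow ? tp : rp, num); // keep tp if it borrowed
// The borrow drives the "lo" condition of the csel instructions
// below, so the copy is branch-free and runs in time independent
// of which value is kept.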
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x14,[x3],#8 // np[0]
subs x21,x5,#8 // j=num-1 and clear borrow
mov x1,x0
Lsub:
sbcs x8,x23,x14 // tp[j]-np[j]
ldr x23,[x22],#8
sub x21,x21,#8 // j--
ldr x14,[x3],#8
str x8,[x1],#8 // rp[j]=tp[j]-np[j]
cbnz x21,Lsub
sbcs x8,x23,x14
sbcs x19,x19,xzr // did it borrow?
str x8,[x1],#8 // rp[num-1]
ldr x23,[sp] // tp[0]
add x22,sp,#8
ldr x8,[x0],#8 // rp[0]
sub x5,x5,#8 // num--
nop
Lcond_copy:
sub x5,x5,#8 // num--
csel x14,x23,x8,lo // did it borrow?
ldr x23,[x22],#8
ldr x8,[x0],#8
str xzr,[x22,#-16] // wipe tp
str x14,[x0,#-16]
cbnz x5,Lcond_copy
csel x14,x23,x8,lo
str xzr,[x22,#-8] // wipe tp
str x14,[x0,#-8]
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
AARCH64_VALIDATE_LINK_REGISTER
ret
.def __bn_sqr8x_mont
.type 32
.endef
.align 5
__bn_sqr8x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
// only from bn_mul_mont which has already signed the return address.
cmp x1,x2
b.ne __bn_mul4x_mont
Lsqr8x_mont:
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp x0,x3,[sp,#96] // offload rp and np
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
ldp x12,x13,[x1,#8*6]
sub x2,sp,x5,lsl#4
lsl x5,x5,#3
ldr x4,[x4] // *n0
mov sp,x2 // alloca
sub x27,x5,#8*8
b Lsqr8x_zero_start
Lsqr8x_zero:
sub x27,x27,#8*8
stp xzr,xzr,[x2,#8*0]
stp xzr,xzr,[x2,#8*2]
stp xzr,xzr,[x2,#8*4]
stp xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
stp xzr,xzr,[x2,#8*8]
stp xzr,xzr,[x2,#8*10]
stp xzr,xzr,[x2,#8*12]
stp xzr,xzr,[x2,#8*14]
add x2,x2,#8*16
cbnz x27,Lsqr8x_zero
add x3,x1,x5
add x1,x1,#8*8
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
mov x23,xzr
mov x24,xzr
mov x25,xzr
mov x26,xzr
mov x2,sp
str x4,[x29,#112] // offload n0
// Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
// a[7]a[6] (vii)
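// Only the cross products a[i]*a[j] with i > j are accumulated in this
// triangle; they are doubled later and the diagonal squares a[i]*a[i]
// added, per (sum_i a[i]*2^(64i))^2 =
//   sum_i a[i]^2*2^(128i) + 2*sum_{i>j} a[i]*a[j]*2^(64(i+j)).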
mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
mul x15,x8,x6
mul x16,x9,x6
mul x17,x10,x6
adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
mul x14,x11,x6
adcs x21,x21,x15
mul x15,x12,x6
adcs x22,x22,x16
mul x16,x13,x6
adcs x23,x23,x17
umulh x17,x7,x6 // hi(a[1..7]*a[0])
adcs x24,x24,x14
umulh x14,x8,x6
adcs x25,x25,x15
umulh x15,x9,x6
adcs x26,x26,x16
umulh x16,x10,x6
stp x19,x20,[x2],#8*2 // t[0..1]
adc x19,xzr,xzr // t[8]
adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
umulh x17,x11,x6
adcs x22,x22,x14
umulh x14,x12,x6
adcs x23,x23,x15
umulh x15,x13,x6
adcs x24,x24,x16
mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
adcs x25,x25,x17
mul x17,x9,x7
adcs x26,x26,x14
mul x14,x10,x7
adc x19,x19,x15
mul x15,x11,x7
adds x22,x22,x16
mul x16,x12,x7
adcs x23,x23,x17
mul x17,x13,x7
adcs x24,x24,x14
umulh x14,x8,x7 // hi(a[2..7]*a[1])
adcs x25,x25,x15
umulh x15,x9,x7
adcs x26,x26,x16
umulh x16,x10,x7
adcs x19,x19,x17
umulh x17,x11,x7
stp x21,x22,[x2],#8*2 // t[2..3]
adc x20,xzr,xzr // t[9]
adds x23,x23,x14
umulh x14,x12,x7
adcs x24,x24,x15
umulh x15,x13,x7
adcs x25,x25,x16
mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
adcs x26,x26,x17
mul x17,x10,x8
adcs x19,x19,x14
mul x14,x11,x8
adc x20,x20,x15
mul x15,x12,x8
adds x24,x24,x16
mul x16,x13,x8
adcs x25,x25,x17
umulh x17,x9,x8 // hi(a[3..7]*a[2])
adcs x26,x26,x14
umulh x14,x10,x8
adcs x19,x19,x15
umulh x15,x11,x8
adcs x20,x20,x16
umulh x16,x12,x8
stp x23,x24,[x2],#8*2 // t[4..5]
adc x21,xzr,xzr // t[10]
adds x25,x25,x17
umulh x17,x13,x8
adcs x26,x26,x14
mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
adcs x19,x19,x15
mul x15,x11,x9
adcs x20,x20,x16
mul x16,x12,x9
adc x21,x21,x17
mul x17,x13,x9
adds x26,x26,x14
umulh x14,x10,x9 // hi(a[4..7]*a[3])
adcs x19,x19,x15
umulh x15,x11,x9
adcs x20,x20,x16
umulh x16,x12,x9
adcs x21,x21,x17
umulh x17,x13,x9
stp x25,x26,[x2],#8*2 // t[6..7]
adc x22,xzr,xzr // t[11]
adds x19,x19,x14
mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
adcs x20,x20,x15
mul x15,x12,x10
adcs x21,x21,x16
mul x16,x13,x10
adc x22,x22,x17
umulh x17,x11,x10 // hi(a[5..7]*a[4])
adds x20,x20,x14
umulh x14,x12,x10
adcs x21,x21,x15
umulh x15,x13,x10
adcs x22,x22,x16
mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
adc x23,xzr,xzr // t[12]
adds x21,x21,x17
mul x17,x13,x11
adcs x22,x22,x14
umulh x14,x12,x11 // hi(a[6..7]*a[5])
adc x23,x23,x15
umulh x15,x13,x11
adds x22,x22,x16
mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
adcs x23,x23,x17
umulh x17,x13,x12 // hi(a[7]*a[6])
adc x24,xzr,xzr // t[13]
adds x23,x23,x14
sub x27,x3,x1 // done yet?
adc x24,x24,x15
adds x24,x24,x16
sub x14,x3,x5 // rewound ap
adc x25,xzr,xzr // t[14]
add x25,x25,x17
cbz x27,Lsqr8x_outer_break
mov x4,x6
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x0,x1
adcs x26,xzr,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved below
mov x27,#-8*8
// a[8]a[0]
// a[9]a[0]
// a[a]a[0]
// a[b]a[0]
// a[c]a[0]
// a[d]a[0]
// a[e]a[0]
// a[f]a[0]
// a[8]a[1]
// a[f]a[1]........................
// a[8]a[2]
// a[f]a[2]........................
// a[8]a[3]
// a[f]a[3]........................
// a[8]a[4]
// a[f]a[4]........................
// a[8]a[5]
// a[f]a[5]........................
// a[8]a[6]
// a[f]a[6]........................
// a[8]a[7]
// a[f]a[7]........................
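// The commented-out "adc x28,xzr,xzr // moved ..." pairs implement
// modulo scheduling: the final carry of one pass is only materialized
// by the adc at the top of the next pass, overlapping it with that
// pass's first multiplies.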
Lsqr8x_mul:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,Lsqr8x_mul
// note that carry flag is guaranteed
// to be zero at this point
cmp x1,x3 // done yet?
b.eq Lsqr8x_break
ldp x6,x7,[x2,#8*0]
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
adds x19,x19,x6
ldr x4,[x0,#-8*8]
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b Lsqr8x_mul
.align 4
Lsqr8x_break:
ldp x6,x7,[x0,#8*0]
add x1,x0,#8*8
ldp x8,x9,[x0,#8*2]
sub x14,x3,x1 // is it last iteration?
ldp x10,x11,[x0,#8*4]
sub x15,x2,x14
ldp x12,x13,[x0,#8*6]
cbz x14,Lsqr8x_outer_loop
stp x19,x20,[x2,#8*0]
ldp x19,x20,[x15,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x15,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x15,#8*4]
stp x25,x26,[x2,#8*6]
mov x2,x15
ldp x25,x26,[x15,#8*6]
b Lsqr8x_outer_loop
.align 4
Lsqr8x_outer_break:
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
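// The doubling is done with extr: "extr xL,xH,xL,#63" computes
// (xH:xL)>>63 = (xH<<1)|(xL>>63), i.e. the next word of 2*t with the
// bit shifted out of the word below carried in, while the mul/umulh
// pairs supply the diagonal squares a[i]*a[i].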
ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
ldp x15,x16,[sp,#8*1]
ldp x11,x13,[x14,#8*2]
add x1,x14,#8*4
ldp x17,x14,[sp,#8*3]
stp x19,x20,[x2,#8*0]
mul x19,x7,x7
stp x21,x22,[x2,#8*2]
umulh x7,x7,x7
stp x23,x24,[x2,#8*4]
mul x8,x9,x9
stp x25,x26,[x2,#8*6]
mov x2,sp
umulh x9,x9,x9
adds x20,x7,x15,lsl#1
extr x15,x16,x15,#63
sub x27,x5,#8*4
Lsqr4x_shift_n_add:
adcs x21,x8,x15
extr x16,x17,x16,#63
sub x27,x27,#8*4
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
ldp x7,x9,[x1],#8*2
umulh x11,x11,x11
mul x12,x13,x13
umulh x13,x13,x13
extr x17,x14,x17,#63
stp x19,x20,[x2,#8*0]
adcs x23,x10,x17
extr x14,x15,x14,#63
stp x21,x22,[x2,#8*2]
adcs x24,x11,x14
ldp x17,x14,[x2,#8*7]
extr x15,x16,x15,#63
adcs x25,x12,x15
extr x16,x17,x16,#63
adcs x26,x13,x16
ldp x15,x16,[x2,#8*9]
mul x6,x7,x7
ldp x11,x13,[x1],#8*2
umulh x7,x7,x7
mul x8,x9,x9
umulh x9,x9,x9
stp x23,x24,[x2,#8*4]
extr x17,x14,x17,#63
stp x25,x26,[x2,#8*6]
add x2,x2,#8*8
adcs x19,x6,x17
extr x14,x15,x14,#63
adcs x20,x7,x14
ldp x17,x14,[x2,#8*3]
extr x15,x16,x15,#63
cbnz x27,Lsqr4x_shift_n_add
ldp x1,x4,[x29,#104] // pull np and n0
adcs x21,x8,x15
extr x16,x17,x16,#63
adcs x22,x9,x16
ldp x15,x16,[x2,#8*5]
mul x10,x11,x11
umulh x11,x11,x11
stp x19,x20,[x2,#8*0]
mul x12,x13,x13
umulh x13,x13,x13
stp x21,x22,[x2,#8*2]
extr x17,x14,x17,#63
adcs x23,x10,x17
extr x14,x15,x14,#63
ldp x19,x20,[sp,#8*0]
adcs x24,x11,x14
extr x15,x16,x15,#63
ldp x6,x7,[x1,#8*0]
adcs x25,x12,x15
extr x16,xzr,x16,#63
ldp x8,x9,[x1,#8*2]
adc x26,x13,x16
ldp x10,x11,[x1,#8*4]
// Reduce by 512 bits per iteration
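// Standard word-wise Montgomery reduction on the shifted-and-squared
// image: per word, x28 = t[0]*n0 mod 2^64 makes
// t[0] + x28*n[0] == 0 (mod 2^64), x28*n is added, and the window
// slides one word right; eight words (512 bits) are retired per pass.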
mul x28,x4,x19 // t[0]*n0
ldp x12,x13,[x1,#8*6]
add x3,x1,x5
ldp x21,x22,[sp,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[sp,#8*4]
stp x25,x26,[x2,#8*6]
ldp x25,x26,[sp,#8*6]
add x1,x1,#8*8
mov x30,xzr // initial top-most carry
mov x2,sp
mov x27,#8
Lsqr8x_reduction:
// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
mul x15,x7,x28
sub x27,x27,#1
mul x16,x8,x28
str x28,[x2],#8 // put aside t[0]*n0 for tail processing
mul x17,x9,x28
// (*) adds xzr,x19,x14
subs xzr,x19,#1 // (*)
mul x14,x10,x28
adcs x19,x20,x15
mul x15,x11,x28
adcs x20,x21,x16
mul x16,x12,x28
adcs x21,x22,x17
mul x17,x13,x28
adcs x22,x23,x14
umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
adcs x23,x24,x15
umulh x15,x7,x28
adcs x24,x25,x16
umulh x16,x8,x28
adcs x25,x26,x17
umulh x17,x9,x28
adc x26,xzr,xzr
adds x19,x19,x14
umulh x14,x10,x28
adcs x20,x20,x15
umulh x15,x11,x28
adcs x21,x21,x16
umulh x16,x12,x28
adcs x22,x22,x17
umulh x17,x13,x28
mul x28,x4,x19 // next t[0]*n0
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adc x26,x26,x17
cbnz x27,Lsqr8x_reduction
ldp x14,x15,[x2,#8*0]
ldp x16,x17,[x2,#8*2]
mov x0,x2
sub x27,x3,x1 // done yet?
adds x19,x19,x14
adcs x20,x20,x15
ldp x14,x15,[x2,#8*4]
adcs x21,x21,x16
adcs x22,x22,x17
ldp x16,x17,[x2,#8*6]
adcs x23,x23,x14
adcs x24,x24,x15
adcs x25,x25,x16
adcs x26,x26,x17
//adc x28,xzr,xzr // moved below
cbz x27,Lsqr8x8_post_condition
ldr x4,[x2,#-8*8]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
ldp x10,x11,[x1,#8*4]
mov x27,#-8*8
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
Lsqr8x_tail:
mul x14,x6,x4
adc x28,xzr,xzr // carry bit, modulo-scheduled
mul x15,x7,x4
add x27,x27,#8
mul x16,x8,x4
mul x17,x9,x4
adds x19,x19,x14
mul x14,x10,x4
adcs x20,x20,x15
mul x15,x11,x4
adcs x21,x21,x16
mul x16,x12,x4
adcs x22,x22,x17
mul x17,x13,x4
adcs x23,x23,x14
umulh x14,x6,x4
adcs x24,x24,x15
umulh x15,x7,x4
adcs x25,x25,x16
umulh x16,x8,x4
adcs x26,x26,x17
umulh x17,x9,x4
adc x28,x28,xzr
str x19,[x2],#8
adds x19,x20,x14
umulh x14,x10,x4
adcs x20,x21,x15
umulh x15,x11,x4
adcs x21,x22,x16
umulh x16,x12,x4
adcs x22,x23,x17
umulh x17,x13,x4
ldr x4,[x0,x27]
adcs x23,x24,x14
adcs x24,x25,x15
adcs x25,x26,x16
adcs x26,x28,x17
//adc x28,xzr,xzr // moved above
cbnz x27,Lsqr8x_tail
// note that carry flag is guaranteed
// to be zero at this point
ldp x6,x7,[x2,#8*0]
sub x27,x3,x1 // done yet?
sub x16,x3,x5 // rewound np
ldp x8,x9,[x2,#8*2]
ldp x10,x11,[x2,#8*4]
ldp x12,x13,[x2,#8*6]
cbz x27,Lsqr8x_tail_break
ldr x4,[x0,#-8*8]
adds x19,x19,x6
adcs x20,x20,x7
ldp x6,x7,[x1,#8*0]
adcs x21,x21,x8
adcs x22,x22,x9
ldp x8,x9,[x1,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x1,#8*4]
adcs x25,x25,x12
mov x27,#-8*8
adcs x26,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
//adc x28,xzr,xzr // moved above
b Lsqr8x_tail
.align 4
Lsqr8x_tail_break:
ldr x4,[x29,#112] // pull n0
add x27,x2,#8*8 // end of current t[num] window
subs xzr,x30,#1 // "move" top-most carry to carry bit
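// x30 holds a 0/1 top-most carry here, so subtracting 1 sets the
// C flag exactly when that carry is 1, injecting it into the
// adcs chain that follows.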
adcs x14,x19,x6
adcs x15,x20,x7
ldp x19,x20,[x0,#8*0]
adcs x21,x21,x8
ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
adcs x22,x22,x9
ldp x8,x9,[x16,#8*2]
adcs x23,x23,x10
adcs x24,x24,x11
ldp x10,x11,[x16,#8*4]
adcs x25,x25,x12
adcs x26,x26,x13
ldp x12,x13,[x16,#8*6]
add x1,x16,#8*8
adc x30,xzr,xzr // top-most carry
mul x28,x4,x19
stp x14,x15,[x2,#8*0]
stp x21,x22,[x2,#8*2]
ldp x21,x22,[x0,#8*2]
stp x23,x24,[x2,#8*4]
ldp x23,x24,[x0,#8*4]
cmp x27,x29 // did we hit the bottom?
stp x25,x26,[x2,#8*6]
mov x2,x0 // slide the window
ldp x25,x26,[x0,#8*6]
mov x27,#8
b.ne Lsqr8x_reduction
// Final step. We see if result is larger than modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract modulus, see if it borrowed,
// and conditionally copy original value.
ldr x0,[x29,#96] // pull rp
add x2,x2,#8*8
subs x14,x19,x6
sbcs x15,x20,x7
sub x27,x5,#8*8
mov x3,x0 // x0 copy
Lsqr8x_sub:
sbcs x16,x21,x8
ldp x6,x7,[x1,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x1,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x10,x11,[x1,#8*4]
sbcs x17,x26,x13
ldp x12,x13,[x1,#8*6]
add x1,x1,#8*8
ldp x19,x20,[x2,#8*0]
sub x27,x27,#8*8
ldp x21,x22,[x2,#8*2]
ldp x23,x24,[x2,#8*4]
ldp x25,x26,[x2,#8*6]
add x2,x2,#8*8
stp x14,x15,[x0,#8*4]
sbcs x14,x19,x6
stp x16,x17,[x0,#8*6]
add x0,x0,#8*8
sbcs x15,x20,x7
cbnz x27,Lsqr8x_sub
sbcs x16,x21,x8
mov x2,sp
add x1,sp,x5
ldp x6,x7,[x3,#8*0]
sbcs x17,x22,x9
stp x14,x15,[x0,#8*0]
sbcs x14,x23,x10
ldp x8,x9,[x3,#8*2]
sbcs x15,x24,x11
stp x16,x17,[x0,#8*2]
sbcs x16,x25,x12
ldp x19,x20,[x1,#8*0]
sbcs x17,x26,x13
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
stp x14,x15,[x0,#8*4]
stp x16,x17,[x0,#8*6]
sub x27,x5,#8*4
Lsqr4x_cond_copy:
sub x27,x27,#8*4
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
ldp x6,x7,[x3,#8*4]
ldp x19,x20,[x1,#8*4]
csel x16,x21,x8,lo
stp xzr,xzr,[x2,#8*2]
add x2,x2,#8*4
csel x17,x22,x9,lo
ldp x8,x9,[x3,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
add x3,x3,#8*4
stp xzr,xzr,[x1,#8*0]
stp xzr,xzr,[x1,#8*2]
cbnz x27,Lsqr4x_cond_copy
csel x14,x19,x6,lo
stp xzr,xzr,[x2,#8*0]
csel x15,x20,x7,lo
stp xzr,xzr,[x2,#8*2]
csel x16,x21,x8,lo
csel x17,x22,x9,lo
stp x14,x15,[x3,#8*0]
stp x16,x17,[x3,#8*2]
b Lsqr8x_done
.align 4
Lsqr8x8_post_condition:
adc x28,xzr,xzr
ldr x30,[x29,#8] // pull return address
// x19-x26,x28 hold result, x6-x13 hold modulus
subs x6,x19,x6
ldr x1,[x29,#96] // pull rp
sbcs x7,x20,x7
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x8
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x9
stp xzr,xzr,[sp,#8*4]
sbcs x10,x23,x10
stp xzr,xzr,[sp,#8*6]
sbcs x11,x24,x11
stp xzr,xzr,[sp,#8*8]
sbcs x12,x25,x12
stp xzr,xzr,[sp,#8*10]
sbcs x13,x26,x13
stp xzr,xzr,[sp,#8*12]
sbcs x28,x28,xzr // did it borrow?
stp xzr,xzr,[sp,#8*14]
// x6-x13 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
csel x10,x23,x10,lo
csel x11,x24,x11,lo
stp x8,x9,[x1,#8*2]
csel x12,x25,x12,lo
csel x13,x26,x13,lo
stp x10,x11,[x1,#8*4]
stp x12,x13,[x1,#8*6]
Lsqr8x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.def __bn_mul4x_mont
.type 32
.endef
.align 5
__bn_mul4x_mont:
// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
// return address.
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub x26,sp,x5,lsl#3
lsl x5,x5,#3
ldr x4,[x4] // *n0
sub sp,x26,#8*4 // alloca
add x10,x2,x5
add x27,x1,x5
stp x0,x10,[x29,#96] // offload rp and &b[num]
ldr x24,[x2,#8*0] // b[0]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
mov x19,xzr
mov x20,xzr
mov x21,xzr
mov x22,xzr
ldp x14,x15,[x3,#8*0] // n[0..3]
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
mov x28,#0
mov x26,sp
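// x28 steps through 8,16,24,0 (the "and #31" below wraps it),
// indexing the next b[i] within the current 4-word window of b and
// doubling as the loop-termination test (cbnz x28).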
Loop_mul4x_1st_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[0])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[0])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
sub x10,x27,x1
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_1st_reduction
cbz x10,Lmul4x4_post_condition
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldr x25,[sp] // a[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
Loop_mul4x_1st_tail:
mul x10,x6,x24 // lo(a[4..7]*b[i])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[i])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i] (or b[0])
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
adcs x23,x23,x0
umulh x13,x17,x25
adc x0,xzr,xzr
ldr x25,[sp,x28] // next t[0]*n0
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_1st_tail
sub x11,x27,x5 // rewound x1
cbz x10,Lmul4x_proceed
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b Loop_mul4x_1st_tail
.align 5
Lmul4x_proceed:
ldr x24,[x2,#8*4]! // *++b
adc x30,x0,xzr
ldp x6,x7,[x11,#8*0] // a[0..3]
sub x3,x3,x5 // rewind np
ldp x8,x9,[x11,#8*2]
add x1,x11,#8*4
stp x19,x20,[x26,#8*0] // result!!!
ldp x19,x20,[sp,#8*4] // t[0..3]
stp x21,x22,[x26,#8*2] // result!!!
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x3,#8*0] // n[0..3]
mov x26,sp
ldp x16,x17,[x3,#8*2]
adds x3,x3,#8*4 // clear carry bit
mov x0,xzr
.align 4
Loop_mul4x_reduction:
mul x10,x6,x24 // lo(a[0..3]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[0..3]*b[4])
adcs x20,x20,x11
mul x25,x19,x4 // t[0]*n0
adcs x21,x21,x12
umulh x11,x7,x24
adcs x22,x22,x13
umulh x12,x8,x24
adc x23,xzr,xzr
umulh x13,x9,x24
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
// (*) mul x10,x14,x25
str x25,[x26],#8 // put aside t[0]*n0 for tail processing
adcs x21,x21,x11
mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
// (*) adds xzr,x19,x10
subs xzr,x19,#1 // (*)
umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
adcs x19,x20,x11
umulh x11,x15,x25
adcs x20,x21,x12
umulh x12,x16,x25
adcs x21,x22,x13
umulh x13,x17,x25
adcs x22,x23,x0
adc x0,xzr,xzr
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_reduction
adc x0,x0,xzr
ldp x10,x11,[x26,#8*4] // t[4..7]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0] // a[4..7]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldr x25,[sp] // t[0]*n0
ldp x14,x15,[x3,#8*0] // n[4..7]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
.align 4
Loop_mul4x_tail:
mul x10,x6,x24 // lo(a[4..7]*b[4])
adc x0,x0,xzr // modulo-scheduled
mul x11,x7,x24
add x28,x28,#8
mul x12,x8,x24
and x28,x28,#31
mul x13,x9,x24
adds x19,x19,x10
umulh x10,x6,x24 // hi(a[4..7]*b[4])
adcs x20,x20,x11
umulh x11,x7,x24
adcs x21,x21,x12
umulh x12,x8,x24
adcs x22,x22,x13
umulh x13,x9,x24
adc x23,xzr,xzr
ldr x24,[x2,x28] // next b[i]
adds x20,x20,x10
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
adcs x21,x21,x11
mul x11,x15,x25
adcs x22,x22,x12
mul x12,x16,x25
adc x23,x23,x13 // can't overflow
mul x13,x17,x25
adds x19,x19,x10
umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
adcs x20,x20,x11
umulh x11,x15,x25
adcs x21,x21,x12
umulh x12,x16,x25
adcs x22,x22,x13
umulh x13,x17,x25
adcs x23,x23,x0
ldr x25,[sp,x28] // next a[0]*n0
adc x0,xzr,xzr
str x19,[x26],#8 // result!!!
adds x19,x20,x10
sub x10,x27,x1 // done yet?
adcs x20,x21,x11
adcs x21,x22,x12
adcs x22,x23,x13
//adc x0,x0,xzr
cbnz x28,Loop_mul4x_tail
sub x11,x3,x5 // rewound np?
adc x0,x0,xzr
cbz x10,Loop_mul4x_break
ldp x10,x11,[x26,#8*4]
ldp x12,x13,[x26,#8*6]
ldp x6,x7,[x1,#8*0]
ldp x8,x9,[x1,#8*2]
add x1,x1,#8*4
adds x19,x19,x10
adcs x20,x20,x11
adcs x21,x21,x12
adcs x22,x22,x13
//adc x0,x0,xzr
ldp x14,x15,[x3,#8*0]
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
b Loop_mul4x_tail
.align 4
Loop_mul4x_break:
ldp x12,x13,[x29,#96] // pull rp and &b[num]
adds x19,x19,x30
add x2,x2,#8*4 // bp++
adcs x20,x20,xzr
sub x1,x1,x5 // rewind ap
adcs x21,x21,xzr
stp x19,x20,[x26,#8*0] // result!!!
adcs x22,x22,xzr
ldp x19,x20,[sp,#8*4] // t[0..3]
adc x30,x0,xzr
stp x21,x22,[x26,#8*2] // result!!!
cmp x2,x13 // done yet?
ldp x21,x22,[sp,#8*6]
ldp x14,x15,[x11,#8*0] // n[0..3]
ldp x16,x17,[x11,#8*2]
add x3,x11,#8*4
b.eq Lmul4x_post
ldr x24,[x2]
ldp x6,x7,[x1,#8*0] // a[0..3]
ldp x8,x9,[x1,#8*2]
adds x1,x1,#8*4 // clear carry bit
mov x0,xzr
mov x26,sp
b Loop_mul4x_reduction
.align 4
Lmul4x_post:
// Final step. We see if result is larger than modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract modulus, see if it borrowed,
// and conditionally copy original value.
mov x0,x12
mov x27,x12 // x0 copy
subs x10,x19,x14
add x26,sp,#8*8
sbcs x11,x20,x15
sub x28,x5,#8*4
Lmul4x_sub:
sbcs x12,x21,x16
ldp x14,x15,[x3,#8*0]
sub x28,x28,#8*4
ldp x19,x20,[x26,#8*0]
sbcs x13,x22,x17
ldp x16,x17,[x3,#8*2]
add x3,x3,#8*4
ldp x21,x22,[x26,#8*2]
add x26,x26,#8*4
stp x10,x11,[x0,#8*0]
sbcs x10,x19,x14
stp x12,x13,[x0,#8*2]
add x0,x0,#8*4
sbcs x11,x20,x15
cbnz x28,Lmul4x_sub
sbcs x12,x21,x16
mov x26,sp
add x1,sp,#8*4
ldp x6,x7,[x27,#8*0]
sbcs x13,x22,x17
stp x10,x11,[x0,#8*0]
ldp x8,x9,[x27,#8*2]
stp x12,x13,[x0,#8*2]
ldp x19,x20,[x1,#8*0]
ldp x21,x22,[x1,#8*2]
sbcs xzr,x30,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
sub x28,x5,#8*4
Lmul4x_cond_copy:
sub x28,x28,#8*4
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
ldp x6,x7,[x27,#8*4]
ldp x19,x20,[x1,#8*4]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*2]
add x26,x26,#8*4
csel x13,x22,x9,lo
ldp x8,x9,[x27,#8*6]
ldp x21,x22,[x1,#8*6]
add x1,x1,#8*4
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
add x27,x27,#8*4
cbnz x28,Lmul4x_cond_copy
csel x10,x19,x6,lo
stp xzr,xzr,[x26,#8*0]
csel x11,x20,x7,lo
stp xzr,xzr,[x26,#8*2]
csel x12,x21,x8,lo
stp xzr,xzr,[x26,#8*3]
csel x13,x22,x9,lo
stp xzr,xzr,[x26,#8*4]
stp x10,x11,[x27,#8*0]
stp x12,x13,[x27,#8*2]
b Lmul4x_done
.align 4
Lmul4x4_post_condition:
adc x0,x0,xzr
ldr x1,[x29,#96] // pull rp
// x19-x22,x0 hold result, x14-x17 hold modulus
subs x6,x19,x14
ldr x30,[x29,#8] // pull return address
sbcs x7,x20,x15
stp xzr,xzr,[sp,#8*0]
sbcs x8,x21,x16
stp xzr,xzr,[sp,#8*2]
sbcs x9,x22,x17
stp xzr,xzr,[sp,#8*4]
sbcs xzr,x0,xzr // did it borrow?
stp xzr,xzr,[sp,#8*6]
// x6-x9 hold result-modulus
csel x6,x19,x6,lo
csel x7,x20,x7,lo
csel x8,x21,x8,lo
csel x9,x22,x9,lo
stp x6,x7,[x1,#8*0]
stp x8,x9,[x1,#8*2]
Lmul4x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
// x30 is popped earlier
AARCH64_VALIDATE_LINK_REGISTER
ret
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
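// (The .byte string above spells "Montgomery Multiplication for ARMv8,
// CRYPTOGAMS by <appro@openssl.org>".)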
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM