rsaz-avx2.asm 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972
  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. %ifdef BORINGSSL_PREFIX
  8. %include "boringssl_prefix_symbols_nasm.inc"
  9. %endif
  10. section .text code align=64
  ; rsaz_1024_sqr_avx2 -- exported entry point (Win64 flavour of the generated code).
  ; Arguments arrive in the Microsoft x64 registers rcx, rdx, r8, r9 plus a fifth
  ; value read from [40+rsp]; they are moved into rdi, rsi, rdx, rcx, r8 so the
  ; body can use those registers. As used further down in this routine:
  ;   rdi = destination, written at the end of every outer pass
  ;   rsi = source operand that is squared (becomes rdi after each pass)
  ;   rdx = table read through r13 during the reduction phase
  ;   rcx = 32-bit factor used to derive each 29-bit reduction multiplier
  ;   r8d = outer pass counter (decremented to zero)
  ; NOTE(review): the symbol name suggests a 1024-bit modular (presumably
  ; Montgomery) squaring -- confirm against the Perl generator.
  11. global rsaz_1024_sqr_avx2
  12. ALIGN 64
  13. rsaz_1024_sqr_avx2:
  ; rdi/rsi are callee-saved on Win64: park them in the caller-provided slots
  ; just above the return address; the epilogue reloads them from there.
  14. mov QWORD[8+rsp],rdi ;WIN64 prologue
  15. mov QWORD[16+rsp],rsi
  16. mov rax,rsp
  17. $L$SEH_begin_rsaz_1024_sqr_avx2:
  ; Shuffle the Win64 argument registers into the registers the body expects.
  18. mov rdi,rcx
  19. mov rsi,rdx
  20. mov rdx,r8
  21. mov rcx,r9
  22. mov r8,QWORD[40+rsp]
  ; rax keeps the entry-time rsp; every save slot below is addressed from it.
  23. lea rax,[rsp]
  ; Six pushes = 48 bytes; they land at rax-8 (rbx) down to rax-48 (r15).
  24. push rbx
  25. push rbp
  26. push r12
  27. push r13
  28. push r14
  29. push r15
  30. vzeroupper
  ; Reserve 168 bytes below the pushed GPRs for the ten 16-byte XMM saves
  ; (rax-216 .. rax-57); xmm6-xmm15 are callee-saved under the Win64 ABI.
  31. lea rsp,[((-168))+rsp]
  32. vmovaps XMMWORD[(-216)+rax],xmm6
  33. vmovaps XMMWORD[(-200)+rax],xmm7
  34. vmovaps XMMWORD[(-184)+rax],xmm8
  35. vmovaps XMMWORD[(-168)+rax],xmm9
  36. vmovaps XMMWORD[(-152)+rax],xmm10
  37. vmovaps XMMWORD[(-136)+rax],xmm11
  38. vmovaps XMMWORD[(-120)+rax],xmm12
  39. vmovaps XMMWORD[(-104)+rax],xmm13
  40. vmovaps XMMWORD[(-88)+rax],xmm14
  41. vmovaps XMMWORD[(-72)+rax],xmm15
  ; Frame setup. rbp remembers the entry-time rsp (held in rax) so the exit path
  ; can find the register save area after rsp has been realigned.
  42. $L$sqr_1024_body:
  43. mov rbp,rax
  44. mov r13,rdx
  ; 832 bytes of scratch for the product accumulators and the doubled operand.
  45. sub rsp,832
  46. mov r15,r13
  ; Bias the three pointers by +128 (sub of -128); every later displacement is
  ; therefore written in the form (offset-128).
  47. sub rdi,-128
  48. sub rsi,-128
  49. sub r13,-128
  ; r15 = ((table address & 4095) + 320) >> 12. The result is non-zero exactly
  ; when the 320 bytes starting at the table extend to or past the next
  ; 4096-byte boundary.
  50. and r15,4095
  51. add r15,32*10
  52. shr r15,12
  ; vpxor does not touch the flags, so the jz below still tests the shr result.
  53. vpxor ymm9,ymm9,ymm9
  54. jz NEAR $L$sqr_1024_no_n_copy
  ; Boundary case: make a private 320-byte copy of the table on the stack and
  ; repoint r13 at it. NOTE(review): presumably done so the unaligned table
  ; loads in the reduction loop never straddle a page -- confirm with generator.
  55. sub rsp,32*10
  56. vmovdqu ymm0,YMMWORD[((0-128))+r13]
  ; Align the enlarged frame down to 2048 bytes (interleaved with the loads).
  57. and rsp,-2048
  58. vmovdqu ymm1,YMMWORD[((32-128))+r13]
  59. vmovdqu ymm2,YMMWORD[((64-128))+r13]
  60. vmovdqu ymm3,YMMWORD[((96-128))+r13]
  61. vmovdqu ymm4,YMMWORD[((128-128))+r13]
  62. vmovdqu ymm5,YMMWORD[((160-128))+r13]
  63. vmovdqu ymm6,YMMWORD[((192-128))+r13]
  64. vmovdqu ymm7,YMMWORD[((224-128))+r13]
  65. vmovdqu ymm8,YMMWORD[((256-128))+r13]
  ; The copy lives at rsp+832; r13 keeps the same +128 bias as before.
  66. lea r13,[((832+128))+rsp]
  67. vmovdqu YMMWORD[(0-128)+r13],ymm0
  68. vmovdqu YMMWORD[(32-128)+r13],ymm1
  69. vmovdqu YMMWORD[(64-128)+r13],ymm2
  70. vmovdqu YMMWORD[(96-128)+r13],ymm3
  71. vmovdqu YMMWORD[(128-128)+r13],ymm4
  72. vmovdqu YMMWORD[(160-128)+r13],ymm5
  73. vmovdqu YMMWORD[(192-128)+r13],ymm6
  74. vmovdqu YMMWORD[(224-128)+r13],ymm7
  75. vmovdqu YMMWORD[(256-128)+r13],ymm8
  ; Tenth 32-byte slot is filled with zeros (ymm9 was cleared above).
  76. vmovdqu YMMWORD[(288-128)+r13],ymm9
  ; Common path, reached with or without the table copy: align the scratch frame
  ; and preload the source operand.
  77. $L$sqr_1024_no_n_copy:
  ; Align rsp down to 1024 bytes (no change if it was already aligned to 2048).
  78. and rsp,-1024
  ; Load source vectors 1..8 (byte offsets 32..256). Vector 0 is not held in a
  ; register; the loop reads it straight from memory through rsi.
  79. vmovdqu ymm1,YMMWORD[((32-128))+rsi]
  80. vmovdqu ymm2,YMMWORD[((64-128))+rsi]
  81. vmovdqu ymm3,YMMWORD[((96-128))+rsi]
  82. vmovdqu ymm4,YMMWORD[((128-128))+rsi]
  83. vmovdqu ymm5,YMMWORD[((160-128))+rsi]
  84. vmovdqu ymm6,YMMWORD[((192-128))+rsi]
  85. vmovdqu ymm7,YMMWORD[((224-128))+rsi]
  86. vmovdqu ymm8,YMMWORD[((256-128))+rsi]
  ; rbx = rsp+192, so an operand written (X-192)+rbx addresses rsp+X.
  87. lea rbx,[192+rsp]
  ; ymm15 = constant from $L$and_mask (defined outside this section). It is
  ; and-ed with limbs right after their bits above bit 28 are extracted with a
  ; 29-bit shift. NOTE(review): presumably the 29-bit limb mask -- confirm.
  88. vmovdqu ymm15,YMMWORD[$L$and_mask]
  89. jmp NEAR $L$OOP_GRANDE_SQR_1024
  ; Head of the outer loop: one trip per requested squaring (r8d counts them at
  ; the bottom). On entry ymm1..ymm8 hold source vectors 1..8 and rsi points at
  ; the source (biased by +128).
  90. ALIGN 32
  91. $L$OOP_GRANDE_SQR_1024:
  ; r9  -> rsp+576 (biased +128): storage for the doubled source vectors 1..8.
  ; r12 -> rsp+448: so (X-448)+r12 addresses rsp+X (upper accumulators).
  92. lea r9,[((576+128))+rsp]
  93. lea r12,[448+rsp]
  ; Double vectors 1..8 (x+x) and spill the doubled copies to [r9...]; the
  ; loop multiplies against these doubled values. ymm10 = qword 0 of source
  ; vector 0, broadcast to all four lanes.
  94. vpaddq ymm1,ymm1,ymm1
  95. vpbroadcastq ymm10,QWORD[((0-128))+rsi]
  96. vpaddq ymm2,ymm2,ymm2
  97. vmovdqa YMMWORD[(0-128)+r9],ymm1
  98. vpaddq ymm3,ymm3,ymm3
  99. vmovdqa YMMWORD[(32-128)+r9],ymm2
  100. vpaddq ymm4,ymm4,ymm4
  101. vmovdqa YMMWORD[(64-128)+r9],ymm3
  102. vpaddq ymm5,ymm5,ymm5
  103. vmovdqa YMMWORD[(96-128)+r9],ymm4
  104. vpaddq ymm6,ymm6,ymm6
  105. vmovdqa YMMWORD[(128-128)+r9],ymm5
  106. vpaddq ymm7,ymm7,ymm7
  107. vmovdqa YMMWORD[(160-128)+r9],ymm6
  108. vpaddq ymm8,ymm8,ymm8
  109. vmovdqa YMMWORD[(192-128)+r9],ymm7
  110. vpxor ymm9,ymm9,ymm9
  111. vmovdqa YMMWORD[(224-128)+r9],ymm8
  ; First partial products: ymm0 = ymm10 * vector 0 (undoubled, from memory),
  ; ymm1..ymm8 = ymm10 * doubled vectors 1..8. Interleaved with that, the
  ; accumulator slots rsp+288 .. rsp+575 are cleared with the zero in ymm9.
  112. vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi]
  ; ymm11 = qword 0 of source vector 1, broadcast; consumed at $L$sqr_entry_1024.
  113. vpbroadcastq ymm11,QWORD[((32-128))+rsi]
  114. vmovdqu YMMWORD[(288-192)+rbx],ymm9
  115. vpmuludq ymm1,ymm1,ymm10
  116. vmovdqu YMMWORD[(320-448)+r12],ymm9
  117. vpmuludq ymm2,ymm2,ymm10
  118. vmovdqu YMMWORD[(352-448)+r12],ymm9
  119. vpmuludq ymm3,ymm3,ymm10
  120. vmovdqu YMMWORD[(384-448)+r12],ymm9
  121. vpmuludq ymm4,ymm4,ymm10
  122. vmovdqu YMMWORD[(416-448)+r12],ymm9
  123. vpmuludq ymm5,ymm5,ymm10
  124. vmovdqu YMMWORD[(448-448)+r12],ymm9
  125. vpmuludq ymm6,ymm6,ymm10
  126. vmovdqu YMMWORD[(480-448)+r12],ymm9
  127. vpmuludq ymm7,ymm7,ymm10
  128. vmovdqu YMMWORD[(512-448)+r12],ymm9
  129. vpmuludq ymm8,ymm8,ymm10
  ; ymm10 is reloaded with qword 0 of source vector 2 for the next product row.
  130. vpbroadcastq ymm10,QWORD[((64-128))+rsi]
  131. vmovdqu YMMWORD[(544-448)+r12],ymm9
  ; r15 walks the source one qword per inner pass; r14d = 4 inner passes.
  132. mov r15,rsi
  133. mov r14d,4
  ; The first pass already has ymm0..ymm8 computed, so skip the loop top.
  134. jmp NEAR $L$sqr_entry_1024
  ; Inner squaring loop, four passes (r14d). Each pass broadcasts one qword from
  ; each of the nine source vectors (addressed through r15), multiplies it with
  ; vpmuludq against source vector 0 / the doubled vectors at r9, and adds the
  ; products into the accumulators addressed through rbx and r12. At the end of
  ; a pass rbx, r15 and r12 each advance by 8 bytes (one qword).
  135. ALIGN 32
  136. $L$OOP_SQR_1024:
  ; Loop top (passes 2..4 only): rebuild ymm0..ymm8 as ymm10 * {vector 0,
  ; doubled vectors 1..8} plus the accumulators currently at [rbx-192 ...].
  137. vpbroadcastq ymm11,QWORD[((32-128))+r15]
  138. vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi]
  139. vpaddq ymm0,ymm0,YMMWORD[((0-192))+rbx]
  140. vpmuludq ymm1,ymm10,YMMWORD[((0-128))+r9]
  141. vpaddq ymm1,ymm1,YMMWORD[((32-192))+rbx]
  142. vpmuludq ymm2,ymm10,YMMWORD[((32-128))+r9]
  143. vpaddq ymm2,ymm2,YMMWORD[((64-192))+rbx]
  144. vpmuludq ymm3,ymm10,YMMWORD[((64-128))+r9]
  145. vpaddq ymm3,ymm3,YMMWORD[((96-192))+rbx]
  146. vpmuludq ymm4,ymm10,YMMWORD[((96-128))+r9]
  147. vpaddq ymm4,ymm4,YMMWORD[((128-192))+rbx]
  148. vpmuludq ymm5,ymm10,YMMWORD[((128-128))+r9]
  149. vpaddq ymm5,ymm5,YMMWORD[((160-192))+rbx]
  150. vpmuludq ymm6,ymm10,YMMWORD[((160-128))+r9]
  151. vpaddq ymm6,ymm6,YMMWORD[((192-192))+rbx]
  152. vpmuludq ymm7,ymm10,YMMWORD[((192-128))+r9]
  153. vpaddq ymm7,ymm7,YMMWORD[((224-192))+rbx]
  154. vpmuludq ymm8,ymm10,YMMWORD[((224-128))+r9]
  155. vpbroadcastq ymm10,QWORD[((64-128))+r15]
  156. vpaddq ymm8,ymm8,YMMWORD[((256-192))+rbx]
  ; First-pass entry point: ymm0..ymm8 hold the row-0 products, ymm11 the
  ; broadcast from vector 1 and ymm10 the broadcast from vector 2.
  157. $L$sqr_entry_1024:
  ; Accumulators 0 and 1 are complete for this pass; write them out.
  158. vmovdqu YMMWORD[(0-192)+rbx],ymm0
  159. vmovdqu YMMWORD[(32-192)+rbx],ymm1
  ; Row for vector 1 (ymm11): the square term uses the undoubled vector from
  ; rsi, the cross terms use the doubled vectors from r9.
  160. vpmuludq ymm12,ymm11,YMMWORD[((32-128))+rsi]
  161. vpaddq ymm2,ymm2,ymm12
  162. vpmuludq ymm14,ymm11,YMMWORD[((32-128))+r9]
  163. vpaddq ymm3,ymm3,ymm14
  164. vpmuludq ymm13,ymm11,YMMWORD[((64-128))+r9]
  165. vpaddq ymm4,ymm4,ymm13
  166. vpmuludq ymm12,ymm11,YMMWORD[((96-128))+r9]
  167. vpaddq ymm5,ymm5,ymm12
  168. vpmuludq ymm14,ymm11,YMMWORD[((128-128))+r9]
  169. vpaddq ymm6,ymm6,ymm14
  170. vpmuludq ymm13,ymm11,YMMWORD[((160-128))+r9]
  171. vpaddq ymm7,ymm7,ymm13
  172. vpmuludq ymm12,ymm11,YMMWORD[((192-128))+r9]
  173. vpaddq ymm8,ymm8,ymm12
  ; ymm0 is recycled as accumulator 9 (seeded from memory at rbx+96).
  174. vpmuludq ymm0,ymm11,YMMWORD[((224-128))+r9]
  175. vpbroadcastq ymm11,QWORD[((96-128))+r15]
  176. vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx]
  177. vmovdqu YMMWORD[(64-192)+rbx],ymm2
  178. vmovdqu YMMWORD[(96-192)+rbx],ymm3
  ; Row for vector 2 (ymm10).
  179. vpmuludq ymm13,ymm10,YMMWORD[((64-128))+rsi]
  180. vpaddq ymm4,ymm4,ymm13
  181. vpmuludq ymm12,ymm10,YMMWORD[((64-128))+r9]
  182. vpaddq ymm5,ymm5,ymm12
  183. vpmuludq ymm14,ymm10,YMMWORD[((96-128))+r9]
  184. vpaddq ymm6,ymm6,ymm14
  185. vpmuludq ymm13,ymm10,YMMWORD[((128-128))+r9]
  186. vpaddq ymm7,ymm7,ymm13
  187. vpmuludq ymm12,ymm10,YMMWORD[((160-128))+r9]
  188. vpaddq ymm8,ymm8,ymm12
  189. vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9]
  190. vpaddq ymm0,ymm0,ymm14
  ; ymm1 is recycled as accumulator 10 (seeded from memory via r12).
  191. vpmuludq ymm1,ymm10,YMMWORD[((224-128))+r9]
  192. vpbroadcastq ymm10,QWORD[((128-128))+r15]
  193. vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12]
  194. vmovdqu YMMWORD[(128-192)+rbx],ymm4
  195. vmovdqu YMMWORD[(160-192)+rbx],ymm5
  ; Row for vector 3 (ymm11).
  196. vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rsi]
  197. vpaddq ymm6,ymm6,ymm12
  198. vpmuludq ymm14,ymm11,YMMWORD[((96-128))+r9]
  199. vpaddq ymm7,ymm7,ymm14
  200. vpmuludq ymm13,ymm11,YMMWORD[((128-128))+r9]
  201. vpaddq ymm8,ymm8,ymm13
  202. vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9]
  203. vpaddq ymm0,ymm0,ymm12
  204. vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9]
  205. vpaddq ymm1,ymm1,ymm14
  206. vpmuludq ymm2,ymm11,YMMWORD[((224-128))+r9]
  207. vpbroadcastq ymm11,QWORD[((160-128))+r15]
  208. vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12]
  209. vmovdqu YMMWORD[(192-192)+rbx],ymm6
  210. vmovdqu YMMWORD[(224-192)+rbx],ymm7
  ; Row for vector 4 (ymm10).
  211. vpmuludq ymm12,ymm10,YMMWORD[((128-128))+rsi]
  212. vpaddq ymm8,ymm8,ymm12
  213. vpmuludq ymm14,ymm10,YMMWORD[((128-128))+r9]
  214. vpaddq ymm0,ymm0,ymm14
  215. vpmuludq ymm13,ymm10,YMMWORD[((160-128))+r9]
  216. vpaddq ymm1,ymm1,ymm13
  217. vpmuludq ymm12,ymm10,YMMWORD[((192-128))+r9]
  218. vpaddq ymm2,ymm2,ymm12
  219. vpmuludq ymm3,ymm10,YMMWORD[((224-128))+r9]
  220. vpbroadcastq ymm10,QWORD[((192-128))+r15]
  221. vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12]
  222. vmovdqu YMMWORD[(256-192)+rbx],ymm8
  223. vmovdqu YMMWORD[(288-192)+rbx],ymm0
  ; Slide the low accumulator window up by one qword for the next pass.
  224. lea rbx,[8+rbx]
  ; Row for vector 5 (ymm11).
  225. vpmuludq ymm13,ymm11,YMMWORD[((160-128))+rsi]
  226. vpaddq ymm1,ymm1,ymm13
  227. vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9]
  228. vpaddq ymm2,ymm2,ymm12
  229. vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9]
  230. vpaddq ymm3,ymm3,ymm14
  231. vpmuludq ymm4,ymm11,YMMWORD[((224-128))+r9]
  232. vpbroadcastq ymm11,QWORD[((224-128))+r15]
  233. vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12]
  234. vmovdqu YMMWORD[(320-448)+r12],ymm1
  235. vmovdqu YMMWORD[(352-448)+r12],ymm2
  ; Row for vector 6 (ymm10); ymm0 picks up the broadcast from vector 8.
  236. vpmuludq ymm12,ymm10,YMMWORD[((192-128))+rsi]
  237. vpaddq ymm3,ymm3,ymm12
  238. vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9]
  239. vpbroadcastq ymm0,QWORD[((256-128))+r15]
  240. vpaddq ymm4,ymm4,ymm14
  241. vpmuludq ymm5,ymm10,YMMWORD[((224-128))+r9]
  ; Prefetch next pass's vector-0 broadcast: the following qword of vector 0.
  242. vpbroadcastq ymm10,QWORD[((0+8-128))+r15]
  243. vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12]
  244. vmovdqu YMMWORD[(384-448)+r12],ymm3
  245. vmovdqu YMMWORD[(416-448)+r12],ymm4
  ; Advance the source cursor by one qword.
  246. lea r15,[8+r15]
  ; Rows for vector 7 (ymm11) and vector 8 (ymm0, square term only).
  247. vpmuludq ymm12,ymm11,YMMWORD[((224-128))+rsi]
  248. vpaddq ymm5,ymm5,ymm12
  249. vpmuludq ymm6,ymm11,YMMWORD[((224-128))+r9]
  250. vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12]
  251. vpmuludq ymm7,ymm0,YMMWORD[((256-128))+rsi]
  252. vmovdqu YMMWORD[(448-448)+r12],ymm5
  253. vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12]
  254. vmovdqu YMMWORD[(480-448)+r12],ymm6
  255. vmovdqu YMMWORD[(512-448)+r12],ymm7
  ; Slide the upper accumulator window by one qword and loop.
  256. lea r12,[8+r12]
  257. dec r14d
  258. jnz NEAR $L$OOP_SQR_1024
  ; Post-loop carry fix-up on the accumulators at rsp+256, rsp+288, rsp+320.
  259. vmovdqu ymm8,YMMWORD[256+rsp]
  260. vmovdqu ymm1,YMMWORD[288+rsp]
  261. vmovdqu ymm2,YMMWORD[320+rsp]
  ; Reset rbx to rsp+192 (it was advanced by 8 on every pass).
  262. lea rbx,[192+rsp]
  ; Split each lane into its low 29 bits (and with ymm15) and the bits above
  ; (shift right by 29).
  263. vpsrlq ymm14,ymm8,29
  264. vpand ymm8,ymm8,ymm15
  265. vpsrlq ymm11,ymm1,29
  266. vpand ymm1,ymm1,ymm15
  ; vpermq 0x93 rotates the four qword lanes up by one (lane 3 wraps to lane 0),
  ; so each carry lines up with the next-higher limb.
  267. vpermq ymm14,ymm14,0x93
  268. vpxor ymm9,ymm9,ymm9
  269. vpermq ymm11,ymm11,0x93
  ; vpblendd imm 3 takes the low qword from the last operand and the rest from
  ; the middle one: ymm10 = carries staying inside ymm8 (lane 0 zeroed),
  ; ymm14 = carry out of ymm8 (lane 0) plus carries inside ymm1,
  ; ymm11 = carry out of ymm1 in lane 0, zeros elsewhere.
  270. vpblendd ymm10,ymm14,ymm9,3
  271. vpblendd ymm14,ymm11,ymm14,3
  272. vpaddq ymm8,ymm8,ymm10
  273. vpblendd ymm11,ymm9,ymm11,3
  274. vpaddq ymm1,ymm1,ymm14
  275. vpaddq ymm2,ymm2,ymm11
  ; ymm8 stays in its register; the other two go back to rsp+288 / rsp+320.
  276. vmovdqu YMMWORD[(288-192)+rbx],ymm1
  277. vmovdqu YMMWORD[(320-192)+rbx],ymm2
  ; Reduction phase. The four lowest accumulator qwords (rsp+0..rsp+24) are
  ; handled in scalar registers rax/r10/r11/r12; the vectors at rsp+32..rsp+224
  ; are loaded into ymm1..ymm7 (ymm8 is still live from the preceding fix-up).
  ; Each step forms a 29-bit multiplier m = (low32(limb) * ecx) & 0x1fffffff and
  ; adds m times the table at r13 into the accumulators, in scalar form for the
  ; first four table qwords and with vpmuludq for the table vectors.
  ; NOTE(review): this has the shape of word-by-word Montgomery reduction with
  ; 29-bit limbs -- confirm against the Perl generator.
  278. mov rax,QWORD[rsp]
  279. mov r10,QWORD[8+rsp]
  280. mov r11,QWORD[16+rsp]
  281. mov r12,QWORD[24+rsp]
  282. vmovdqu ymm1,YMMWORD[32+rsp]
  283. vmovdqu ymm2,YMMWORD[((64-192))+rbx]
  284. vmovdqu ymm3,YMMWORD[((96-192))+rbx]
  285. vmovdqu ymm4,YMMWORD[((128-192))+rbx]
  286. vmovdqu ymm5,YMMWORD[((160-192))+rbx]
  287. vmovdqu ymm6,YMMWORD[((192-192))+rbx]
  288. vmovdqu ymm7,YMMWORD[((224-192))+rbx]
  ; First multiplier, derived from limb 0 (kept in r9); broadcast into ymm12.
  289. mov r9,rax
  290. imul eax,ecx
  291. and eax,0x1fffffff
  292. vmovd xmm12,eax
  ; rdx keeps the multiplier while rax is reused for the individual products.
  293. mov rdx,rax
  294. imul rax,QWORD[((-128))+r13]
  295. vpbroadcastq ymm12,xmm12
  296. add r9,rax
  297. mov rax,rdx
  298. imul rax,QWORD[((8-128))+r13]
  ; Everything above bit 28 of limb 0 is carried into limb 1 (r10).
  299. shr r9,29
  300. add r10,rax
  301. mov rax,rdx
  302. imul rax,QWORD[((16-128))+r13]
  303. add r10,r9
  304. add r11,rax
  305. imul rdx,QWORD[((24-128))+r13]
  306. add r12,rdx
  ; Second multiplier, derived from the updated limb 1; the loop top broadcasts it.
  307. mov rax,r10
  308. imul eax,ecx
  309. and eax,0x1fffffff
  ; Nine loop trips; each trip consumes four multipliers
  ; (held in ymm12, ymm13, ymm12 again, then ymm0).
  310. mov r14d,9
  311. jmp NEAR $L$OOP_REDUCE_1024
  312. ALIGN 32
  313. $L$OOP_REDUCE_1024:
  ; ymm13 = multiplier computed at the end of the previous step (in eax).
  314. vmovd xmm13,eax
  315. vpbroadcastq ymm13,xmm13
  ; Vector part for multiplier ymm12: table vectors at offsets 32..256 are
  ; multiplied and added into ymm1..ymm8. Interleaved scalar part: multiplier
  ; in rdx times table qwords 0..2, added into r10/r11/r12.
  316. vpmuludq ymm10,ymm12,YMMWORD[((32-128))+r13]
  317. mov rdx,rax
  318. imul rax,QWORD[((-128))+r13]
  319. vpaddq ymm1,ymm1,ymm10
  320. add r10,rax
  321. vpmuludq ymm14,ymm12,YMMWORD[((64-128))+r13]
  322. mov rax,rdx
  323. imul rax,QWORD[((8-128))+r13]
  324. vpaddq ymm2,ymm2,ymm14
  325. vpmuludq ymm11,ymm12,YMMWORD[((96-128))+r13]
  ; DB 0x67 emits a bare address-size prefix in front of an instruction that has
  ; no memory operand. NOTE(review): presumably padding for code alignment.
  326. DB 0x67
  327. add r11,rax
  328. DB 0x67
  329. mov rax,rdx
  330. imul rax,QWORD[((16-128))+r13]
  331. shr r10,29
  332. vpaddq ymm3,ymm3,ymm11
  333. vpmuludq ymm10,ymm12,YMMWORD[((128-128))+r13]
  334. add r12,rax
  335. add r11,r10
  336. vpaddq ymm4,ymm4,ymm10
  337. vpmuludq ymm14,ymm12,YMMWORD[((160-128))+r13]
  ; Third multiplier of this trip, from the updated r11.
  338. mov rax,r11
  339. imul eax,ecx
  340. vpaddq ymm5,ymm5,ymm14
  341. vpmuludq ymm11,ymm12,YMMWORD[((192-128))+r13]
  342. and eax,0x1fffffff
  343. vpaddq ymm6,ymm6,ymm11
  344. vpmuludq ymm10,ymm12,YMMWORD[((224-128))+r13]
  345. vpaddq ymm7,ymm7,ymm10
  346. vpmuludq ymm14,ymm12,YMMWORD[((256-128))+r13]
  ; ymm12 is done with the old multiplier; reload it with the new one.
  347. vmovd xmm12,eax
  348. vpaddq ymm8,ymm8,ymm14
  349. vpbroadcastq ymm12,xmm12
  ; Vector part for multiplier ymm13: same table, read 8 bytes lower
  ; (offsets 32-8 .. 288-8), so the products land one limb lower.
  350. vpmuludq ymm11,ymm13,YMMWORD[((32-8-128))+r13]
  351. vmovdqu ymm14,YMMWORD[((96-8-128))+r13]
  352. mov rdx,rax
  353. imul rax,QWORD[((-128))+r13]
  354. vpaddq ymm1,ymm1,ymm11
  355. vpmuludq ymm10,ymm13,YMMWORD[((64-8-128))+r13]
  356. vmovdqu ymm11,YMMWORD[((128-8-128))+r13]
  357. add r11,rax
  358. mov rax,rdx
  359. imul rax,QWORD[((8-128))+r13]
  360. vpaddq ymm2,ymm2,ymm10
  361. add rax,r12
  362. shr r11,29
  363. vpmuludq ymm14,ymm14,ymm13
  364. vmovdqu ymm10,YMMWORD[((160-8-128))+r13]
  365. add rax,r11
  366. vpaddq ymm3,ymm3,ymm14
  367. vpmuludq ymm11,ymm11,ymm13
  368. vmovdqu ymm14,YMMWORD[((192-8-128))+r13]
  369. DB 0x67
  ; Fourth multiplier of this trip, from the combined top scalar limb.
  370. mov r12,rax
  371. imul eax,ecx
  372. vpaddq ymm4,ymm4,ymm11
  373. vpmuludq ymm10,ymm10,ymm13
  ; Hand-encoded instruction; the bytes decode to
  ; vmovdqu ymm11,[r13+0x58], i.e. offset (224-8-128), with a 32-bit displacement.
  374. DB 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
  375. and eax,0x1fffffff
  376. vpaddq ymm5,ymm5,ymm10
  377. vpmuludq ymm14,ymm14,ymm13
  378. vmovdqu ymm10,YMMWORD[((256-8-128))+r13]
  379. vpaddq ymm6,ymm6,ymm14
  380. vpmuludq ymm11,ymm11,ymm13
  ; ymm9 starts a new top accumulator from the tenth table slot (offset 288-8).
  381. vmovdqu ymm9,YMMWORD[((288-8-128))+r13]
  382. vmovd xmm0,eax
  383. imul rax,QWORD[((-128))+r13]
  384. vpaddq ymm7,ymm7,ymm11
  385. vpmuludq ymm10,ymm10,ymm13
  386. vmovdqu ymm14,YMMWORD[((32-16-128))+r13]
  387. vpbroadcastq ymm0,xmm0
  388. vpaddq ymm8,ymm8,ymm10
  389. vpmuludq ymm9,ymm9,ymm13
  ; Vector part for the reloaded ymm12 (table read 16 bytes lower), with the
  ; first product for ymm0 (table read 24 bytes lower) interleaved.
  390. vmovdqu ymm11,YMMWORD[((64-16-128))+r13]
  391. add r12,rax
  392. vmovdqu ymm13,YMMWORD[((32-24-128))+r13]
  393. vpmuludq ymm14,ymm14,ymm12
  394. vmovdqu ymm10,YMMWORD[((96-16-128))+r13]
  395. vpaddq ymm1,ymm1,ymm14
  396. vpmuludq ymm13,ymm13,ymm0
  397. vpmuludq ymm11,ymm11,ymm12
  ; Hand-encoded instruction; the bytes decode to
  ; vmovdqu ymm14,[r13-16], i.e. offset (128-16-128), with a 32-bit displacement.
  398. DB 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
  ; ymm13 = finished lowest vector of this trip (old ymm1 plus last product).
  399. vpaddq ymm13,ymm13,ymm1
  400. vpaddq ymm2,ymm2,ymm11
  401. vpmuludq ymm10,ymm10,ymm12
  402. vmovdqu ymm11,YMMWORD[((160-16-128))+r13]
  403. DB 0x67
  ; Hand the lowest vector to the scalar side: lane 0 goes to rax, and the whole
  ; vector is spilled to [rsp] so lanes 1..3 can be read back as qwords below.
  404. vmovq rax,xmm13
  405. vmovdqu YMMWORD[rsp],ymm13
  406. vpaddq ymm3,ymm3,ymm10
  407. vpmuludq ymm14,ymm14,ymm12
  408. vmovdqu ymm10,YMMWORD[((192-16-128))+r13]
  409. vpaddq ymm4,ymm4,ymm14
  410. vpmuludq ymm11,ymm11,ymm12
  411. vmovdqu ymm14,YMMWORD[((224-16-128))+r13]
  412. vpaddq ymm5,ymm5,ymm11
  413. vpmuludq ymm10,ymm10,ymm12
  414. vmovdqu ymm11,YMMWORD[((256-16-128))+r13]
  415. vpaddq ymm6,ymm6,ymm10
  416. vpmuludq ymm14,ymm14,ymm12
  ; Carry out of the old top scalar limb is folded into the new limb 0.
  417. shr r12,29
  418. vmovdqu ymm10,YMMWORD[((288-16-128))+r13]
  419. add rax,r12
  420. vpaddq ymm7,ymm7,ymm14
  421. vpmuludq ymm11,ymm11,ymm12
  ; First multiplier of the next trip, from the new limb 0 (saved in r9).
  422. mov r9,rax
  423. imul eax,ecx
  424. vpaddq ymm8,ymm8,ymm11
  425. vpmuludq ymm10,ymm10,ymm12
  426. and eax,0x1fffffff
  427. vmovd xmm12,eax
  428. vmovdqu ymm11,YMMWORD[((96-24-128))+r13]
  429. DB 0x67
  430. vpaddq ymm9,ymm9,ymm10
  431. vpbroadcastq ymm12,xmm12
  ; Vector part for multiplier ymm0 (table read 24 bytes lower). The results
  ; are written one register down (ymm1 = ymm2 + ..., ..., ymm8 = ymm9 + ...),
  ; which moves the accumulator window by one vector per trip.
  432. vpmuludq ymm14,ymm0,YMMWORD[((64-24-128))+r13]
  433. vmovdqu ymm10,YMMWORD[((128-24-128))+r13]
  434. mov rdx,rax
  435. imul rax,QWORD[((-128))+r13]
  ; r10/r11 are refilled from lanes 1 and 2 of the vector spilled to [rsp].
  436. mov r10,QWORD[8+rsp]
  437. vpaddq ymm1,ymm2,ymm14
  438. vpmuludq ymm11,ymm11,ymm0
  439. vmovdqu ymm14,YMMWORD[((160-24-128))+r13]
  440. add r9,rax
  441. mov rax,rdx
  442. imul rax,QWORD[((8-128))+r13]
  443. DB 0x67
  444. shr r9,29
  445. mov r11,QWORD[16+rsp]
  446. vpaddq ymm2,ymm3,ymm11
  447. vpmuludq ymm10,ymm10,ymm0
  448. vmovdqu ymm11,YMMWORD[((192-24-128))+r13]
  449. add r10,rax
  450. mov rax,rdx
  451. imul rax,QWORD[((16-128))+r13]
  452. vpaddq ymm3,ymm4,ymm10
  453. vpmuludq ymm14,ymm14,ymm0
  454. vmovdqu ymm10,YMMWORD[((224-24-128))+r13]
  455. imul rdx,QWORD[((24-128))+r13]
  456. add r11,rax
  ; rax = limb 1 plus the carry from limb 0; it feeds the next multiplier.
  457. lea rax,[r10*1+r9]
  458. vpaddq ymm4,ymm5,ymm14
  459. vpmuludq ymm11,ymm11,ymm0
  460. vmovdqu ymm14,YMMWORD[((256-24-128))+r13]
  461. mov r10,rax
  462. imul eax,ecx
  463. vpmuludq ymm10,ymm10,ymm0
  464. vpaddq ymm5,ymm6,ymm11
  465. vmovdqu ymm11,YMMWORD[((288-24-128))+r13]
  466. and eax,0x1fffffff
  467. vpaddq ymm6,ymm7,ymm10
  468. vpmuludq ymm14,ymm14,ymm0
  ; Lane 3 of the spilled vector joins the fourth scalar product (new r12).
  469. add rdx,QWORD[24+rsp]
  470. vpaddq ymm7,ymm8,ymm14
  471. vpmuludq ymm11,ymm11,ymm0
  472. vpaddq ymm8,ymm9,ymm11
  ; xmm9 = the shifted carry still in r12 (upper lanes cleared); it is only
  ; consumed after the final trip, where it is added to ymm13.
  473. vmovq xmm9,r12
  474. mov r12,rdx
  475. dec r14d
  476. jnz NEAR $L$OOP_REDUCE_1024
  ; After the reduction loop: combine the reduced value with the upper half of
  ; the square, propagate carries between 29-bit limbs, store the nine result
  ; vectors through rdi, then either start another outer pass or fall to exit.
  ; r12 -> rsp+448 again, so (X-448)+r12 addresses rsp+X.
  477. lea r12,[448+rsp]
  ; ymm0 = lowest surviving vector (ymm13) plus the pending scalar carry (ymm9).
  478. vpaddq ymm0,ymm13,ymm9
  479. vpxor ymm9,ymm9,ymm9
  ; Add the upper accumulators left at rsp+288 .. rsp+544 by the squaring phase.
  480. vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx]
  481. vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12]
  482. vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12]
  483. vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12]
  484. vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12]
  485. vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12]
  486. vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12]
  487. vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12]
  488. vpaddq ymm8,ymm8,YMMWORD[((544-448))+r12]
  ; Carry round 1 over ymm0..ymm3: carry = lane >> 29, lane &= ymm15, carries
  ; are rotated up one lane (vpermq 0x93) and blended so each carry is added to
  ; the next-higher limb; the carry out of ymm3's top lane goes to ymm4 lane 0.
  489. vpsrlq ymm14,ymm0,29
  490. vpand ymm0,ymm0,ymm15
  491. vpsrlq ymm11,ymm1,29
  492. vpand ymm1,ymm1,ymm15
  493. vpsrlq ymm12,ymm2,29
  494. vpermq ymm14,ymm14,0x93
  495. vpand ymm2,ymm2,ymm15
  496. vpsrlq ymm13,ymm3,29
  497. vpermq ymm11,ymm11,0x93
  498. vpand ymm3,ymm3,ymm15
  499. vpermq ymm12,ymm12,0x93
  500. vpblendd ymm10,ymm14,ymm9,3
  501. vpermq ymm13,ymm13,0x93
  502. vpblendd ymm14,ymm11,ymm14,3
  503. vpaddq ymm0,ymm0,ymm10
  504. vpblendd ymm11,ymm12,ymm11,3
  505. vpaddq ymm1,ymm1,ymm14
  506. vpblendd ymm12,ymm13,ymm12,3
  507. vpaddq ymm2,ymm2,ymm11
  508. vpblendd ymm13,ymm9,ymm13,3
  509. vpaddq ymm3,ymm3,ymm12
  510. vpaddq ymm4,ymm4,ymm13
  ; Carry round 2 over ymm0..ymm3 (same pattern), interleaved with the stores
  ; of result vectors 0..3 to the destination.
  511. vpsrlq ymm14,ymm0,29
  512. vpand ymm0,ymm0,ymm15
  513. vpsrlq ymm11,ymm1,29
  514. vpand ymm1,ymm1,ymm15
  515. vpsrlq ymm12,ymm2,29
  516. vpermq ymm14,ymm14,0x93
  517. vpand ymm2,ymm2,ymm15
  518. vpsrlq ymm13,ymm3,29
  519. vpermq ymm11,ymm11,0x93
  520. vpand ymm3,ymm3,ymm15
  521. vpermq ymm12,ymm12,0x93
  522. vpblendd ymm10,ymm14,ymm9,3
  523. vpermq ymm13,ymm13,0x93
  524. vpblendd ymm14,ymm11,ymm14,3
  525. vpaddq ymm0,ymm0,ymm10
  526. vpblendd ymm11,ymm12,ymm11,3
  527. vpaddq ymm1,ymm1,ymm14
  528. vmovdqu YMMWORD[(0-128)+rdi],ymm0
  529. vpblendd ymm12,ymm13,ymm12,3
  530. vpaddq ymm2,ymm2,ymm11
  531. vmovdqu YMMWORD[(32-128)+rdi],ymm1
  532. vpblendd ymm13,ymm9,ymm13,3
  533. vpaddq ymm3,ymm3,ymm12
  534. vmovdqu YMMWORD[(64-128)+rdi],ymm2
  535. vpaddq ymm4,ymm4,ymm13
  536. vmovdqu YMMWORD[(96-128)+rdi],ymm3
  ; Carry round 1 over ymm4..ymm8. ymm0 is reused for ymm8's carries; the
  ; final blend takes lane 0 from ymm13, so the carry out of ymm8's top lane
  ; (rotated into ymm0 lane 0) is not propagated any further.
  537. vpsrlq ymm14,ymm4,29
  538. vpand ymm4,ymm4,ymm15
  539. vpsrlq ymm11,ymm5,29
  540. vpand ymm5,ymm5,ymm15
  541. vpsrlq ymm12,ymm6,29
  542. vpermq ymm14,ymm14,0x93
  543. vpand ymm6,ymm6,ymm15
  544. vpsrlq ymm13,ymm7,29
  545. vpermq ymm11,ymm11,0x93
  546. vpand ymm7,ymm7,ymm15
  547. vpsrlq ymm0,ymm8,29
  548. vpermq ymm12,ymm12,0x93
  549. vpand ymm8,ymm8,ymm15
  550. vpermq ymm13,ymm13,0x93
  551. vpblendd ymm10,ymm14,ymm9,3
  552. vpermq ymm0,ymm0,0x93
  553. vpblendd ymm14,ymm11,ymm14,3
  554. vpaddq ymm4,ymm4,ymm10
  555. vpblendd ymm11,ymm12,ymm11,3
  556. vpaddq ymm5,ymm5,ymm14
  557. vpblendd ymm12,ymm13,ymm12,3
  558. vpaddq ymm6,ymm6,ymm11
  559. vpblendd ymm13,ymm0,ymm13,3
  560. vpaddq ymm7,ymm7,ymm12
  561. vpaddq ymm8,ymm8,ymm13
  ; Carry round 2 over ymm4..ymm8, interleaved with the stores of result
  ; vectors 4..8.
  562. vpsrlq ymm14,ymm4,29
  563. vpand ymm4,ymm4,ymm15
  564. vpsrlq ymm11,ymm5,29
  565. vpand ymm5,ymm5,ymm15
  566. vpsrlq ymm12,ymm6,29
  567. vpermq ymm14,ymm14,0x93
  568. vpand ymm6,ymm6,ymm15
  569. vpsrlq ymm13,ymm7,29
  570. vpermq ymm11,ymm11,0x93
  571. vpand ymm7,ymm7,ymm15
  572. vpsrlq ymm0,ymm8,29
  573. vpermq ymm12,ymm12,0x93
  574. vpand ymm8,ymm8,ymm15
  575. vpermq ymm13,ymm13,0x93
  576. vpblendd ymm10,ymm14,ymm9,3
  577. vpermq ymm0,ymm0,0x93
  578. vpblendd ymm14,ymm11,ymm14,3
  579. vpaddq ymm4,ymm4,ymm10
  580. vpblendd ymm11,ymm12,ymm11,3
  581. vpaddq ymm5,ymm5,ymm14
  582. vmovdqu YMMWORD[(128-128)+rdi],ymm4
  583. vpblendd ymm12,ymm13,ymm12,3
  584. vpaddq ymm6,ymm6,ymm11
  585. vmovdqu YMMWORD[(160-128)+rdi],ymm5
  586. vpblendd ymm13,ymm0,ymm13,3
  587. vpaddq ymm7,ymm7,ymm12
  588. vmovdqu YMMWORD[(192-128)+rdi],ymm6
  589. vpaddq ymm8,ymm8,ymm13
  590. vmovdqu YMMWORD[(224-128)+rdi],ymm7
  591. vmovdqu YMMWORD[(256-128)+rdi],ymm8
  ; The next outer pass squares the value just written: source := destination.
  ; ymm1..ymm8 already hold result vectors 1..8, as the loop head expects.
  592. mov rsi,rdi
  593. dec r8d
  594. jne NEAR $L$OOP_GRANDE_SQR_1024
  ; Done: clear every ymm register, then recover the entry-time rsp into rax.
  595. vzeroall
  596. mov rax,rbp
  ; Exit path. rax holds the entry-time rsp, so all saved registers are found at
  ; the same rax-relative offsets at which the prologue stored them.
  597. $L$sqr_1024_in_tail:
  ; Restore the Win64 callee-saved vector registers xmm6..xmm15.
  598. movaps xmm6,XMMWORD[((-216))+rax]
  599. movaps xmm7,XMMWORD[((-200))+rax]
  600. movaps xmm8,XMMWORD[((-184))+rax]
  601. movaps xmm9,XMMWORD[((-168))+rax]
  602. movaps xmm10,XMMWORD[((-152))+rax]
  603. movaps xmm11,XMMWORD[((-136))+rax]
  604. movaps xmm12,XMMWORD[((-120))+rax]
  605. movaps xmm13,XMMWORD[((-104))+rax]
  606. movaps xmm14,XMMWORD[((-88))+rax]
  607. movaps xmm15,XMMWORD[((-72))+rax]
  ; Reload the six pushed GPRs by address instead of popping them, because rsp
  ; was realigned and no longer sits directly below the pushes.
  608. mov r15,QWORD[((-48))+rax]
  609. mov r14,QWORD[((-40))+rax]
  610. mov r13,QWORD[((-32))+rax]
  611. mov r12,QWORD[((-24))+rax]
  612. mov rbp,QWORD[((-16))+rax]
  613. mov rbx,QWORD[((-8))+rax]
  ; Drop the whole frame in one step: rsp = entry-time rsp.
  614. lea rsp,[rax]
  615. $L$sqr_1024_epilogue:
  ; Reload rdi/rsi from the slots the prologue stored them in.
  616. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  617. mov rsi,QWORD[16+rsp]
  ; Return, emitted as the raw two-byte sequence F3 C3 ("rep ret").
  618. DB 0F3h,0C3h ;repret
  619. $L$SEH_end_rsaz_1024_sqr_avx2:
  620. global rsaz_1024_mul_avx2
  621. ALIGN 64
  622. rsaz_1024_mul_avx2:
  623. mov QWORD[8+rsp],rdi ;WIN64 prologue
  624. mov QWORD[16+rsp],rsi
  625. mov rax,rsp
  626. $L$SEH_begin_rsaz_1024_mul_avx2:
  627. mov rdi,rcx
  628. mov rsi,rdx
  629. mov rdx,r8
  630. mov rcx,r9
  631. mov r8,QWORD[40+rsp]
  632. lea rax,[rsp]
  633. push rbx
  634. push rbp
  635. push r12
  636. push r13
  637. push r14
  638. push r15
  639. vzeroupper
  640. lea rsp,[((-168))+rsp]
  641. vmovaps XMMWORD[(-216)+rax],xmm6
  642. vmovaps XMMWORD[(-200)+rax],xmm7
  643. vmovaps XMMWORD[(-184)+rax],xmm8
  644. vmovaps XMMWORD[(-168)+rax],xmm9
  645. vmovaps XMMWORD[(-152)+rax],xmm10
  646. vmovaps XMMWORD[(-136)+rax],xmm11
  647. vmovaps XMMWORD[(-120)+rax],xmm12
  648. vmovaps XMMWORD[(-104)+rax],xmm13
  649. vmovaps XMMWORD[(-88)+rax],xmm14
  650. vmovaps XMMWORD[(-72)+rax],xmm15
  651. $L$mul_1024_body:
  652. mov rbp,rax
  653. vzeroall
  654. mov r13,rdx
  655. sub rsp,64
  656. DB 0x67,0x67
  657. mov r15,rsi
  658. and r15,4095
  659. add r15,32*10
  660. shr r15,12
  661. mov r15,rsi
  662. cmovnz rsi,r13
  663. cmovnz r13,r15
  664. mov r15,rcx
  665. sub rsi,-128
  666. sub rcx,-128
  667. sub rdi,-128
  668. and r15,4095
  669. add r15,32*10
  670. DB 0x67,0x67
  671. shr r15,12
  672. jz NEAR $L$mul_1024_no_n_copy
  673. sub rsp,32*10
  674. vmovdqu ymm0,YMMWORD[((0-128))+rcx]
  675. and rsp,-512
  676. vmovdqu ymm1,YMMWORD[((32-128))+rcx]
  677. vmovdqu ymm2,YMMWORD[((64-128))+rcx]
  678. vmovdqu ymm3,YMMWORD[((96-128))+rcx]
  679. vmovdqu ymm4,YMMWORD[((128-128))+rcx]
  680. vmovdqu ymm5,YMMWORD[((160-128))+rcx]
  681. vmovdqu ymm6,YMMWORD[((192-128))+rcx]
  682. vmovdqu ymm7,YMMWORD[((224-128))+rcx]
  683. vmovdqu ymm8,YMMWORD[((256-128))+rcx]
  684. lea rcx,[((64+128))+rsp]
  685. vmovdqu YMMWORD[(0-128)+rcx],ymm0
  686. vpxor ymm0,ymm0,ymm0
  687. vmovdqu YMMWORD[(32-128)+rcx],ymm1
  688. vpxor ymm1,ymm1,ymm1
  689. vmovdqu YMMWORD[(64-128)+rcx],ymm2
  690. vpxor ymm2,ymm2,ymm2
  691. vmovdqu YMMWORD[(96-128)+rcx],ymm3
  692. vpxor ymm3,ymm3,ymm3
  693. vmovdqu YMMWORD[(128-128)+rcx],ymm4
  694. vpxor ymm4,ymm4,ymm4
  695. vmovdqu YMMWORD[(160-128)+rcx],ymm5
  696. vpxor ymm5,ymm5,ymm5
  697. vmovdqu YMMWORD[(192-128)+rcx],ymm6
  698. vpxor ymm6,ymm6,ymm6
  699. vmovdqu YMMWORD[(224-128)+rcx],ymm7
  700. vpxor ymm7,ymm7,ymm7
  701. vmovdqu YMMWORD[(256-128)+rcx],ymm8
  702. vmovdqa ymm8,ymm0
  703. vmovdqu YMMWORD[(288-128)+rcx],ymm9
  704. $L$mul_1024_no_n_copy:
  705. and rsp,-64
  706. mov rbx,QWORD[r13]
  707. vpbroadcastq ymm10,QWORD[r13]
  708. vmovdqu YMMWORD[rsp],ymm0
  709. xor r9,r9
  710. DB 0x67
  711. xor r10,r10
  712. xor r11,r11
  713. xor r12,r12
  714. vmovdqu ymm15,YMMWORD[$L$and_mask]
  715. mov r14d,9
  716. vmovdqu YMMWORD[(288-128)+rdi],ymm9
  717. jmp NEAR $L$oop_mul_1024
  718. ALIGN 32
  719. $L$oop_mul_1024:
  720. vpsrlq ymm9,ymm3,29
  721. mov rax,rbx
  722. imul rax,QWORD[((-128))+rsi]
  723. add rax,r9
  724. mov r10,rbx
  725. imul r10,QWORD[((8-128))+rsi]
  726. add r10,QWORD[8+rsp]
  727. mov r9,rax
  728. imul eax,r8d
  729. and eax,0x1fffffff
  730. mov r11,rbx
  731. imul r11,QWORD[((16-128))+rsi]
  732. add r11,QWORD[16+rsp]
  733. mov r12,rbx
  734. imul r12,QWORD[((24-128))+rsi]
  735. add r12,QWORD[24+rsp]
  736. vpmuludq ymm0,ymm10,YMMWORD[((32-128))+rsi]
  737. vmovd xmm11,eax
  738. vpaddq ymm1,ymm1,ymm0
  739. vpmuludq ymm12,ymm10,YMMWORD[((64-128))+rsi]
  740. vpbroadcastq ymm11,xmm11
  741. vpaddq ymm2,ymm2,ymm12
  742. vpmuludq ymm13,ymm10,YMMWORD[((96-128))+rsi]
  743. vpand ymm3,ymm3,ymm15
  744. vpaddq ymm3,ymm3,ymm13
  745. vpmuludq ymm0,ymm10,YMMWORD[((128-128))+rsi]
  746. vpaddq ymm4,ymm4,ymm0
  747. vpmuludq ymm12,ymm10,YMMWORD[((160-128))+rsi]
  748. vpaddq ymm5,ymm5,ymm12
  749. vpmuludq ymm13,ymm10,YMMWORD[((192-128))+rsi]
  750. vpaddq ymm6,ymm6,ymm13
  751. vpmuludq ymm0,ymm10,YMMWORD[((224-128))+rsi]
  752. vpermq ymm9,ymm9,0x93
  753. vpaddq ymm7,ymm7,ymm0
  754. vpmuludq ymm12,ymm10,YMMWORD[((256-128))+rsi]
  755. vpbroadcastq ymm10,QWORD[8+r13]
  756. vpaddq ymm8,ymm8,ymm12
  757. mov rdx,rax
  758. imul rax,QWORD[((-128))+rcx]
  759. add r9,rax
  760. mov rax,rdx
  761. imul rax,QWORD[((8-128))+rcx]
  762. add r10,rax
  763. mov rax,rdx
  764. imul rax,QWORD[((16-128))+rcx]
  765. add r11,rax
  766. shr r9,29
  767. imul rdx,QWORD[((24-128))+rcx]
  768. add r12,rdx
  769. add r10,r9
  770. vpmuludq ymm13,ymm11,YMMWORD[((32-128))+rcx]
  771. vmovq rbx,xmm10
  772. vpaddq ymm1,ymm1,ymm13
  773. vpmuludq ymm0,ymm11,YMMWORD[((64-128))+rcx]
  774. vpaddq ymm2,ymm2,ymm0
  775. vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rcx]
  776. vpaddq ymm3,ymm3,ymm12
  777. vpmuludq ymm13,ymm11,YMMWORD[((128-128))+rcx]
  778. vpaddq ymm4,ymm4,ymm13
  779. vpmuludq ymm0,ymm11,YMMWORD[((160-128))+rcx]
  780. vpaddq ymm5,ymm5,ymm0
  781. vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx]
  782. vpaddq ymm6,ymm6,ymm12
  783. vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx]
  784. vpblendd ymm12,ymm9,ymm14,3
  785. vpaddq ymm7,ymm7,ymm13
  786. vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx]
  787. vpaddq ymm3,ymm3,ymm12
  788. vpaddq ymm8,ymm8,ymm0
  789. mov rax,rbx
  790. imul rax,QWORD[((-128))+rsi]
  791. add r10,rax
  792. vmovdqu ymm12,YMMWORD[((-8+32-128))+rsi]
  793. mov rax,rbx
  794. imul rax,QWORD[((8-128))+rsi]
  795. add r11,rax
  796. vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi]
  797. mov rax,r10
  798. vpblendd ymm9,ymm9,ymm14,0xfc
  799. imul eax,r8d
  800. vpaddq ymm4,ymm4,ymm9
  801. and eax,0x1fffffff
  802. imul rbx,QWORD[((16-128))+rsi]
  803. add r12,rbx
  804. vpmuludq ymm12,ymm12,ymm10
  805. vmovd xmm11,eax
  806. vmovdqu ymm0,YMMWORD[((-8+96-128))+rsi]
  807. vpaddq ymm1,ymm1,ymm12
  808. vpmuludq ymm13,ymm13,ymm10
  809. vpbroadcastq ymm11,xmm11
  810. vmovdqu ymm12,YMMWORD[((-8+128-128))+rsi]
  811. vpaddq ymm2,ymm2,ymm13
  812. vpmuludq ymm0,ymm0,ymm10
  813. vmovdqu ymm13,YMMWORD[((-8+160-128))+rsi]
  814. vpaddq ymm3,ymm3,ymm0
  815. vpmuludq ymm12,ymm12,ymm10
  816. vmovdqu ymm0,YMMWORD[((-8+192-128))+rsi]
  817. vpaddq ymm4,ymm4,ymm12
  818. vpmuludq ymm13,ymm13,ymm10
  819. vmovdqu ymm12,YMMWORD[((-8+224-128))+rsi]
  820. vpaddq ymm5,ymm5,ymm13
  821. vpmuludq ymm0,ymm0,ymm10
  822. vmovdqu ymm13,YMMWORD[((-8+256-128))+rsi]
  823. vpaddq ymm6,ymm6,ymm0
  824. vpmuludq ymm12,ymm12,ymm10
  825. vmovdqu ymm9,YMMWORD[((-8+288-128))+rsi]
  826. vpaddq ymm7,ymm7,ymm12
  827. vpmuludq ymm13,ymm13,ymm10
  828. vpaddq ymm8,ymm8,ymm13
  829. vpmuludq ymm9,ymm9,ymm10
  830. vpbroadcastq ymm10,QWORD[16+r13]
  831. mov rdx,rax
  832. imul rax,QWORD[((-128))+rcx]
  833. add r10,rax
  834. vmovdqu ymm0,YMMWORD[((-8+32-128))+rcx]
  835. mov rax,rdx
  836. imul rax,QWORD[((8-128))+rcx]
  837. add r11,rax
  838. vmovdqu ymm12,YMMWORD[((-8+64-128))+rcx]
  839. shr r10,29
  840. imul rdx,QWORD[((16-128))+rcx]
  841. add r12,rdx
  842. add r11,r10
  843. vpmuludq ymm0,ymm0,ymm11
  844. vmovq rbx,xmm10
  845. vmovdqu ymm13,YMMWORD[((-8+96-128))+rcx]
  846. vpaddq ymm1,ymm1,ymm0
  847. vpmuludq ymm12,ymm12,ymm11
  848. vmovdqu ymm0,YMMWORD[((-8+128-128))+rcx]
  849. vpaddq ymm2,ymm2,ymm12
  850. vpmuludq ymm13,ymm13,ymm11
  851. vmovdqu ymm12,YMMWORD[((-8+160-128))+rcx]
  852. vpaddq ymm3,ymm3,ymm13
  853. vpmuludq ymm0,ymm0,ymm11
  854. vmovdqu ymm13,YMMWORD[((-8+192-128))+rcx]
  855. vpaddq ymm4,ymm4,ymm0
  856. vpmuludq ymm12,ymm12,ymm11
  857. vmovdqu ymm0,YMMWORD[((-8+224-128))+rcx]
  858. vpaddq ymm5,ymm5,ymm12
  859. vpmuludq ymm13,ymm13,ymm11
  860. vmovdqu ymm12,YMMWORD[((-8+256-128))+rcx]
  861. vpaddq ymm6,ymm6,ymm13
  862. vpmuludq ymm0,ymm0,ymm11
  863. vmovdqu ymm13,YMMWORD[((-8+288-128))+rcx]
  864. vpaddq ymm7,ymm7,ymm0
  865. vpmuludq ymm12,ymm12,ymm11
  866. vpaddq ymm8,ymm8,ymm12
  867. vpmuludq ymm13,ymm13,ymm11
  868. vpaddq ymm9,ymm9,ymm13
  869. vmovdqu ymm0,YMMWORD[((-16+32-128))+rsi]
  870. mov rax,rbx
  871. imul rax,QWORD[((-128))+rsi]
  872. add rax,r11
  873. vmovdqu ymm12,YMMWORD[((-16+64-128))+rsi]
  874. mov r11,rax
  875. imul eax,r8d
  876. and eax,0x1fffffff
  877. imul rbx,QWORD[((8-128))+rsi]
  878. add r12,rbx
  879. vpmuludq ymm0,ymm0,ymm10
  880. vmovd xmm11,eax
  881. vmovdqu ymm13,YMMWORD[((-16+96-128))+rsi]
  882. vpaddq ymm1,ymm1,ymm0
  883. vpmuludq ymm12,ymm12,ymm10
  884. vpbroadcastq ymm11,xmm11
  885. vmovdqu ymm0,YMMWORD[((-16+128-128))+rsi]
  886. vpaddq ymm2,ymm2,ymm12
  887. vpmuludq ymm13,ymm13,ymm10
  888. vmovdqu ymm12,YMMWORD[((-16+160-128))+rsi]
  889. vpaddq ymm3,ymm3,ymm13
  890. vpmuludq ymm0,ymm0,ymm10
  891. vmovdqu ymm13,YMMWORD[((-16+192-128))+rsi]
  892. vpaddq ymm4,ymm4,ymm0
  893. vpmuludq ymm12,ymm12,ymm10
  894. vmovdqu ymm0,YMMWORD[((-16+224-128))+rsi]
  895. vpaddq ymm5,ymm5,ymm12
  896. vpmuludq ymm13,ymm13,ymm10
  897. vmovdqu ymm12,YMMWORD[((-16+256-128))+rsi]
  898. vpaddq ymm6,ymm6,ymm13
  899. vpmuludq ymm0,ymm0,ymm10
  900. vmovdqu ymm13,YMMWORD[((-16+288-128))+rsi]
  901. vpaddq ymm7,ymm7,ymm0
  902. vpmuludq ymm12,ymm12,ymm10
  903. vpaddq ymm8,ymm8,ymm12
  904. vpmuludq ymm13,ymm13,ymm10
  905. vpbroadcastq ymm10,QWORD[24+r13]
  906. vpaddq ymm9,ymm9,ymm13
  907. vmovdqu ymm0,YMMWORD[((-16+32-128))+rcx]
  908. mov rdx,rax
  909. imul rax,QWORD[((-128))+rcx]
  910. add r11,rax
  911. vmovdqu ymm12,YMMWORD[((-16+64-128))+rcx]
  912. imul rdx,QWORD[((8-128))+rcx]
  913. add r12,rdx
  914. shr r11,29
  915. vpmuludq ymm0,ymm0,ymm11
  916. vmovq rbx,xmm10
  917. vmovdqu ymm13,YMMWORD[((-16+96-128))+rcx]
  918. vpaddq ymm1,ymm1,ymm0
  919. vpmuludq ymm12,ymm12,ymm11
  920. vmovdqu ymm0,YMMWORD[((-16+128-128))+rcx]
  921. vpaddq ymm2,ymm2,ymm12
  922. vpmuludq ymm13,ymm13,ymm11
  923. vmovdqu ymm12,YMMWORD[((-16+160-128))+rcx]
  924. vpaddq ymm3,ymm3,ymm13
  925. vpmuludq ymm0,ymm0,ymm11
  926. vmovdqu ymm13,YMMWORD[((-16+192-128))+rcx]
  927. vpaddq ymm4,ymm4,ymm0
  928. vpmuludq ymm12,ymm12,ymm11
  929. vmovdqu ymm0,YMMWORD[((-16+224-128))+rcx]
  930. vpaddq ymm5,ymm5,ymm12
  931. vpmuludq ymm13,ymm13,ymm11
  932. vmovdqu ymm12,YMMWORD[((-16+256-128))+rcx]
  933. vpaddq ymm6,ymm6,ymm13
  934. vpmuludq ymm0,ymm0,ymm11
  935. vmovdqu ymm13,YMMWORD[((-16+288-128))+rcx]
  936. vpaddq ymm7,ymm7,ymm0
  937. vpmuludq ymm12,ymm12,ymm11
  938. vmovdqu ymm0,YMMWORD[((-24+32-128))+rsi]
  939. vpaddq ymm8,ymm8,ymm12
  940. vpmuludq ymm13,ymm13,ymm11
  941. vmovdqu ymm12,YMMWORD[((-24+64-128))+rsi]
  942. vpaddq ymm9,ymm9,ymm13
  943. add r12,r11
  944. imul rbx,QWORD[((-128))+rsi]
  945. add r12,rbx
  946. mov rax,r12
  947. imul eax,r8d
  948. and eax,0x1fffffff
  949. vpmuludq ymm0,ymm0,ymm10
  950. vmovd xmm11,eax
  951. vmovdqu ymm13,YMMWORD[((-24+96-128))+rsi]
  952. vpaddq ymm1,ymm1,ymm0
  953. vpmuludq ymm12,ymm12,ymm10
  954. vpbroadcastq ymm11,xmm11
  955. vmovdqu ymm0,YMMWORD[((-24+128-128))+rsi]
  956. vpaddq ymm2,ymm2,ymm12
  957. vpmuludq ymm13,ymm13,ymm10
  958. vmovdqu ymm12,YMMWORD[((-24+160-128))+rsi]
  959. vpaddq ymm3,ymm3,ymm13
  960. vpmuludq ymm0,ymm0,ymm10
  961. vmovdqu ymm13,YMMWORD[((-24+192-128))+rsi]
  962. vpaddq ymm4,ymm4,ymm0
  963. vpmuludq ymm12,ymm12,ymm10
  964. vmovdqu ymm0,YMMWORD[((-24+224-128))+rsi]
  965. vpaddq ymm5,ymm5,ymm12
  966. vpmuludq ymm13,ymm13,ymm10
  967. vmovdqu ymm12,YMMWORD[((-24+256-128))+rsi]
  968. vpaddq ymm6,ymm6,ymm13
  969. vpmuludq ymm0,ymm0,ymm10
  970. vmovdqu ymm13,YMMWORD[((-24+288-128))+rsi]
  971. vpaddq ymm7,ymm7,ymm0
  972. vpmuludq ymm12,ymm12,ymm10
  973. vpaddq ymm8,ymm8,ymm12
  974. vpmuludq ymm13,ymm13,ymm10
  975. vpbroadcastq ymm10,QWORD[32+r13]
  976. vpaddq ymm9,ymm9,ymm13
  977. add r13,32
  978. vmovdqu ymm0,YMMWORD[((-24+32-128))+rcx]
  979. imul rax,QWORD[((-128))+rcx]
  980. add r12,rax
  981. shr r12,29
  982. vmovdqu ymm12,YMMWORD[((-24+64-128))+rcx]
  983. vpmuludq ymm0,ymm0,ymm11
  984. vmovq rbx,xmm10
  985. vmovdqu ymm13,YMMWORD[((-24+96-128))+rcx]
  986. vpaddq ymm0,ymm1,ymm0
  987. vpmuludq ymm12,ymm12,ymm11
  988. vmovdqu YMMWORD[rsp],ymm0
  989. vpaddq ymm1,ymm2,ymm12
  990. vmovdqu ymm0,YMMWORD[((-24+128-128))+rcx]
  991. vpmuludq ymm13,ymm13,ymm11
  992. vmovdqu ymm12,YMMWORD[((-24+160-128))+rcx]
  993. vpaddq ymm2,ymm3,ymm13
  994. vpmuludq ymm0,ymm0,ymm11
  995. vmovdqu ymm13,YMMWORD[((-24+192-128))+rcx]
  996. vpaddq ymm3,ymm4,ymm0
  997. vpmuludq ymm12,ymm12,ymm11
  998. vmovdqu ymm0,YMMWORD[((-24+224-128))+rcx]
  999. vpaddq ymm4,ymm5,ymm12
  1000. vpmuludq ymm13,ymm13,ymm11
  1001. vmovdqu ymm12,YMMWORD[((-24+256-128))+rcx]
  1002. vpaddq ymm5,ymm6,ymm13
  1003. vpmuludq ymm0,ymm0,ymm11
  1004. vmovdqu ymm13,YMMWORD[((-24+288-128))+rcx]
  1005. mov r9,r12
  1006. vpaddq ymm6,ymm7,ymm0
  1007. vpmuludq ymm12,ymm12,ymm11
  1008. add r9,QWORD[rsp]
  1009. vpaddq ymm7,ymm8,ymm12
  1010. vpmuludq ymm13,ymm13,ymm11
  1011. vmovq xmm12,r12
  1012. vpaddq ymm8,ymm9,ymm13
  1013. dec r14d
  1014. jnz NEAR $L$oop_mul_1024
  1015. vpaddq ymm0,ymm12,YMMWORD[rsp]
  1016. vpsrlq ymm12,ymm0,29
  1017. vpand ymm0,ymm0,ymm15
  1018. vpsrlq ymm13,ymm1,29
  1019. vpand ymm1,ymm1,ymm15
  1020. vpsrlq ymm10,ymm2,29
  1021. vpermq ymm12,ymm12,0x93
  1022. vpand ymm2,ymm2,ymm15
  1023. vpsrlq ymm11,ymm3,29
  1024. vpermq ymm13,ymm13,0x93
  1025. vpand ymm3,ymm3,ymm15
  1026. vpblendd ymm9,ymm12,ymm14,3
  1027. vpermq ymm10,ymm10,0x93
  1028. vpblendd ymm12,ymm13,ymm12,3
  1029. vpermq ymm11,ymm11,0x93
  1030. vpaddq ymm0,ymm0,ymm9
  1031. vpblendd ymm13,ymm10,ymm13,3
  1032. vpaddq ymm1,ymm1,ymm12
  1033. vpblendd ymm10,ymm11,ymm10,3
  1034. vpaddq ymm2,ymm2,ymm13
  1035. vpblendd ymm11,ymm14,ymm11,3
  1036. vpaddq ymm3,ymm3,ymm10
  1037. vpaddq ymm4,ymm4,ymm11
  1038. vpsrlq ymm12,ymm0,29
  1039. vpand ymm0,ymm0,ymm15
  1040. vpsrlq ymm13,ymm1,29
  1041. vpand ymm1,ymm1,ymm15
  1042. vpsrlq ymm10,ymm2,29
  1043. vpermq ymm12,ymm12,0x93
  1044. vpand ymm2,ymm2,ymm15
  1045. vpsrlq ymm11,ymm3,29
  1046. vpermq ymm13,ymm13,0x93
  1047. vpand ymm3,ymm3,ymm15
  1048. vpermq ymm10,ymm10,0x93
  1049. vpblendd ymm9,ymm12,ymm14,3
  1050. vpermq ymm11,ymm11,0x93
  1051. vpblendd ymm12,ymm13,ymm12,3
  1052. vpaddq ymm0,ymm0,ymm9
  1053. vpblendd ymm13,ymm10,ymm13,3
  1054. vpaddq ymm1,ymm1,ymm12
  1055. vpblendd ymm10,ymm11,ymm10,3
  1056. vpaddq ymm2,ymm2,ymm13
  1057. vpblendd ymm11,ymm14,ymm11,3
  1058. vpaddq ymm3,ymm3,ymm10
  1059. vpaddq ymm4,ymm4,ymm11
  1060. vmovdqu YMMWORD[(0-128)+rdi],ymm0
  1061. vmovdqu YMMWORD[(32-128)+rdi],ymm1
  1062. vmovdqu YMMWORD[(64-128)+rdi],ymm2
  1063. vmovdqu YMMWORD[(96-128)+rdi],ymm3
  1064. vpsrlq ymm12,ymm4,29
  1065. vpand ymm4,ymm4,ymm15
  1066. vpsrlq ymm13,ymm5,29
  1067. vpand ymm5,ymm5,ymm15
  1068. vpsrlq ymm10,ymm6,29
  1069. vpermq ymm12,ymm12,0x93
  1070. vpand ymm6,ymm6,ymm15
  1071. vpsrlq ymm11,ymm7,29
  1072. vpermq ymm13,ymm13,0x93
  1073. vpand ymm7,ymm7,ymm15
  1074. vpsrlq ymm0,ymm8,29
  1075. vpermq ymm10,ymm10,0x93
  1076. vpand ymm8,ymm8,ymm15
  1077. vpermq ymm11,ymm11,0x93
  1078. vpblendd ymm9,ymm12,ymm14,3
  1079. vpermq ymm0,ymm0,0x93
  1080. vpblendd ymm12,ymm13,ymm12,3
  1081. vpaddq ymm4,ymm4,ymm9
  1082. vpblendd ymm13,ymm10,ymm13,3
  1083. vpaddq ymm5,ymm5,ymm12
  1084. vpblendd ymm10,ymm11,ymm10,3
  1085. vpaddq ymm6,ymm6,ymm13
  1086. vpblendd ymm11,ymm0,ymm11,3
  1087. vpaddq ymm7,ymm7,ymm10
  1088. vpaddq ymm8,ymm8,ymm11
  1089. vpsrlq ymm12,ymm4,29
  1090. vpand ymm4,ymm4,ymm15
  1091. vpsrlq ymm13,ymm5,29
  1092. vpand ymm5,ymm5,ymm15
  1093. vpsrlq ymm10,ymm6,29
  1094. vpermq ymm12,ymm12,0x93
  1095. vpand ymm6,ymm6,ymm15
  1096. vpsrlq ymm11,ymm7,29
  1097. vpermq ymm13,ymm13,0x93
  1098. vpand ymm7,ymm7,ymm15
  1099. vpsrlq ymm0,ymm8,29
  1100. vpermq ymm10,ymm10,0x93
  1101. vpand ymm8,ymm8,ymm15
  1102. vpermq ymm11,ymm11,0x93
  1103. vpblendd ymm9,ymm12,ymm14,3
  1104. vpermq ymm0,ymm0,0x93
  1105. vpblendd ymm12,ymm13,ymm12,3
  1106. vpaddq ymm4,ymm4,ymm9
  1107. vpblendd ymm13,ymm10,ymm13,3
  1108. vpaddq ymm5,ymm5,ymm12
  1109. vpblendd ymm10,ymm11,ymm10,3
  1110. vpaddq ymm6,ymm6,ymm13
  1111. vpblendd ymm11,ymm0,ymm11,3
  1112. vpaddq ymm7,ymm7,ymm10
  1113. vpaddq ymm8,ymm8,ymm11
  1114. vmovdqu YMMWORD[(128-128)+rdi],ymm4
  1115. vmovdqu YMMWORD[(160-128)+rdi],ymm5
  1116. vmovdqu YMMWORD[(192-128)+rdi],ymm6
  1117. vmovdqu YMMWORD[(224-128)+rdi],ymm7
  1118. vmovdqu YMMWORD[(256-128)+rdi],ymm8
  1119. vzeroupper
  1120. mov rax,rbp
  1121. $L$mul_1024_in_tail:
  1122. movaps xmm6,XMMWORD[((-216))+rax]
  1123. movaps xmm7,XMMWORD[((-200))+rax]
  1124. movaps xmm8,XMMWORD[((-184))+rax]
  1125. movaps xmm9,XMMWORD[((-168))+rax]
  1126. movaps xmm10,XMMWORD[((-152))+rax]
  1127. movaps xmm11,XMMWORD[((-136))+rax]
  1128. movaps xmm12,XMMWORD[((-120))+rax]
  1129. movaps xmm13,XMMWORD[((-104))+rax]
  1130. movaps xmm14,XMMWORD[((-88))+rax]
  1131. movaps xmm15,XMMWORD[((-72))+rax]
  1132. mov r15,QWORD[((-48))+rax]
  1133. mov r14,QWORD[((-40))+rax]
  1134. mov r13,QWORD[((-32))+rax]
  1135. mov r12,QWORD[((-24))+rax]
  1136. mov rbp,QWORD[((-16))+rax]
  1137. mov rbx,QWORD[((-8))+rax]
  1138. lea rsp,[rax]
  1139. $L$mul_1024_epilogue:
  1140. mov rdi,QWORD[8+rsp] ;WIN64 epilogue
  1141. mov rsi,QWORD[16+rsp]
  1142. DB 0F3h,0C3h ;repret
  1143. $L$SEH_end_rsaz_1024_mul_avx2:
  ;-----------------------------------------------------------------------
  ; rsaz_1024_red2norm_avx2
  ; Packs a number held as 36 digits (one digit per 64-bit word, digit j
  ; weighted by 2^(29*j)) into 16 contiguous 64-bit words.
  ; ABI:   Win64 (first two integer arguments arrive in rcx, rdx)
  ; In:    rcx = output, 16 qwords written at [rcx+0 .. rcx+120]
  ;        rdx = input, 36 qwords read at [rdx+0 .. rdx+280]
  ; Out:   rax = whatever is left above bit 1023 after the last word
  ;        (d35>>9 plus the final carry)
  ; Clobb: r8, r9, r10, r11, rdx (advanced by 128), flags
  ;
  ; Pattern for every output word: rax enters holding the spill-over from
  ; the previous word; each digit is shifted to its bit position inside the
  ; current word and added; the last digit of the group straddles the word
  ; boundary, so its upper part is kept (shr) and receives the carry of the
  ; final add via adc.  Only the carry of the LAST add in a group is
  ; consumed, so the earlier adds of a group must not overflow
  ; NOTE(review): this presumably relies on the digits being (close to)
  ; 29 bits wide - confirm against the callers.
  ;-----------------------------------------------------------------------
  1144. global rsaz_1024_red2norm_avx2
  1145. ALIGN 32
  1146. rsaz_1024_red2norm_avx2:
  ; Bias the input pointer by +128 (written as sub -128); all following
  ; displacements are relative to in+128.
  1147. sub rdx,-128
  1148. xor rax,rax
  ; out[0] = d0 + (d1<<29) + (d2<<58); spill = d2>>6
  1149. mov r8,QWORD[((-128))+rdx]
  1150. mov r9,QWORD[((-120))+rdx]
  1151. mov r10,QWORD[((-112))+rdx]
  1152. shl r8,0
  1153. shl r9,29
  1154. mov r11,r10
  1155. shl r10,58
  1156. shr r11,6
  1157. add rax,r8
  1158. add rax,r9
  1159. add rax,r10
  1160. adc r11,0
  1161. mov QWORD[rcx],rax
  1162. mov rax,r11
  ; out[1] = spill + (d3<<23) + (d4<<52); spill = d4>>12
  1163. mov r8,QWORD[((-104))+rdx]
  1164. mov r9,QWORD[((-96))+rdx]
  1165. shl r8,23
  1166. mov r10,r9
  1167. shl r9,52
  1168. shr r10,12
  1169. add rax,r8
  1170. add rax,r9
  1171. adc r10,0
  1172. mov QWORD[8+rcx],rax
  1173. mov rax,r10
  ; out[2] = spill + (d5<<17) + (d6<<46); spill = d6>>18
  1174. mov r11,QWORD[((-88))+rdx]
  1175. mov r8,QWORD[((-80))+rdx]
  1176. shl r11,17
  1177. mov r9,r8
  1178. shl r8,46
  1179. shr r9,18
  1180. add rax,r11
  1181. add rax,r8
  1182. adc r9,0
  1183. mov QWORD[16+rcx],rax
  1184. mov rax,r9
  ; out[3] = spill + (d7<<11) + (d8<<40); spill = d8>>24
  1185. mov r10,QWORD[((-72))+rdx]
  1186. mov r11,QWORD[((-64))+rdx]
  1187. shl r10,11
  1188. mov r8,r11
  1189. shl r11,40
  1190. shr r8,24
  1191. add rax,r10
  1192. add rax,r11
  1193. adc r8,0
  1194. mov QWORD[24+rcx],rax
  1195. mov rax,r8
  ; out[4] = spill + (d9<<5) + (d10<<34) + (d11<<63); spill = d11>>1
  1196. mov r9,QWORD[((-56))+rdx]
  1197. mov r10,QWORD[((-48))+rdx]
  1198. mov r11,QWORD[((-40))+rdx]
  1199. shl r9,5
  1200. shl r10,34
  1201. mov r8,r11
  1202. shl r11,63
  1203. shr r8,1
  1204. add rax,r9
  1205. add rax,r10
  1206. add rax,r11
  1207. adc r8,0
  1208. mov QWORD[32+rcx],rax
  1209. mov rax,r8
  ; out[5] = spill + (d12<<28) + (d13<<57); spill = d13>>7
  1210. mov r9,QWORD[((-32))+rdx]
  1211. mov r10,QWORD[((-24))+rdx]
  1212. shl r9,28
  1213. mov r11,r10
  1214. shl r10,57
  1215. shr r11,7
  1216. add rax,r9
  1217. add rax,r10
  1218. adc r11,0
  1219. mov QWORD[40+rcx],rax
  1220. mov rax,r11
  ; out[6] = spill + (d14<<22) + (d15<<51); spill = d15>>13
  1221. mov r8,QWORD[((-16))+rdx]
  1222. mov r9,QWORD[((-8))+rdx]
  1223. shl r8,22
  1224. mov r10,r9
  1225. shl r9,51
  1226. shr r10,13
  1227. add rax,r8
  1228. add rax,r9
  1229. adc r10,0
  1230. mov QWORD[48+rcx],rax
  1231. mov rax,r10
  ; out[7] = spill + (d16<<16) + (d17<<45); spill = d17>>19
  1232. mov r11,QWORD[rdx]
  1233. mov r8,QWORD[8+rdx]
  1234. shl r11,16
  1235. mov r9,r8
  1236. shl r8,45
  1237. shr r9,19
  1238. add rax,r11
  1239. add rax,r8
  1240. adc r9,0
  1241. mov QWORD[56+rcx],rax
  1242. mov rax,r9
  ; out[8] = spill + (d18<<10) + (d19<<39); spill = d19>>25
  1243. mov r10,QWORD[16+rdx]
  1244. mov r11,QWORD[24+rdx]
  1245. shl r10,10
  1246. mov r8,r11
  1247. shl r11,39
  1248. shr r8,25
  1249. add rax,r10
  1250. add rax,r11
  1251. adc r8,0
  1252. mov QWORD[64+rcx],rax
  1253. mov rax,r8
  ; out[9] = spill + (d20<<4) + (d21<<33) + (d22<<62); spill = d22>>2
  1254. mov r9,QWORD[32+rdx]
  1255. mov r10,QWORD[40+rdx]
  1256. mov r11,QWORD[48+rdx]
  1257. shl r9,4
  1258. shl r10,33
  1259. mov r8,r11
  1260. shl r11,62
  1261. shr r8,2
  1262. add rax,r9
  1263. add rax,r10
  1264. add rax,r11
  1265. adc r8,0
  1266. mov QWORD[72+rcx],rax
  1267. mov rax,r8
  ; out[10] = spill + (d23<<27) + (d24<<56); spill = d24>>8
  1268. mov r9,QWORD[56+rdx]
  1269. mov r10,QWORD[64+rdx]
  1270. shl r9,27
  1271. mov r11,r10
  1272. shl r10,56
  1273. shr r11,8
  1274. add rax,r9
  1275. add rax,r10
  1276. adc r11,0
  1277. mov QWORD[80+rcx],rax
  1278. mov rax,r11
  ; out[11] = spill + (d25<<21) + (d26<<50); spill = d26>>14
  1279. mov r8,QWORD[72+rdx]
  1280. mov r9,QWORD[80+rdx]
  1281. shl r8,21
  1282. mov r10,r9
  1283. shl r9,50
  1284. shr r10,14
  1285. add rax,r8
  1286. add rax,r9
  1287. adc r10,0
  1288. mov QWORD[88+rcx],rax
  1289. mov rax,r10
  ; out[12] = spill + (d27<<15) + (d28<<44); spill = d28>>20
  1290. mov r11,QWORD[88+rdx]
  1291. mov r8,QWORD[96+rdx]
  1292. shl r11,15
  1293. mov r9,r8
  1294. shl r8,44
  1295. shr r9,20
  1296. add rax,r11
  1297. add rax,r8
  1298. adc r9,0
  1299. mov QWORD[96+rcx],rax
  1300. mov rax,r9
  ; out[13] = spill + (d29<<9) + (d30<<38); spill = d30>>26
  1301. mov r10,QWORD[104+rdx]
  1302. mov r11,QWORD[112+rdx]
  1303. shl r10,9
  1304. mov r8,r11
  1305. shl r11,38
  1306. shr r8,26
  1307. add rax,r10
  1308. add rax,r11
  1309. adc r8,0
  1310. mov QWORD[104+rcx],rax
  1311. mov rax,r8
  ; out[14] = spill + (d31<<3) + (d32<<32) + (d33<<61); spill = d33>>3
  1312. mov r9,QWORD[120+rdx]
  1313. mov r10,QWORD[128+rdx]
  1314. mov r11,QWORD[136+rdx]
  1315. shl r9,3
  1316. shl r10,32
  1317. mov r8,r11
  1318. shl r11,61
  1319. shr r8,3
  1320. add rax,r9
  1321. add rax,r10
  1322. add rax,r11
  1323. adc r8,0
  1324. mov QWORD[112+rcx],rax
  1325. mov rax,r8
  ; out[15] = spill + (d34<<26) + (d35<<55); remainder d35>>9 (+carry)
  ; is left in rax on return.
  1326. mov r9,QWORD[144+rdx]
  1327. mov r10,QWORD[152+rdx]
  1328. shl r9,26
  1329. mov r11,r10
  1330. shl r10,55
  1331. shr r11,9
  1332. add rax,r9
  1333. add rax,r10
  1334. adc r11,0
  1335. mov QWORD[120+rcx],rax
  1336. mov rax,r11
  1337. DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; rsaz_1024_norm2red_avx2
  ; Splits 16 contiguous 64-bit words into 36 digits of 29 bits each and
  ; stores every digit zero-extended in its own 64-bit word, followed by
  ; four zero qwords (40 qwords written in total).
  ; ABI:   Win64 (first two integer arguments arrive in rcx, rdx)
  ; In:    rcx = output, 40 qwords written at [rcx+0 .. rcx+312]
  ;        rdx = input, 16 qwords read at [rdx+0 .. rdx+120]
  ; Out:   nothing meaningful (eax still holds the 0x1fffffff mask)
  ; Clobb: rax, r8, r9, r10, r11, rcx (advanced by 128), flags
  ;
  ; Digit j is bits [29*j, 29*j+28] of the input.  A digit that lies inside
  ; one input word is extracted with shr+and; a digit that straddles two
  ; words is extracted with shrd (shift right pulling in bits from the next
  ; word) followed by the same mask.
  ;-----------------------------------------------------------------------
  1338. global rsaz_1024_norm2red_avx2
  1339. ALIGN 32
  1340. rsaz_1024_norm2red_avx2:
  ; Bias the output pointer by +128; displacements below are relative to
  ; out+128.
  1341. sub rcx,-128
  1342. mov r8,QWORD[rdx]
  ; rax = 2^29-1, the digit mask (32-bit write zero-extends into rax)
  1343. mov eax,0x1fffffff
  1344. mov r9,QWORD[8+rdx]
  ; input word 0 -> digits 0, 1 and (together with word 1) digit 2
  1345. mov r11,r8
  1346. shr r11,0
  1347. and r11,rax
  1348. mov QWORD[((-128))+rcx],r11
  1349. mov r10,r8
  1350. shr r10,29
  1351. and r10,rax
  1352. mov QWORD[((-120))+rcx],r10
  1353. shrd r8,r9,58
  1354. and r8,rax
  1355. mov QWORD[((-112))+rcx],r8
  ; input word 1 -> digit 3, and digit 4 (straddles into word 2)
  1356. mov r10,QWORD[16+rdx]
  1357. mov r8,r9
  1358. shr r8,23
  1359. and r8,rax
  1360. mov QWORD[((-104))+rcx],r8
  1361. shrd r9,r10,52
  1362. and r9,rax
  1363. mov QWORD[((-96))+rcx],r9
  ; input word 2 -> digit 5, and digit 6 (straddles into word 3)
  1364. mov r11,QWORD[24+rdx]
  1365. mov r9,r10
  1366. shr r9,17
  1367. and r9,rax
  1368. mov QWORD[((-88))+rcx],r9
  1369. shrd r10,r11,46
  1370. and r10,rax
  1371. mov QWORD[((-80))+rcx],r10
  ; input word 3 -> digit 7, and digit 8 (straddles into word 4)
  1372. mov r8,QWORD[32+rdx]
  1373. mov r10,r11
  1374. shr r10,11
  1375. and r10,rax
  1376. mov QWORD[((-72))+rcx],r10
  1377. shrd r11,r8,40
  1378. and r11,rax
  1379. mov QWORD[((-64))+rcx],r11
  ; input word 4 -> digits 9, 10, and digit 11 (straddles into word 5)
  1380. mov r9,QWORD[40+rdx]
  1381. mov r11,r8
  1382. shr r11,5
  1383. and r11,rax
  1384. mov QWORD[((-56))+rcx],r11
  1385. mov r10,r8
  1386. shr r10,34
  1387. and r10,rax
  1388. mov QWORD[((-48))+rcx],r10
  1389. shrd r8,r9,63
  1390. and r8,rax
  1391. mov QWORD[((-40))+rcx],r8
  ; input word 5 -> digit 12, and digit 13 (straddles into word 6)
  1392. mov r10,QWORD[48+rdx]
  1393. mov r8,r9
  1394. shr r8,28
  1395. and r8,rax
  1396. mov QWORD[((-32))+rcx],r8
  1397. shrd r9,r10,57
  1398. and r9,rax
  1399. mov QWORD[((-24))+rcx],r9
  ; input word 6 -> digit 14, and digit 15 (straddles into word 7)
  1400. mov r11,QWORD[56+rdx]
  1401. mov r9,r10
  1402. shr r9,22
  1403. and r9,rax
  1404. mov QWORD[((-16))+rcx],r9
  1405. shrd r10,r11,51
  1406. and r10,rax
  1407. mov QWORD[((-8))+rcx],r10
  ; input word 7 -> digit 16, and digit 17 (straddles into word 8)
  1408. mov r8,QWORD[64+rdx]
  1409. mov r10,r11
  1410. shr r10,16
  1411. and r10,rax
  1412. mov QWORD[rcx],r10
  1413. shrd r11,r8,45
  1414. and r11,rax
  1415. mov QWORD[8+rcx],r11
  ; input word 8 -> digit 18, and digit 19 (straddles into word 9)
  1416. mov r9,QWORD[72+rdx]
  1417. mov r11,r8
  1418. shr r11,10
  1419. and r11,rax
  1420. mov QWORD[16+rcx],r11
  1421. shrd r8,r9,39
  1422. and r8,rax
  1423. mov QWORD[24+rcx],r8
  ; input word 9 -> digits 20, 21, and digit 22 (straddles into word 10)
  1424. mov r10,QWORD[80+rdx]
  1425. mov r8,r9
  1426. shr r8,4
  1427. and r8,rax
  1428. mov QWORD[32+rcx],r8
  1429. mov r11,r9
  1430. shr r11,33
  1431. and r11,rax
  1432. mov QWORD[40+rcx],r11
  1433. shrd r9,r10,62
  1434. and r9,rax
  1435. mov QWORD[48+rcx],r9
  ; input word 10 -> digit 23, and digit 24 (straddles into word 11)
  1436. mov r11,QWORD[88+rdx]
  1437. mov r9,r10
  1438. shr r9,27
  1439. and r9,rax
  1440. mov QWORD[56+rcx],r9
  1441. shrd r10,r11,56
  1442. and r10,rax
  1443. mov QWORD[64+rcx],r10
  ; input word 11 -> digit 25, and digit 26 (straddles into word 12)
  1444. mov r8,QWORD[96+rdx]
  1445. mov r10,r11
  1446. shr r10,21
  1447. and r10,rax
  1448. mov QWORD[72+rcx],r10
  1449. shrd r11,r8,50
  1450. and r11,rax
  1451. mov QWORD[80+rcx],r11
  ; input word 12 -> digit 27, and digit 28 (straddles into word 13)
  1452. mov r9,QWORD[104+rdx]
  1453. mov r11,r8
  1454. shr r11,15
  1455. and r11,rax
  1456. mov QWORD[88+rcx],r11
  1457. shrd r8,r9,44
  1458. and r8,rax
  1459. mov QWORD[96+rcx],r8
  ; input word 13 -> digit 29, and digit 30 (straddles into word 14)
  1460. mov r10,QWORD[112+rdx]
  1461. mov r8,r9
  1462. shr r8,9
  1463. and r8,rax
  1464. mov QWORD[104+rcx],r8
  1465. shrd r9,r10,38
  1466. and r9,rax
  1467. mov QWORD[112+rcx],r9
  ; input word 14 -> digits 31, 32, and digit 33 (straddles into word 15)
  1468. mov r11,QWORD[120+rdx]
  1469. mov r9,r10
  1470. shr r9,3
  1471. and r9,rax
  1472. mov QWORD[120+rcx],r9
  1473. mov r8,r10
  1474. shr r8,32
  1475. and r8,rax
  1476. mov QWORD[128+rcx],r8
  1477. shrd r10,r11,61
  1478. and r10,rax
  1479. mov QWORD[136+rcx],r10
  ; input word 15 -> digit 34 and digit 35; r8 is zeroed first so the shrd
  ; shifts in zeros above bit 1023.
  1480. xor r8,r8
  1481. mov r10,r11
  1482. shr r10,26
  1483. and r10,rax
  1484. mov QWORD[144+rcx],r10
  1485. shrd r11,r8,55
  1486. and r11,rax
  1487. mov QWORD[152+rcx],r11
  ; Zero the four qwords following digit 35 (output qwords 36..39).
  1488. mov QWORD[160+rcx],r8
  1489. mov QWORD[168+rcx],r8
  1490. mov QWORD[176+rcx],r8
  1491. mov QWORD[184+rcx],r8
  1492. DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; rsaz_1024_scatter5_avx2
  ; Stores one value into slot r8d of a table.  The value is read as nine
  ; 32-byte groups (four qwords each); only the low dword of every qword is
  ; kept, so each group shrinks to 16 bytes.  Group i is written to
  ; table + 512*i + 16*index, i.e. each 512-byte table row holds one
  ; 16-byte group for each of 32 slots.
  ; ABI:   Win64
  ; In:    rcx = table base
  ;        rdx = value to store (9*32 = 288 bytes are read)
  ;        r8d = slot index (scaled by 16 here; not range-checked)
  ; Clobb: rax, rcx, rdx, r8, ymm0, ymm5, flags
  ;        (vzeroupper also clears the upper halves of all ymm registers)
  ;-----------------------------------------------------------------------
  1493. global rsaz_1024_scatter5_avx2
  1494. ALIGN 32
  1495. rsaz_1024_scatter5_avx2:
  1496. vzeroupper
  ; ymm5 = dword permutation {0,2,4,6,7,7,7,7}: gathers the low dword of
  ; each of the four qwords into the low 128 bits.
  1497. vmovdqu ymm5,YMMWORD[$L$scatter_permd]
  ; rcx = table + 16*index (the 32-bit shl zero-extends into r8)
  1498. shl r8d,4
  1499. lea rcx,[r8*1+rcx]
  ; nine groups of four digits
  1500. mov eax,9
  1501. jmp NEAR $L$oop_scatter_1024
  1502. ALIGN 32
  1503. $L$oop_scatter_1024:
  1504. vmovdqu ymm0,YMMWORD[rdx]
  1505. lea rdx,[32+rdx]
  1506. vpermd ymm0,ymm5,ymm0
  ; only the packed low 16 bytes are stored
  1507. vmovdqu XMMWORD[rcx],xmm0
  ; advance to the same slot in the next 512-byte row
  1508. lea rcx,[512+rcx]
  1509. dec eax
  1510. jnz NEAR $L$oop_scatter_1024
  1511. vzeroupper
  1512. DB 0F3h,0C3h ;repret
  ;-----------------------------------------------------------------------
  ; rsaz_1024_gather5_avx2
  ; Reads slot r8d back out of a table laid out as 9 rows of 512 bytes
  ; (32 slots of 16 bytes per row) and expands every stored dword to a
  ; zero-extended qword.  Every slot of every row is loaded and ANDed with
  ; an all-ones/all-zeros mask derived from the index, then ORed together,
  ; so the set of addresses touched does not depend on the index.
  ; ABI:   Win64; xmm6-xmm15 are saved on entry and restored on exit, and
  ;        the function is described by $L$SEH_info_rsaz_1024_gather5.
  ; In:    rcx = output, 10*32 = 320 bytes written (last 32 bytes are zero)
  ;        rdx = table base; read with vmovdqa/aligned memory operands, so
  ;              it must be 32-byte aligned
  ;        r8d = slot index (an index matching none of 0..31 yields zeros)
  ; Clobb: rax, rcx, rdx, r8, r10, r11, ymm0-ymm5, flags
  ;-----------------------------------------------------------------------
  1513. global rsaz_1024_gather5_avx2
  1514. ALIGN 32
  1515. rsaz_1024_gather5_avx2:
  1516. vzeroupper
  ; r11 = entry rsp, used to restore xmm6-15 and rsp at the end
  1517. mov r11,rsp
  1518. lea rax,[((-136))+rsp]
  ; The prologue below is emitted as raw bytes.
  ; NOTE(review): presumably hand-encoded so the instruction lengths stay
  ; fixed and match the prologue offsets recorded in the unwind codes.
  1519. $L$SEH_begin_rsaz_1024_gather5:
  1520. DB 0x48,0x8d,0x60,0xe0 ; lea rsp,[rax-0x20]  (rsp = entry rsp - 168)
  1521. DB 0xc5,0xf8,0x29,0x70,0xe0 ; vmovaps [rax-0x20],xmm6
  1522. DB 0xc5,0xf8,0x29,0x78,0xf0 ; vmovaps [rax-0x10],xmm7
  1523. DB 0xc5,0x78,0x29,0x40,0x00 ; vmovaps [rax+0x00],xmm8
  1524. DB 0xc5,0x78,0x29,0x48,0x10 ; vmovaps [rax+0x10],xmm9
  1525. DB 0xc5,0x78,0x29,0x50,0x20 ; vmovaps [rax+0x20],xmm10
  1526. DB 0xc5,0x78,0x29,0x58,0x30 ; vmovaps [rax+0x30],xmm11
  1527. DB 0xc5,0x78,0x29,0x60,0x40 ; vmovaps [rax+0x40],xmm12
  1528. DB 0xc5,0x78,0x29,0x68,0x50 ; vmovaps [rax+0x50],xmm13
  1529. DB 0xc5,0x78,0x29,0x70,0x60 ; vmovaps [rax+0x60],xmm14
  1530. DB 0xc5,0x78,0x29,0x78,0x70 ; vmovaps [rax+0x70],xmm15
  ; Reserve 256 bytes, 32-byte aligned, for the first eight masks.
  1531. lea rsp,[((-256))+rsp]
  1532. and rsp,-32
  1533. lea r10,[$L$inc]
  ; rax = rsp-128, so [(k+128)+rax] below addresses [rsp+k]
  1534. lea rax,[((-128))+rsp]
  1535. vmovd xmm4,r8d
  ; ymm0 = {0,0,0,0,1,1,1,1}, ymm1 = {2,2,2,2,3,3,3,3}, ymm5 = {4 x 8}
  1536. vmovdqa ymm0,YMMWORD[r10]
  1537. vmovdqa ymm1,YMMWORD[32+r10]
  1538. vmovdqa ymm5,YMMWORD[64+r10]
  ; ymm4 = index broadcast to all eight dwords
  1539. vpbroadcastd ymm4,xmm4
  ; Build 16 masks, each covering one pair of adjacent 16-byte slots
  ; (0,1), (2,3), ... (30,31): the slot-number vectors are stepped by 4
  ; and compared for equality with the index.  Masks for slot pairs 0..7
  ; go to the stack; masks for pairs 8..15 stay in ymm8..ymm15.
  1540. vpaddd ymm2,ymm0,ymm5
  1541. vpcmpeqd ymm0,ymm0,ymm4
  1542. vpaddd ymm3,ymm1,ymm5
  1543. vpcmpeqd ymm1,ymm1,ymm4
  1544. vmovdqa YMMWORD[(0+128)+rax],ymm0
  1545. vpaddd ymm0,ymm2,ymm5
  1546. vpcmpeqd ymm2,ymm2,ymm4
  1547. vmovdqa YMMWORD[(32+128)+rax],ymm1
  1548. vpaddd ymm1,ymm3,ymm5
  1549. vpcmpeqd ymm3,ymm3,ymm4
  1550. vmovdqa YMMWORD[(64+128)+rax],ymm2
  1551. vpaddd ymm2,ymm0,ymm5
  1552. vpcmpeqd ymm0,ymm0,ymm4
  1553. vmovdqa YMMWORD[(96+128)+rax],ymm3
  1554. vpaddd ymm3,ymm1,ymm5
  1555. vpcmpeqd ymm1,ymm1,ymm4
  1556. vmovdqa YMMWORD[(128+128)+rax],ymm0
  1557. vpaddd ymm8,ymm2,ymm5
  1558. vpcmpeqd ymm2,ymm2,ymm4
  1559. vmovdqa YMMWORD[(160+128)+rax],ymm1
  1560. vpaddd ymm9,ymm3,ymm5
  1561. vpcmpeqd ymm3,ymm3,ymm4
  1562. vmovdqa YMMWORD[(192+128)+rax],ymm2
  1563. vpaddd ymm10,ymm8,ymm5
  1564. vpcmpeqd ymm8,ymm8,ymm4
  1565. vmovdqa YMMWORD[(224+128)+rax],ymm3
  1566. vpaddd ymm11,ymm9,ymm5
  1567. vpcmpeqd ymm9,ymm9,ymm4
  1568. vpaddd ymm12,ymm10,ymm5
  1569. vpcmpeqd ymm10,ymm10,ymm4
  1570. vpaddd ymm13,ymm11,ymm5
  1571. vpcmpeqd ymm11,ymm11,ymm4
  1572. vpaddd ymm14,ymm12,ymm5
  1573. vpcmpeqd ymm12,ymm12,ymm4
  1574. vpaddd ymm15,ymm13,ymm5
  1575. vpcmpeqd ymm13,ymm13,ymm4
  1576. vpcmpeqd ymm14,ymm14,ymm4
  1577. vpcmpeqd ymm15,ymm15,ymm4
  ; ymm7 = the 32 bytes immediately before $L$inc, i.e. $L$gather_permd
  ; {0,7,1,7,2,7,3,7}
  1578. vmovdqa ymm7,YMMWORD[((-32))+r10]
  ; Bias the table pointer by +128; row displacements are relative to it.
  1579. lea rdx,[128+rdx]
  ; nine table rows
  1580. mov r8d,9
  1581. $L$oop_gather_1024:
  ; Mask all 16 slot pairs of this 512-byte row and OR them into two
  ; accumulators (ymm4, ymm5).
  1582. vmovdqa ymm0,YMMWORD[((0-128))+rdx]
  1583. vmovdqa ymm1,YMMWORD[((32-128))+rdx]
  1584. vmovdqa ymm2,YMMWORD[((64-128))+rdx]
  1585. vmovdqa ymm3,YMMWORD[((96-128))+rdx]
  1586. vpand ymm0,ymm0,YMMWORD[((0+128))+rax]
  1587. vpand ymm1,ymm1,YMMWORD[((32+128))+rax]
  1588. vpand ymm2,ymm2,YMMWORD[((64+128))+rax]
  1589. vpor ymm4,ymm1,ymm0
  1590. vpand ymm3,ymm3,YMMWORD[((96+128))+rax]
  1591. vmovdqa ymm0,YMMWORD[((128-128))+rdx]
  1592. vmovdqa ymm1,YMMWORD[((160-128))+rdx]
  1593. vpor ymm5,ymm3,ymm2
  1594. vmovdqa ymm2,YMMWORD[((192-128))+rdx]
  1595. vmovdqa ymm3,YMMWORD[((224-128))+rdx]
  1596. vpand ymm0,ymm0,YMMWORD[((128+128))+rax]
  1597. vpand ymm1,ymm1,YMMWORD[((160+128))+rax]
  1598. vpand ymm2,ymm2,YMMWORD[((192+128))+rax]
  1599. vpor ymm4,ymm4,ymm0
  1600. vpand ymm3,ymm3,YMMWORD[((224+128))+rax]
  1601. vpand ymm0,ymm8,YMMWORD[((256-128))+rdx]
  1602. vpor ymm5,ymm5,ymm1
  1603. vpand ymm1,ymm9,YMMWORD[((288-128))+rdx]
  1604. vpor ymm4,ymm4,ymm2
  1605. vpand ymm2,ymm10,YMMWORD[((320-128))+rdx]
  1606. vpor ymm5,ymm5,ymm3
  1607. vpand ymm3,ymm11,YMMWORD[((352-128))+rdx]
  1608. vpor ymm4,ymm4,ymm0
  1609. vpand ymm0,ymm12,YMMWORD[((384-128))+rdx]
  1610. vpor ymm5,ymm5,ymm1
  1611. vpand ymm1,ymm13,YMMWORD[((416-128))+rdx]
  1612. vpor ymm4,ymm4,ymm2
  1613. vpand ymm2,ymm14,YMMWORD[((448-128))+rdx]
  1614. vpor ymm5,ymm5,ymm3
  1615. vpand ymm3,ymm15,YMMWORD[((480-128))+rdx]
  1616. lea rdx,[512+rdx]
  1617. vpor ymm4,ymm4,ymm0
  1618. vpor ymm5,ymm5,ymm1
  1619. vpor ymm4,ymm4,ymm2
  1620. vpor ymm5,ymm5,ymm3
  1621. vpor ymm4,ymm4,ymm5
  ; Fold the two 128-bit halves: the selected slot is in one of them, the
  ; other is zero.  The xmm-width vpor also zeroes the upper half of ymm5.
  1622. vextracti128 xmm5,ymm4,1
  1623. vpor xmm5,xmm5,xmm4
  ; Spread the four dwords into the low halves of four qwords; permutation
  ; index 7 picks a dword from the zeroed upper half, giving zero high
  ; halves.
  1624. vpermd ymm5,ymm7,ymm5
  1625. vmovdqu YMMWORD[rcx],ymm5
  1626. lea rcx,[32+rcx]
  1627. dec r8d
  1628. jnz NEAR $L$oop_gather_1024
  ; Append one all-zero 32-byte block after the nine gathered ones.
  1629. vpxor ymm0,ymm0,ymm0
  1630. vmovdqu YMMWORD[rcx],ymm0
  1631. vzeroupper
  ; Restore xmm6-15 from the save area (entry rsp - 168 upward) and rsp.
  1632. movaps xmm6,XMMWORD[((-168))+r11]
  1633. movaps xmm7,XMMWORD[((-152))+r11]
  1634. movaps xmm8,XMMWORD[((-136))+r11]
  1635. movaps xmm9,XMMWORD[((-120))+r11]
  1636. movaps xmm10,XMMWORD[((-104))+r11]
  1637. movaps xmm11,XMMWORD[((-88))+r11]
  1638. movaps xmm12,XMMWORD[((-72))+r11]
  1639. movaps xmm13,XMMWORD[((-56))+r11]
  1640. movaps xmm14,XMMWORD[((-40))+r11]
  1641. movaps xmm15,XMMWORD[((-24))+r11]
  1642. lea rsp,[r11]
  1643. DB 0F3h,0C3h ;repret
  1644. $L$SEH_end_rsaz_1024_gather5:
  ; Constant tables shared by the routines in this file.  Each of the first
  ; three tables is 32 bytes, so starting from the 64-byte boundary every
  ; label here is 32-byte aligned.  The order matters: gather5 addresses
  ; $L$gather_permd as [$L$inc-32].
  1645. ALIGN 64
  ; Four qwords of 2^29-1: masks each 64-bit lane down to one 29-bit digit.
  1646. $L$and_mask:
  1647. DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
  ; vpermd indices used by scatter5: pack dwords 0,2,4,6 (the low dword of
  ; each qword) into the low 128 bits; the upper four entries are not
  ; stored by scatter5.
  1648. $L$scatter_permd:
  1649. DD 0,2,4,6,7,7,7,7
  ; vpermd indices used by gather5: place dwords 0..3 in the even dword
  ; positions and dword 7 in the odd ones.
  1650. $L$gather_permd:
  1651. DD 0,7,1,7,2,7,3,7
  ; gather5 slot numbering: initial slot numbers for the two mask streams
  ; ({0,1} and {2,3}, four dwords each) followed by the per-step increment
  ; of 4.
  1652. $L$inc:
  1653. DD 0,0,0,0,1,1,1,1
  1654. DD 2,2,2,2,3,3,3,3
  1655. DD 4,4,4,4,4,4,4,4
  1656. ALIGN 64
  ;-----------------------------------------------------------------------
  ; rsaz_se_handler
  ; Language-specific exception handler registered (via .xdata) for
  ; rsaz_1024_sqr_avx2 and rsaz_1024_mul_avx2.  It reconstructs the
  ; caller's non-volatile registers in the context record, lets
  ; RtlVirtualUnwind continue the unwind, and returns 1.
  ; In:    r8 = context record, r9 = dispatcher context (third and fourth
  ;        Win64 arguments; rcx and rdx are not read).
  ; Out:   eax = 1
  ;        NOTE(review): presumably ExceptionContinueSearch - confirm.
  ; Field names in the comments below follow the Win64 CONTEXT and
  ; DISPATCHER_CONTEXT layouts; the numeric offsets are hard-coded here, so
  ; verify them against winnt.h before relying on the names.
  ; Handler data (read through offset 56 of r9) is three image-relative
  ; addresses: body label, epilogue label, "in tail" label.
  ;-----------------------------------------------------------------------
  1657. EXTERN __imp_RtlVirtualUnwind
  1658. ALIGN 16
  1659. rsaz_se_handler:
  1660. push rsi
  1661. push rdi
  1662. push rbx
  1663. push rbp
  1664. push r12
  1665. push r13
  1666. push r14
  1667. push r15
  1668. pushfq
  ; 64 bytes: 32 bytes of shadow space plus four stack arguments for the
  ; RtlVirtualUnwind call below
  1669. sub rsp,64
  1670. mov rax,QWORD[120+r8] ; context->Rax
  1671. mov rbx,QWORD[248+r8] ; context->Rip
  1672. mov rsi,QWORD[8+r9] ; disp->ImageBase
  1673. mov r11,QWORD[56+r9] ; disp->HandlerData
  ; Fault before the body label: nothing has been saved yet, use the tail.
  1674. mov r10d,DWORD[r11]
  1675. lea r10,[r10*1+rsi]
  1676. cmp rbx,r10
  1677. jb NEAR $L$common_seh_tail
  ; Fault at or after the epilogue label: registers are already restored.
  1678. mov r10d,DWORD[4+r11]
  1679. lea r10,[r10*1+rsi]
  1680. cmp rbx,r10
  1681. jae NEAR $L$common_seh_tail
  ; Inside the body the frame base is taken from context->Rbp; from the
  ; "in tail" label onward it is taken from context->Rax instead (the mul
  ; routine executes "mov rax,rbp" immediately before that label).
  1682. mov rbp,QWORD[160+r8] ; context->Rbp
  1683. mov r10d,DWORD[8+r11]
  1684. lea r10,[r10*1+rsi]
  1685. cmp rbx,r10
  1686. cmovc rax,rbp
  ; Reload the six saved general-purpose registers from just below the
  ; frame base (same offsets as the function tail uses) and write them
  ; into the context record.
  1687. mov r15,QWORD[((-48))+rax]
  1688. mov r14,QWORD[((-40))+rax]
  1689. mov r13,QWORD[((-32))+rax]
  1690. mov r12,QWORD[((-24))+rax]
  1691. mov rbp,QWORD[((-16))+rax]
  1692. mov rbx,QWORD[((-8))+rax]
  1693. mov QWORD[240+r8],r15
  1694. mov QWORD[232+r8],r14
  1695. mov QWORD[224+r8],r13
  1696. mov QWORD[216+r8],r12
  1697. mov QWORD[160+r8],rbp
  1698. mov QWORD[144+r8],rbx
  ; Copy the ten saved xmm registers (20 qwords starting at frame-216) to
  ; offset 512 of the context record (context->Xmm6 onward).
  1699. lea rsi,[((-216))+rax]
  1700. lea rdi,[512+r8]
  1701. mov ecx,20
  1702. DD 0xa548f3fc ; bytes fc f3 48 a5 = cld; rep movsq
  1703. $L$common_seh_tail:
  ; rax is the stack pointer value at function entry; the two slots above
  ; the return address hold the saved rdi/rsi (see ";WIN64 epilogue").
  1704. mov rdi,QWORD[8+rax]
  1705. mov rsi,QWORD[16+rax]
  1706. mov QWORD[152+r8],rax ; context->Rsp
  1707. mov QWORD[168+r8],rsi ; context->Rsi
  1708. mov QWORD[176+r8],rdi ; context->Rdi
  ; Copy the updated context (154 qwords) into disp->ContextRecord.
  1709. mov rdi,QWORD[40+r9]
  1710. mov rsi,r8
  1711. mov ecx,154
  1712. DD 0xa548f3fc ; cld; rep movsq
  ; Call RtlVirtualUnwind with eight arguments built from the dispatcher
  ; context: four in rcx, rdx, r8, r9 and four on the stack above the
  ; shadow space.
  1713. mov rsi,r9
  1714. xor rcx,rcx ; arg1 = 0
  1715. mov rdx,QWORD[8+rsi] ; arg2 = disp->ImageBase
  1716. mov r8,QWORD[rsi] ; arg3 = disp->ControlPc
  1717. mov r9,QWORD[16+rsi] ; arg4 = disp->FunctionEntry
  1718. mov r10,QWORD[40+rsi] ; disp->ContextRecord
  1719. lea r11,[56+rsi] ; &disp->HandlerData
  1720. lea r12,[24+rsi] ; &disp->EstablisherFrame
  1721. mov QWORD[32+rsp],r10 ; arg5
  1722. mov QWORD[40+rsp],r11 ; arg6
  1723. mov QWORD[48+rsp],r12 ; arg7
  1724. mov QWORD[56+rsp],rcx ; arg8 = 0
  1725. call QWORD[__imp_RtlVirtualUnwind]
  1726. mov eax,1
  1727. add rsp,64
  1728. popfq
  1729. pop r15
  1730. pop r14
  1731. pop r13
  1732. pop r12
  1733. pop rbp
  1734. pop rbx
  1735. pop rdi
  1736. pop rsi
  1737. DB 0F3h,0C3h ;repret
  ; Win64 function table: one entry per function that has unwind data.
  ; Each entry is three image-relative addresses: function start, function
  ; end, and the matching unwind-info record in .xdata.
  1738. section .pdata rdata align=4
  1739. ALIGN 4
  ; rsaz_1024_sqr_avx2
  1740. DD $L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase
  1741. DD $L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase
  1742. DD $L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase
  ; rsaz_1024_mul_avx2
  1743. DD $L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase
  1744. DD $L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase
  1745. DD $L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase
  ; rsaz_1024_gather5_avx2 (range starts after its first three
  ; instructions, at the hand-encoded prologue)
  1746. DD $L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase
  1747. DD $L$SEH_end_rsaz_1024_gather5 wrt ..imagebase
  1748. DD $L$SEH_info_rsaz_1024_gather5 wrt ..imagebase
  ; Unwind-info records referenced from .pdata.
  1749. section .xdata rdata align=8
  1750. ALIGN 8
  ; sqr/mul: header byte 9 = version 1 with the exception-handler flag, no
  ; unwind codes.  The handler address is followed by the handler data
  ; that rsaz_se_handler reads as three DWORDs: body, epilogue and
  ; "in tail" labels.
  1751. $L$SEH_info_rsaz_1024_sqr_avx2:
  1752. DB 9,0,0,0
  1753. DD rsaz_se_handler wrt ..imagebase
  1754. DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase
  1755. DD 0
  1756. $L$SEH_info_rsaz_1024_mul_avx2:
  1757. DB 9,0,0,0
  1758. DD rsaz_se_handler wrt ..imagebase
  1759. DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase
  1760. DD 0
  ; gather5: plain unwind codes, no handler.  Decoded per the Win64
  ; UNWIND_INFO / UNWIND_CODE format; each code starts with the prologue
  ; offset just past the instruction it describes, listed in reverse
  ; prologue order.  Offsets are relative to $L$SEH_begin_rsaz_1024_gather5.
  1761. $L$SEH_info_rsaz_1024_gather5:
  ; version 1, prologue size 0x36, 0x17 code slots, frame register r11
  1762. DB 0x01,0x36,0x17,0x0b
  1763. DB 0x36,0xf8,0x09,0x00 ; save xmm15 at [rsp+0x90]
  1764. DB 0x31,0xe8,0x08,0x00 ; save xmm14 at [rsp+0x80]
  1765. DB 0x2c,0xd8,0x07,0x00 ; save xmm13 at [rsp+0x70]
  1766. DB 0x27,0xc8,0x06,0x00 ; save xmm12 at [rsp+0x60]
  1767. DB 0x22,0xb8,0x05,0x00 ; save xmm11 at [rsp+0x50]
  1768. DB 0x1d,0xa8,0x04,0x00 ; save xmm10 at [rsp+0x40]
  1769. DB 0x18,0x98,0x03,0x00 ; save xmm9 at [rsp+0x30]
  1770. DB 0x13,0x88,0x02,0x00 ; save xmm8 at [rsp+0x20]
  1771. DB 0x0e,0x78,0x01,0x00 ; save xmm7 at [rsp+0x10]
  1772. DB 0x09,0x68,0x00,0x00 ; save xmm6 at [rsp+0x00]
  1773. DB 0x04,0x01,0x15,0x00 ; allocate 0x15*8 = 168 bytes of stack
  1774. DB 0x00,0xb3,0x00,0x00 ; set frame register r11; last two bytes pad