ghash-x86_64.asm 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221
  1. ; This file is generated from a similarly-named Perl script in the BoringSSL
  2. ; source tree. Do not edit by hand.
  3. default rel
  4. %define XMMWORD
  5. %define YMMWORD
  6. %define ZMMWORD
  7. %ifdef BORINGSSL_PREFIX
  8. %include "boringssl_prefix_symbols_nasm.inc"
  9. %endif
  10. section .text code align=64
  11. EXTERN OPENSSL_ia32cap_P
  ; ---------------------------------------------------------------------
  ; gcm_init_clmul
  ;
  ; Registers, as used by the code below:
  ;   rdx -> 16-byte input value (loaded unaligned)
  ;   rcx -> output table; six 16-byte entries are stored at offsets
  ;          0, 16, 32, 48, 64 and 80 (96 bytes in total)
  ; Uses xmm0-xmm6. xmm6 is spilled to a 24-byte stack frame in the
  ; prologue and reloaded in the epilogue (xmm6 is callee-saved in the
  ; Microsoft x64 convention).
  ; NOTE(review): presumably matches a C prototype of the form
  ;   void gcm_init_clmul(u128 Htable[16], const uint64_t H[2])
  ; -- confirm against the BoringSSL GCM header.
  ;
  ; Raw-byte encodings used here:
  ;   DB 102,15,58,68,modrm,imm  = pclmulqdq xmmA,xmmB,imm (66 0F 3A 44 /r ib)
  ;   DB 102,15,58,15,modrm,imm  = palignr   xmmA,xmmB,imm (66 0F 3A 0F /r ib)
  ; ---------------------------------------------------------------------
  12. global gcm_init_clmul
  13. ALIGN 16
  14. gcm_init_clmul:
  15. $L$_init_clmul:
  16. $L$SEH_begin_gcm_init_clmul:
  ; Prologue emitted as raw bytes so that its length is fixed for the
  ; unwind data.
  17. DB 0x48,0x83,0xec,0x18 ; sub rsp,0x18
  18. DB 0x0f,0x29,0x34,0x24 ; movaps [rsp],xmm6
  19. movdqu xmm2,XMMWORD[rdx]
  20. pshufd xmm2,xmm2,78 ; swap the two 64-bit halves
  ; Shift the 128-bit value in xmm2 left by one bit (carry from the low
  ; qword into the high qword), and XOR in $L$0x1c2_polynomial when the
  ; top bit of the pre-shift value was set (xmm5 becomes an all-ones or
  ; all-zero mask via pcmpgtd against the broadcast top dword).
  21. pshufd xmm4,xmm2,255
  22. movdqa xmm3,xmm2
  23. psllq xmm2,1
  24. pxor xmm5,xmm5
  25. psrlq xmm3,63
  26. pcmpgtd xmm5,xmm4
  27. pslldq xmm3,8
  28. por xmm2,xmm3
  29. pand xmm5,XMMWORD[$L$0x1c2_polynomial]
  30. pxor xmm2,xmm5
  ; xmm6 = (high qword ^ low qword) of xmm2, the fixed operand of the
  ; middle (Karatsuba) product in every multiplication below.
  31. pshufd xmm6,xmm2,78
  32. movdqa xmm0,xmm2
  33. pxor xmm6,xmm2
  ; --- Multiplication 1: xmm0 = xmm0 * xmm2 (carry-less), then reduce ---
  34. movdqa xmm1,xmm0
  35. pshufd xmm3,xmm0,78
  36. pxor xmm3,xmm0
  37. DB 102,15,58,68,194,0 ; pclmulqdq xmm0,xmm2,0x00 (low  x low)
  38. DB 102,15,58,68,202,17 ; pclmulqdq xmm1,xmm2,0x11 (high x high)
  39. DB 102,15,58,68,222,0 ; pclmulqdq xmm3,xmm6,0x00 (folded halves)
  40. pxor xmm3,xmm0
  41. pxor xmm3,xmm1
  ; Fold the middle term into the 256-bit product held in xmm1:xmm0.
  42. movdqa xmm4,xmm3
  43. psrldq xmm3,8
  44. pslldq xmm4,8
  45. pxor xmm1,xmm3
  46. pxor xmm0,xmm4
  ; Reduction, first phase: per 64-bit lane this builds
  ; (x<<57) ^ (x<<62) ^ (x<<63) from shifts by 5, 1 and 57.
  47. movdqa xmm4,xmm0
  48. movdqa xmm3,xmm0
  49. psllq xmm0,5
  50. pxor xmm3,xmm0
  51. psllq xmm0,1
  52. pxor xmm0,xmm3
  53. psllq xmm0,57
  54. movdqa xmm3,xmm0
  55. pslldq xmm0,8
  56. psrldq xmm3,8
  57. pxor xmm0,xmm4
  58. pxor xmm1,xmm3
  ; Reduction, second phase: right shifts by 1, 5 and 1 combined with
  ; XORs, then the high half (xmm1) is folded in. Result is in xmm0.
  59. movdqa xmm4,xmm0
  60. psrlq xmm0,1
  61. pxor xmm1,xmm4
  62. pxor xmm4,xmm0
  63. psrlq xmm0,5
  64. pxor xmm0,xmm4
  65. psrlq xmm0,1
  66. pxor xmm0,xmm1
  ; Store xmm2 and the product, plus one entry that packs a qword of
  ; each value's (high ^ low) fold.
  67. pshufd xmm3,xmm2,78
  68. pshufd xmm4,xmm0,78
  69. pxor xmm3,xmm2
  70. movdqu XMMWORD[rcx],xmm2
  71. pxor xmm4,xmm0
  72. movdqu XMMWORD[16+rcx],xmm0
  73. DB 102,15,58,15,227,8 ; palignr xmm4,xmm3,8
  74. movdqu XMMWORD[32+rcx],xmm4
  ; --- Multiplication 2: xmm0 = xmm0 * xmm2, then reduce ---
  75. movdqa xmm1,xmm0
  76. pshufd xmm3,xmm0,78
  77. pxor xmm3,xmm0
  78. DB 102,15,58,68,194,0 ; pclmulqdq xmm0,xmm2,0x00
  79. DB 102,15,58,68,202,17 ; pclmulqdq xmm1,xmm2,0x11
  80. DB 102,15,58,68,222,0 ; pclmulqdq xmm3,xmm6,0x00
  81. pxor xmm3,xmm0
  82. pxor xmm3,xmm1
  83. movdqa xmm4,xmm3
  84. psrldq xmm3,8
  85. pslldq xmm4,8
  86. pxor xmm1,xmm3
  87. pxor xmm0,xmm4
  88. movdqa xmm4,xmm0
  89. movdqa xmm3,xmm0
  90. psllq xmm0,5
  91. pxor xmm3,xmm0
  92. psllq xmm0,1
  93. pxor xmm0,xmm3
  94. psllq xmm0,57
  95. movdqa xmm3,xmm0
  96. pslldq xmm0,8
  97. psrldq xmm3,8
  98. pxor xmm0,xmm4
  99. pxor xmm1,xmm3
  100. movdqa xmm4,xmm0
  101. psrlq xmm0,1
  102. pxor xmm1,xmm4
  103. pxor xmm4,xmm0
  104. psrlq xmm0,5
  105. pxor xmm0,xmm4
  106. psrlq xmm0,1
  107. pxor xmm0,xmm1
  108. movdqa xmm5,xmm0 ; keep this product; it is stored at [48+rcx] below
  ; --- Multiplication 3: xmm0 = xmm0 * xmm2, then reduce ---
  109. movdqa xmm1,xmm0
  110. pshufd xmm3,xmm0,78
  111. pxor xmm3,xmm0
  112. DB 102,15,58,68,194,0 ; pclmulqdq xmm0,xmm2,0x00
  113. DB 102,15,58,68,202,17 ; pclmulqdq xmm1,xmm2,0x11
  114. DB 102,15,58,68,222,0 ; pclmulqdq xmm3,xmm6,0x00
  115. pxor xmm3,xmm0
  116. pxor xmm3,xmm1
  117. movdqa xmm4,xmm3
  118. psrldq xmm3,8
  119. pslldq xmm4,8
  120. pxor xmm1,xmm3
  121. pxor xmm0,xmm4
  122. movdqa xmm4,xmm0
  123. movdqa xmm3,xmm0
  124. psllq xmm0,5
  125. pxor xmm3,xmm0
  126. psllq xmm0,1
  127. pxor xmm0,xmm3
  128. psllq xmm0,57
  129. movdqa xmm3,xmm0
  130. pslldq xmm0,8
  131. psrldq xmm3,8
  132. pxor xmm0,xmm4
  133. pxor xmm1,xmm3
  134. movdqa xmm4,xmm0
  135. psrlq xmm0,1
  136. pxor xmm1,xmm4
  137. pxor xmm4,xmm0
  138. psrlq xmm0,5
  139. pxor xmm0,xmm4
  140. psrlq xmm0,1
  141. pxor xmm0,xmm1
  ; Store the last two products and their packed (high ^ low) folds.
  142. pshufd xmm3,xmm5,78
  143. pshufd xmm4,xmm0,78
  144. pxor xmm3,xmm5
  145. movdqu XMMWORD[48+rcx],xmm5
  146. pxor xmm4,xmm0
  147. movdqu XMMWORD[64+rcx],xmm0
  148. DB 102,15,58,15,227,8 ; palignr xmm4,xmm3,8
  149. movdqu XMMWORD[80+rcx],xmm4
  ; Epilogue: restore xmm6 and release the 24-byte frame.
  150. movaps xmm6,XMMWORD[rsp]
  151. lea rsp,[24+rsp]
  152. $L$SEH_end_gcm_init_clmul:
  153. DB 0F3h,0C3h ;repret
  ; ---------------------------------------------------------------------
  ; gcm_gmult_clmul
  ;
  ; Registers, as used by the code below:
  ;   rcx -> 16-byte value, updated in place (loaded, byte-reversed with
  ;          $L$bswap_mask, multiplied, byte-reversed again, stored)
  ;   rdx -> table; the 16-byte entries at offsets 0 and 32 are read
  ; Uses xmm0-xmm5 only and does not touch the stack.
  ; NOTE(review): presumably matches
  ;   void gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16])
  ; -- confirm against the BoringSSL GCM header.
  ;
  ; Raw-byte encodings used here:
  ;   DB 102,15,56,0,modrm       = pshufb    xmmA,xmmB     (66 0F 38 00 /r)
  ;   DB 102,15,58,68,modrm,imm  = pclmulqdq xmmA,xmmB,imm (66 0F 3A 44 /r ib)
  ; ---------------------------------------------------------------------
  154. global gcm_gmult_clmul
  155. ALIGN 16
  156. gcm_gmult_clmul:
  157. $L$_gmult_clmul:
  158. movdqu xmm0,XMMWORD[rcx]
  159. movdqa xmm5,XMMWORD[$L$bswap_mask]
  160. movdqu xmm2,XMMWORD[rdx]
  161. movdqu xmm4,XMMWORD[32+rdx]
  162. DB 102,15,56,0,197 ; pshufb xmm0,xmm5 (reverse byte order)
  ; Carry-less multiply xmm0 by xmm2 using three pclmulqdq (Karatsuba);
  ; the low qword of xmm4 supplies the pre-folded operand for the
  ; middle product.
  163. movdqa xmm1,xmm0
  164. pshufd xmm3,xmm0,78
  165. pxor xmm3,xmm0
  166. DB 102,15,58,68,194,0 ; pclmulqdq xmm0,xmm2,0x00
  167. DB 102,15,58,68,202,17 ; pclmulqdq xmm1,xmm2,0x11
  168. DB 102,15,58,68,220,0 ; pclmulqdq xmm3,xmm4,0x00
  169. pxor xmm3,xmm0
  170. pxor xmm3,xmm1
  ; Fold the middle term into the 256-bit product xmm1:xmm0.
  171. movdqa xmm4,xmm3
  172. psrldq xmm3,8
  173. pslldq xmm4,8
  174. pxor xmm1,xmm3
  175. pxor xmm0,xmm4
  ; Reduction, first phase (left shifts by 5, 1, 57 per 64-bit lane).
  176. movdqa xmm4,xmm0
  177. movdqa xmm3,xmm0
  178. psllq xmm0,5
  179. pxor xmm3,xmm0
  180. psllq xmm0,1
  181. pxor xmm0,xmm3
  182. psllq xmm0,57
  183. movdqa xmm3,xmm0
  184. pslldq xmm0,8
  185. psrldq xmm3,8
  186. pxor xmm0,xmm4
  187. pxor xmm1,xmm3
  ; Reduction, second phase (right shifts by 1, 5, 1); result in xmm0.
  188. movdqa xmm4,xmm0
  189. psrlq xmm0,1
  190. pxor xmm1,xmm4
  191. pxor xmm4,xmm0
  192. psrlq xmm0,5
  193. pxor xmm0,xmm4
  194. psrlq xmm0,1
  195. pxor xmm0,xmm1
  196. DB 102,15,56,0,197 ; pshufb xmm0,xmm5 (back to memory byte order)
  197. movdqu XMMWORD[rcx],xmm0
  198. DB 0F3h,0C3h ;repret
  ; ---------------------------------------------------------------------
  ; gcm_ghash_clmul
  ;
  ; Registers, as used by the code below:
  ;   rcx -> 16-byte accumulator; loaded at entry, byte-reversed with
  ;          $L$bswap_mask, and written back (byte-reversed again) at
  ;          $L$done
  ;   rdx -> table of 16-byte entries; offsets 0,16,32,48,64,80 are read
  ;   r8  -> input data, consumed in 16-byte blocks
  ;   r9  =  input length in bytes
  ;   xmm0 = running accumulator, xmm10 = byte-swap mask
  ; NOTE(review): the length handling (sub r9,0x10 / jz ...) appears to
  ; assume r9 is a non-zero multiple of 16 -- confirm with callers.
  ; NOTE(review): presumably matches
  ;   void gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16],
  ;                        const uint8_t *in, size_t len)
  ; -- confirm against the BoringSSL GCM header.
  ;
  ; Stack: a 168-byte frame holding xmm6-xmm15 (callee-saved in the
  ; Microsoft x64 convention), restored before returning.
  ;
  ; Code paths:
  ;   $L$mod4_loop / $L$tail4x : four blocks per iteration
  ;   $L$skip4x / $L$mod_loop / $L$even_tail : two blocks per iteration
  ;   $L$odd_tail : one final block
  ;
  ; Raw-byte encodings used here (a byte 65/68/69/76 after 102 is a REX
  ; prefix that selects xmm8-xmm15 and/or a 64-bit GPR operand):
  ;   DB 102,[rex,]15,56,0,modrm      = pshufb    (66 [REX] 0F 38 00 /r)
  ;   DB 102,[rex,]15,58,68,modrm,imm = pclmulqdq (66 [REX] 0F 3A 44 /r ib)
  ;   DB 102,76,15,110,200            = movq xmm9,rax
  ; ---------------------------------------------------------------------
  199. global gcm_ghash_clmul
  200. ALIGN 32
  201. gcm_ghash_clmul:
  202. $L$_ghash_clmul:
  203. lea rax,[((-136))+rsp]
  204. $L$SEH_begin_gcm_ghash_clmul:
  ; Prologue as raw bytes (fixed length for the unwind data):
  ; allocate the frame through rax and spill xmm6-xmm15.
  205. DB 0x48,0x8d,0x60,0xe0 ; lea rsp,[rax-0x20]
  206. DB 0x0f,0x29,0x70,0xe0 ; movaps [rax-0x20],xmm6
  207. DB 0x0f,0x29,0x78,0xf0 ; movaps [rax-0x10],xmm7
  208. DB 0x44,0x0f,0x29,0x00 ; movaps [rax],xmm8
  209. DB 0x44,0x0f,0x29,0x48,0x10 ; movaps [rax+0x10],xmm9
  210. DB 0x44,0x0f,0x29,0x50,0x20 ; movaps [rax+0x20],xmm10
  211. DB 0x44,0x0f,0x29,0x58,0x30 ; movaps [rax+0x30],xmm11
  212. DB 0x44,0x0f,0x29,0x60,0x40 ; movaps [rax+0x40],xmm12
  213. DB 0x44,0x0f,0x29,0x68,0x50 ; movaps [rax+0x50],xmm13
  214. DB 0x44,0x0f,0x29,0x70,0x60 ; movaps [rax+0x60],xmm14
  215. DB 0x44,0x0f,0x29,0x78,0x70 ; movaps [rax+0x70],xmm15
  216. movdqa xmm10,XMMWORD[$L$bswap_mask]
  217. movdqu xmm0,XMMWORD[rcx]
  218. movdqu xmm2,XMMWORD[rdx]
  219. movdqu xmm7,XMMWORD[32+rdx]
  220. DB 102,65,15,56,0,194 ; pshufb xmm0,xmm10
  ; From here r9 is biased by -16. Exactly one block => $L$odd_tail.
  221. sub r9,0x10
  222. jz NEAR $L$odd_tail
  223. movdqu xmm6,XMMWORD[16+rdx]
  ; Choose between the 4-block and 2-block paths. The 4-block path
  ; needs r9 >= 0x30 here (at least 64 input bytes) and is skipped when
  ; (dword at OPENSSL_ia32cap_P+4) & 0x04400000 == 0x00400000.
  ; NOTE(review): presumably a CPU-model check where the 4x path is not
  ; a win -- confirm the meaning of these capability bits.
  224. lea rax,[OPENSSL_ia32cap_P]
  225. mov eax,DWORD[4+rax]
  226. cmp r9,0x30
  227. jb NEAR $L$skip4x
  228. and eax,71303168
  229. cmp eax,4194304
  230. je NEAR $L$skip4x
  ; ---- 4-block path: set-up --------------------------------------------
  231. sub r9,0x30
  ; rax = eight packed bytes, moved into xmm9 inside the loop and used
  ; there as a pshufb lookup table indexed by (xmm0 & $L$7_mask).
  232. mov rax,0xA040608020C0E000
  233. movdqu xmm14,XMMWORD[48+rdx]
  234. movdqu xmm15,XMMWORD[64+rdx]
  235. movdqu xmm3,XMMWORD[48+r8]
  236. movdqu xmm11,XMMWORD[32+r8]
  237. DB 102,65,15,56,0,218 ; pshufb xmm3,xmm10
  238. DB 102,69,15,56,0,218 ; pshufb xmm11,xmm10
  ; Block at [48+r8] times table entry 0 (xmm2); folded operand in xmm7.
  239. movdqa xmm5,xmm3
  240. pshufd xmm4,xmm3,78
  241. pxor xmm4,xmm3
  242. DB 102,15,58,68,218,0 ; pclmulqdq xmm3,xmm2,0x00
  243. DB 102,15,58,68,234,17 ; pclmulqdq xmm5,xmm2,0x11
  244. DB 102,15,58,68,231,0 ; pclmulqdq xmm4,xmm7,0x00
  ; Block at [32+r8] times table entry at offset 16 (xmm6).
  245. movdqa xmm13,xmm11
  246. pshufd xmm12,xmm11,78
  247. pxor xmm12,xmm11
  248. DB 102,68,15,58,68,222,0 ; pclmulqdq xmm11,xmm6,0x00
  249. DB 102,68,15,58,68,238,17 ; pclmulqdq xmm13,xmm6,0x11
  250. DB 102,68,15,58,68,231,16 ; pclmulqdq xmm12,xmm7,0x10
  251. xorps xmm3,xmm11
  252. xorps xmm5,xmm13
  253. movups xmm7,XMMWORD[80+rdx]
  254. xorps xmm4,xmm12
  255. movdqu xmm11,XMMWORD[16+r8]
  256. movdqu xmm8,XMMWORD[r8]
  257. DB 102,69,15,56,0,218 ; pshufb xmm11,xmm10
  258. DB 102,69,15,56,0,194 ; pshufb xmm8,xmm10
  259. movdqa xmm13,xmm11
  260. pshufd xmm12,xmm11,78
  261. pxor xmm0,xmm8 ; accumulator ^= first block of the group
  262. pxor xmm12,xmm11
  263. DB 102,69,15,58,68,222,0 ; pclmulqdq xmm11,xmm14,0x00
  264. movdqa xmm1,xmm0
  265. pshufd xmm8,xmm0,78
  266. pxor xmm8,xmm0
  267. DB 102,69,15,58,68,238,17 ; pclmulqdq xmm13,xmm14,0x11
  268. DB 102,68,15,58,68,231,0 ; pclmulqdq xmm12,xmm7,0x00
  269. xorps xmm3,xmm11
  270. xorps xmm5,xmm13
  271. lea r8,[64+r8]
  272. sub r9,0x40
  273. jc NEAR $L$tail4x
  274. jmp NEAR $L$mod4_loop
  275. ALIGN 32
  ; ---- 4-block main loop -------------------------------------------------
  ; Each iteration finishes the multiply/reduce of the previous group
  ; while starting the products for the next four blocks; the two
  ; instruction streams are interleaved.
  276. $L$mod4_loop:
  277. DB 102,65,15,58,68,199,0 ; pclmulqdq xmm0,xmm15,0x00
  278. xorps xmm4,xmm12
  279. movdqu xmm11,XMMWORD[48+r8]
  280. DB 102,69,15,56,0,218 ; pshufb xmm11,xmm10
  281. DB 102,65,15,58,68,207,17 ; pclmulqdq xmm1,xmm15,0x11
  282. xorps xmm0,xmm3
  283. movdqu xmm3,XMMWORD[32+r8]
  284. movdqa xmm13,xmm11
  285. DB 102,68,15,58,68,199,16 ; pclmulqdq xmm8,xmm7,0x10
  286. pshufd xmm12,xmm11,78
  287. xorps xmm1,xmm5
  288. pxor xmm12,xmm11
  289. DB 102,65,15,56,0,218 ; pshufb xmm3,xmm10
  290. movups xmm7,XMMWORD[32+rdx]
  291. xorps xmm8,xmm4
  292. DB 102,68,15,58,68,218,0 ; pclmulqdq xmm11,xmm2,0x00
  293. pshufd xmm4,xmm3,78
  294. pxor xmm8,xmm0
  295. movdqa xmm5,xmm3
  296. pxor xmm8,xmm1
  297. pxor xmm4,xmm3
  298. movdqa xmm9,xmm8
  299. DB 102,68,15,58,68,234,17 ; pclmulqdq xmm13,xmm2,0x11
  300. pslldq xmm8,8
  301. psrldq xmm9,8
  302. pxor xmm0,xmm8
  ; First reduction phase via table lookup: index = xmm0 & 7 per qword,
  ; table = the constant held in rax.
  303. movdqa xmm8,XMMWORD[$L$7_mask]
  304. pxor xmm1,xmm9
  305. DB 102,76,15,110,200 ; movq xmm9,rax
  306. pand xmm8,xmm0
  307. DB 102,69,15,56,0,200 ; pshufb xmm9,xmm8
  308. pxor xmm9,xmm0
  309. DB 102,68,15,58,68,231,0 ; pclmulqdq xmm12,xmm7,0x00
  310. psllq xmm9,57
  311. movdqa xmm8,xmm9
  312. pslldq xmm9,8
  313. DB 102,15,58,68,222,0 ; pclmulqdq xmm3,xmm6,0x00
  314. psrldq xmm8,8
  315. pxor xmm0,xmm9
  316. pxor xmm1,xmm8
  317. movdqu xmm8,XMMWORD[r8]
  ; Second reduction phase (right shifts by 1, 5, 1), interleaved with
  ; the remaining products for the next group.
  318. movdqa xmm9,xmm0
  319. psrlq xmm0,1
  320. DB 102,15,58,68,238,17 ; pclmulqdq xmm5,xmm6,0x11
  321. xorps xmm3,xmm11
  322. movdqu xmm11,XMMWORD[16+r8]
  323. DB 102,69,15,56,0,218 ; pshufb xmm11,xmm10
  324. DB 102,15,58,68,231,16 ; pclmulqdq xmm4,xmm7,0x10
  325. xorps xmm5,xmm13
  326. movups xmm7,XMMWORD[80+rdx]
  327. DB 102,69,15,56,0,194 ; pshufb xmm8,xmm10
  328. pxor xmm1,xmm9
  329. pxor xmm9,xmm0
  330. psrlq xmm0,5
  331. movdqa xmm13,xmm11
  332. pxor xmm4,xmm12
  333. pshufd xmm12,xmm11,78
  334. pxor xmm0,xmm9
  335. pxor xmm1,xmm8
  336. pxor xmm12,xmm11
  337. DB 102,69,15,58,68,222,0 ; pclmulqdq xmm11,xmm14,0x00
  338. psrlq xmm0,1
  339. pxor xmm0,xmm1
  340. movdqa xmm1,xmm0
  341. DB 102,69,15,58,68,238,17 ; pclmulqdq xmm13,xmm14,0x11
  342. xorps xmm3,xmm11
  343. pshufd xmm8,xmm0,78
  344. pxor xmm8,xmm0
  345. DB 102,68,15,58,68,231,0 ; pclmulqdq xmm12,xmm7,0x00
  346. xorps xmm5,xmm13
  347. lea r8,[64+r8]
  348. sub r9,0x40
  349. jnc NEAR $L$mod4_loop
  ; ---- 4-block tail: finish the last group and reduce ---------------------
  350. $L$tail4x:
  351. DB 102,65,15,58,68,199,0 ; pclmulqdq xmm0,xmm15,0x00
  352. DB 102,65,15,58,68,207,17 ; pclmulqdq xmm1,xmm15,0x11
  353. DB 102,68,15,58,68,199,16 ; pclmulqdq xmm8,xmm7,0x10
  354. xorps xmm4,xmm12
  355. xorps xmm0,xmm3
  356. xorps xmm1,xmm5
  357. pxor xmm1,xmm0
  358. pxor xmm8,xmm4
  359. pxor xmm8,xmm1
  360. pxor xmm1,xmm0
  361. movdqa xmm9,xmm8
  362. psrldq xmm8,8
  363. pslldq xmm9,8
  364. pxor xmm1,xmm8
  365. pxor xmm0,xmm9
  ; Reduction, first phase.
  366. movdqa xmm4,xmm0
  367. movdqa xmm3,xmm0
  368. psllq xmm0,5
  369. pxor xmm3,xmm0
  370. psllq xmm0,1
  371. pxor xmm0,xmm3
  372. psllq xmm0,57
  373. movdqa xmm3,xmm0
  374. pslldq xmm0,8
  375. psrldq xmm3,8
  376. pxor xmm0,xmm4
  377. pxor xmm1,xmm3
  ; Reduction, second phase.
  378. movdqa xmm4,xmm0
  379. psrlq xmm0,1
  380. pxor xmm1,xmm4
  381. pxor xmm4,xmm0
  382. psrlq xmm0,5
  383. pxor xmm0,xmm4
  384. psrlq xmm0,1
  385. pxor xmm0,xmm1
  ; Undo the last 0x40 subtraction; zero means no input is left.
  386. add r9,0x40
  387. jz NEAR $L$done
  388. movdqu xmm7,XMMWORD[32+rdx]
  389. sub r9,0x10
  390. jz NEAR $L$odd_tail
  ; ---- 2-block path -------------------------------------------------------
  391. $L$skip4x:
  392. movdqu xmm8,XMMWORD[r8]
  393. movdqu xmm3,XMMWORD[16+r8]
  394. DB 102,69,15,56,0,194 ; pshufb xmm8,xmm10
  395. DB 102,65,15,56,0,218 ; pshufb xmm3,xmm10
  396. pxor xmm0,xmm8
  397. movdqa xmm5,xmm3
  398. pshufd xmm4,xmm3,78
  399. pxor xmm4,xmm3
  400. DB 102,15,58,68,218,0 ; pclmulqdq xmm3,xmm2,0x00
  401. DB 102,15,58,68,234,17 ; pclmulqdq xmm5,xmm2,0x11
  402. DB 102,15,58,68,231,0 ; pclmulqdq xmm4,xmm7,0x00
  403. lea r8,[32+r8]
  404. nop
  405. sub r9,0x20
  406. jbe NEAR $L$even_tail
  407. nop
  408. jmp NEAR $L$mod_loop
  409. ALIGN 32
  ; 2-block main loop: multiplies (accumulator ^ first block) by the
  ; table entry in xmm6 and the second block by xmm2, reduces, and
  ; meanwhile loads and starts the next pair.
  410. $L$mod_loop:
  411. movdqa xmm1,xmm0
  412. movdqa xmm8,xmm4
  413. pshufd xmm4,xmm0,78
  414. pxor xmm4,xmm0
  415. DB 102,15,58,68,198,0 ; pclmulqdq xmm0,xmm6,0x00
  416. DB 102,15,58,68,206,17 ; pclmulqdq xmm1,xmm6,0x11
  417. DB 102,15,58,68,231,16 ; pclmulqdq xmm4,xmm7,0x10
  418. pxor xmm0,xmm3
  419. pxor xmm1,xmm5
  420. movdqu xmm9,XMMWORD[r8]
  421. pxor xmm8,xmm0
  422. DB 102,69,15,56,0,202 ; pshufb xmm9,xmm10
  423. movdqu xmm3,XMMWORD[16+r8]
  424. pxor xmm8,xmm1
  425. pxor xmm1,xmm9
  426. pxor xmm4,xmm8
  427. DB 102,65,15,56,0,218 ; pshufb xmm3,xmm10
  428. movdqa xmm8,xmm4
  429. psrldq xmm8,8
  430. pslldq xmm4,8
  431. pxor xmm1,xmm8
  432. pxor xmm0,xmm4
  433. movdqa xmm5,xmm3
  434. movdqa xmm9,xmm0
  435. movdqa xmm8,xmm0
  436. psllq xmm0,5
  437. pxor xmm8,xmm0
  438. DB 102,15,58,68,218,0 ; pclmulqdq xmm3,xmm2,0x00
  439. psllq xmm0,1
  440. pxor xmm0,xmm8
  441. psllq xmm0,57
  442. movdqa xmm8,xmm0
  443. pslldq xmm0,8
  444. psrldq xmm8,8
  445. pxor xmm0,xmm9
  446. pshufd xmm4,xmm5,78
  447. pxor xmm1,xmm8
  448. pxor xmm4,xmm5
  449. movdqa xmm9,xmm0
  450. psrlq xmm0,1
  451. DB 102,15,58,68,234,17 ; pclmulqdq xmm5,xmm2,0x11
  452. pxor xmm1,xmm9
  453. pxor xmm9,xmm0
  454. psrlq xmm0,5
  455. pxor xmm0,xmm9
  456. lea r8,[32+r8]
  457. psrlq xmm0,1
  458. DB 102,15,58,68,231,0 ; pclmulqdq xmm4,xmm7,0x00
  459. pxor xmm0,xmm1
  460. sub r9,0x20
  461. ja NEAR $L$mod_loop
  ; Finish the last pair of blocks and reduce.
  462. $L$even_tail:
  463. movdqa xmm1,xmm0
  464. movdqa xmm8,xmm4
  465. pshufd xmm4,xmm0,78
  466. pxor xmm4,xmm0
  467. DB 102,15,58,68,198,0 ; pclmulqdq xmm0,xmm6,0x00
  468. DB 102,15,58,68,206,17 ; pclmulqdq xmm1,xmm6,0x11
  469. DB 102,15,58,68,231,16 ; pclmulqdq xmm4,xmm7,0x10
  470. pxor xmm0,xmm3
  471. pxor xmm1,xmm5
  472. pxor xmm8,xmm0
  473. pxor xmm8,xmm1
  474. pxor xmm4,xmm8
  475. movdqa xmm8,xmm4
  476. psrldq xmm8,8
  477. pslldq xmm4,8
  478. pxor xmm1,xmm8
  479. pxor xmm0,xmm4
  480. movdqa xmm4,xmm0
  481. movdqa xmm3,xmm0
  482. psllq xmm0,5
  483. pxor xmm3,xmm0
  484. psllq xmm0,1
  485. pxor xmm0,xmm3
  486. psllq xmm0,57
  487. movdqa xmm3,xmm0
  488. pslldq xmm0,8
  489. psrldq xmm3,8
  490. pxor xmm0,xmm4
  491. pxor xmm1,xmm3
  492. movdqa xmm4,xmm0
  493. psrlq xmm0,1
  494. pxor xmm1,xmm4
  495. pxor xmm4,xmm0
  496. psrlq xmm0,5
  497. pxor xmm0,xmm4
  498. psrlq xmm0,1
  499. pxor xmm0,xmm1
  ; r9 == 0 here means exactly one block is left (the count is biased
  ; by -16); any non-zero value (a borrow from the last sub) means
  ; all input has been consumed.
  500. test r9,r9
  501. jnz NEAR $L$done
  ; ---- single remaining block ----------------------------------------------
  502. $L$odd_tail:
  503. movdqu xmm8,XMMWORD[r8]
  504. DB 102,69,15,56,0,194 ; pshufb xmm8,xmm10
  505. pxor xmm0,xmm8
  506. movdqa xmm1,xmm0
  507. pshufd xmm3,xmm0,78
  508. pxor xmm3,xmm0
  509. DB 102,15,58,68,194,0 ; pclmulqdq xmm0,xmm2,0x00
  510. DB 102,15,58,68,202,17 ; pclmulqdq xmm1,xmm2,0x11
  511. DB 102,15,58,68,223,0 ; pclmulqdq xmm3,xmm7,0x00
  512. pxor xmm3,xmm0
  513. pxor xmm3,xmm1
  514. movdqa xmm4,xmm3
  515. psrldq xmm3,8
  516. pslldq xmm4,8
  517. pxor xmm1,xmm3
  518. pxor xmm0,xmm4
  519. movdqa xmm4,xmm0
  520. movdqa xmm3,xmm0
  521. psllq xmm0,5
  522. pxor xmm3,xmm0
  523. psllq xmm0,1
  524. pxor xmm0,xmm3
  525. psllq xmm0,57
  526. movdqa xmm3,xmm0
  527. pslldq xmm0,8
  528. psrldq xmm3,8
  529. pxor xmm0,xmm4
  530. pxor xmm1,xmm3
  531. movdqa xmm4,xmm0
  532. psrlq xmm0,1
  533. pxor xmm1,xmm4
  534. pxor xmm4,xmm0
  535. psrlq xmm0,5
  536. pxor xmm0,xmm4
  537. psrlq xmm0,1
  538. pxor xmm0,xmm1
  ; Write the accumulator back in memory byte order, restore
  ; xmm6-xmm15 and release the 168-byte frame.
  539. $L$done:
  540. DB 102,65,15,56,0,194 ; pshufb xmm0,xmm10
  541. movdqu XMMWORD[rcx],xmm0
  542. movaps xmm6,XMMWORD[rsp]
  543. movaps xmm7,XMMWORD[16+rsp]
  544. movaps xmm8,XMMWORD[32+rsp]
  545. movaps xmm9,XMMWORD[48+rsp]
  546. movaps xmm10,XMMWORD[64+rsp]
  547. movaps xmm11,XMMWORD[80+rsp]
  548. movaps xmm12,XMMWORD[96+rsp]
  549. movaps xmm13,XMMWORD[112+rsp]
  550. movaps xmm14,XMMWORD[128+rsp]
  551. movaps xmm15,XMMWORD[144+rsp]
  552. lea rsp,[168+rsp]
  553. $L$SEH_end_gcm_ghash_clmul:
  554. DB 0F3h,0C3h ;repret
  ; ---------------------------------------------------------------------
  ; gcm_init_avx
  ;
  ; AVX-encoded counterpart of the table set-up routine.
  ; Registers, as used by the code below:
  ;   rdx -> 16-byte input value (loaded unaligned)
  ;   rcx -> output table; advanced by 48 per pass, four passes,
  ;          so 192 bytes are written in total
  ;   r10 =  pass counter (starts at 4)
  ; Each pass stores two values (xmm5, then xmm5*xmm2 in xmm0) followed by
  ; one entry packing a qword of each value's (high ^ low) fold.
  ; xmm6 is spilled to a 24-byte frame and restored on exit.
  ; NOTE(review): presumably matches
  ;   void gcm_init_avx(u128 Htable[16], const uint64_t H[2])
  ; -- confirm against the BoringSSL GCM header.
  ; ---------------------------------------------------------------------
  555. global gcm_init_avx
  556. ALIGN 32
  557. gcm_init_avx:
  558. $L$SEH_begin_gcm_init_avx:
  559. DB 0x48,0x83,0xec,0x18 ; sub rsp,0x18
  560. DB 0x0f,0x29,0x34,0x24 ; movaps [rsp],xmm6
  561. vzeroupper
  562. vmovdqu xmm2,XMMWORD[rdx]
  563. vpshufd xmm2,xmm2,78 ; swap the two 64-bit halves
  ; 128-bit left shift by one, then conditional XOR with
  ; $L$0x1c2_polynomial when the pre-shift top bit was set.
  564. vpshufd xmm4,xmm2,255
  565. vpsrlq xmm3,xmm2,63
  566. vpsllq xmm2,xmm2,1
  567. vpxor xmm5,xmm5,xmm5
  568. vpcmpgtd xmm5,xmm5,xmm4
  569. vpslldq xmm3,xmm3,8
  570. vpor xmm2,xmm2,xmm3
  571. vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial]
  572. vpxor xmm2,xmm2,xmm5
  ; xmm6 = high ^ low of xmm2, the fixed operand of the middle product.
  573. vpunpckhqdq xmm6,xmm2,xmm2
  574. vmovdqa xmm0,xmm2
  575. vpxor xmm6,xmm6,xmm2
  576. mov r10,4
  577. jmp NEAR $L$init_start_avx
  578. ALIGN 32
  ; Passes 2..4 enter here: first store the packed folds of the previous
  ; pass (rcx was already advanced, hence the -16), then multiply xmm0
  ; by xmm2 once more before falling into the shared pass body.
  579. $L$init_loop_avx:
  580. vpalignr xmm5,xmm4,xmm3,8
  581. vmovdqu XMMWORD[(-16)+rcx],xmm5
  582. vpunpckhqdq xmm3,xmm0,xmm0
  583. vpxor xmm3,xmm3,xmm0
  584. vpclmulqdq xmm1,xmm0,xmm2,0x11
  585. vpclmulqdq xmm0,xmm0,xmm2,0x00
  586. vpclmulqdq xmm3,xmm3,xmm6,0x00
  587. vpxor xmm4,xmm1,xmm0
  588. vpxor xmm3,xmm3,xmm4
  589. vpslldq xmm4,xmm3,8
  590. vpsrldq xmm3,xmm3,8
  591. vpxor xmm0,xmm0,xmm4
  592. vpxor xmm1,xmm1,xmm3
  ; Reduction, first phase: (x<<57) ^ (x<<62) ^ (x<<63) per lane.
  593. vpsllq xmm3,xmm0,57
  594. vpsllq xmm4,xmm0,62
  595. vpxor xmm4,xmm4,xmm3
  596. vpsllq xmm3,xmm0,63
  597. vpxor xmm4,xmm4,xmm3
  598. vpslldq xmm3,xmm4,8
  599. vpsrldq xmm4,xmm4,8
  600. vpxor xmm0,xmm0,xmm3
  601. vpxor xmm1,xmm1,xmm4
  ; Reduction, second phase.
  602. vpsrlq xmm4,xmm0,1
  603. vpxor xmm1,xmm1,xmm0
  604. vpxor xmm0,xmm0,xmm4
  605. vpsrlq xmm4,xmm4,5
  606. vpxor xmm0,xmm0,xmm4
  607. vpsrlq xmm0,xmm0,1
  608. vpxor xmm0,xmm0,xmm1
  ; Shared pass body: xmm5 = current value, xmm0 = current value * xmm2.
  609. $L$init_start_avx:
  610. vmovdqa xmm5,xmm0
  611. vpunpckhqdq xmm3,xmm0,xmm0
  612. vpxor xmm3,xmm3,xmm0
  613. vpclmulqdq xmm1,xmm0,xmm2,0x11
  614. vpclmulqdq xmm0,xmm0,xmm2,0x00
  615. vpclmulqdq xmm3,xmm3,xmm6,0x00
  616. vpxor xmm4,xmm1,xmm0
  617. vpxor xmm3,xmm3,xmm4
  618. vpslldq xmm4,xmm3,8
  619. vpsrldq xmm3,xmm3,8
  620. vpxor xmm0,xmm0,xmm4
  621. vpxor xmm1,xmm1,xmm3
  622. vpsllq xmm3,xmm0,57
  623. vpsllq xmm4,xmm0,62
  624. vpxor xmm4,xmm4,xmm3
  625. vpsllq xmm3,xmm0,63
  626. vpxor xmm4,xmm4,xmm3
  627. vpslldq xmm3,xmm4,8
  628. vpsrldq xmm4,xmm4,8
  629. vpxor xmm0,xmm0,xmm3
  630. vpxor xmm1,xmm1,xmm4
  631. vpsrlq xmm4,xmm0,1
  632. vpxor xmm1,xmm1,xmm0
  633. vpxor xmm0,xmm0,xmm4
  634. vpsrlq xmm4,xmm4,5
  635. vpxor xmm0,xmm0,xmm4
  636. vpsrlq xmm0,xmm0,1
  637. vpxor xmm0,xmm0,xmm1
  ; xmm3/xmm4 = (high ^ low) folds of the two values; store the values.
  638. vpshufd xmm3,xmm5,78
  639. vpshufd xmm4,xmm0,78
  640. vpxor xmm3,xmm3,xmm5
  641. vmovdqu XMMWORD[rcx],xmm5
  642. vpxor xmm4,xmm4,xmm0
  643. vmovdqu XMMWORD[16+rcx],xmm0
  644. lea rcx,[48+rcx]
  645. sub r10,1
  646. jnz NEAR $L$init_loop_avx
  ; Final packed-fold entry. The source operands are in the opposite
  ; order from the in-loop vpalignr, so this last entry has its two
  ; qwords swapped relative to the earlier ones.
  647. vpalignr xmm5,xmm3,xmm4,8
  648. vmovdqu XMMWORD[(-16)+rcx],xmm5
  649. vzeroupper
  650. movaps xmm6,XMMWORD[rsp]
  651. lea rsp,[24+rsp]
  652. $L$SEH_end_gcm_init_avx:
  653. DB 0F3h,0C3h ;repret
  ; gcm_gmult_avx: no separate AVX implementation; this entry point is a
  ; tail jump to $L$_gmult_clmul, so arguments and return are exactly
  ; those of gcm_gmult_clmul.
  654. global gcm_gmult_avx
  655. ALIGN 32
  656. gcm_gmult_avx:
  657. jmp NEAR $L$_gmult_clmul
  ; ---------------------------------------------------------------------
  ; gcm_ghash_avx
  ;
  ; Registers, as used by the code below:
  ;   rcx -> 16-byte accumulator; loaded at entry, stored before return
  ;          (byte-reversed with $L$bswap_mask on both sides)
  ;   rdx -> table of 16-byte entries; advanced by 64 at entry, so table
  ;          offset k is written below as ((k-64))+rdx. Offsets 0..176
  ;          are read, plus the qword at offset 184.
  ;   r8  -> input data, consumed in 16-byte blocks
  ;   r9  =  input length in bytes
  ;   r10 -> $L$0x1c2_polynomial
  ;   xmm10 = running accumulator, xmm13 = byte-swap mask
  ; NOTE(review): the length handling appears to assume r9 is a non-zero
  ; multiple of 16 -- confirm with callers.
  ; NOTE(review): presumably matches
  ;   void gcm_ghash_avx(uint8_t Xi[16], const u128 Htable[16],
  ;                      const uint8_t *in, size_t len)
  ; -- confirm against the BoringSSL GCM header.
  ;
  ; Stack: a 168-byte frame holding xmm6-xmm15 (callee-saved in the
  ; Microsoft x64 convention), restored before returning.
  ;
  ; Code paths:
  ;   r9 >= 0x80 : eight blocks per iteration ($L$oop8x_avx)
  ;   otherwise  : $L$short_avx handles up to seven blocks, walking
  ;                backwards from the end of the remaining input
  ;   $L$tail_avx: final multiply and reduction, shared by both paths
  ; ---------------------------------------------------------------------
  658. global gcm_ghash_avx
  659. ALIGN 32
  660. gcm_ghash_avx:
  661. lea rax,[((-136))+rsp]
  662. $L$SEH_begin_gcm_ghash_avx:
  ; Prologue as raw bytes (fixed length for the unwind data):
  ; allocate the frame through rax and spill xmm6-xmm15.
  663. DB 0x48,0x8d,0x60,0xe0 ; lea rsp,[rax-0x20]
  664. DB 0x0f,0x29,0x70,0xe0 ; movaps [rax-0x20],xmm6
  665. DB 0x0f,0x29,0x78,0xf0 ; movaps [rax-0x10],xmm7
  666. DB 0x44,0x0f,0x29,0x00 ; movaps [rax],xmm8
  667. DB 0x44,0x0f,0x29,0x48,0x10 ; movaps [rax+0x10],xmm9
  668. DB 0x44,0x0f,0x29,0x50,0x20 ; movaps [rax+0x20],xmm10
  669. DB 0x44,0x0f,0x29,0x58,0x30 ; movaps [rax+0x30],xmm11
  670. DB 0x44,0x0f,0x29,0x60,0x40 ; movaps [rax+0x40],xmm12
  671. DB 0x44,0x0f,0x29,0x68,0x50 ; movaps [rax+0x50],xmm13
  672. DB 0x44,0x0f,0x29,0x70,0x60 ; movaps [rax+0x60],xmm14
  673. DB 0x44,0x0f,0x29,0x78,0x70 ; movaps [rax+0x70],xmm15
  674. vzeroupper
  675. vmovdqu xmm10,XMMWORD[rcx]
  676. lea r10,[$L$0x1c2_polynomial]
  677. lea rdx,[64+rdx]
  678. vmovdqu xmm13,XMMWORD[$L$bswap_mask]
  679. vpshufb xmm10,xmm10,xmm13
  680. cmp r9,0x80
  681. jb NEAR $L$short_avx
  682. sub r9,0x80
  ; ---- 8-block set-up: start the products for blocks 7..1 of the first
  ; group (last block first). xmm0/xmm3 collect low products, xmm1/xmm4
  ; high products, xmm2/xmm5 middle products; they are summed lazily.
  683. vmovdqu xmm14,XMMWORD[112+r8]
  684. vmovdqu xmm6,XMMWORD[((0-64))+rdx]
  685. vpshufb xmm14,xmm14,xmm13
  686. vmovdqu xmm7,XMMWORD[((32-64))+rdx]
  687. vpunpckhqdq xmm9,xmm14,xmm14
  688. vmovdqu xmm15,XMMWORD[96+r8]
  689. vpclmulqdq xmm0,xmm14,xmm6,0x00
  690. vpxor xmm9,xmm9,xmm14
  691. vpshufb xmm15,xmm15,xmm13
  692. vpclmulqdq xmm1,xmm14,xmm6,0x11
  693. vmovdqu xmm6,XMMWORD[((16-64))+rdx]
  694. vpunpckhqdq xmm8,xmm15,xmm15
  695. vmovdqu xmm14,XMMWORD[80+r8]
  696. vpclmulqdq xmm2,xmm9,xmm7,0x00
  697. vpxor xmm8,xmm8,xmm15
  698. vpshufb xmm14,xmm14,xmm13
  699. vpclmulqdq xmm3,xmm15,xmm6,0x00
  700. vpunpckhqdq xmm9,xmm14,xmm14
  701. vpclmulqdq xmm4,xmm15,xmm6,0x11
  702. vmovdqu xmm6,XMMWORD[((48-64))+rdx]
  703. vpxor xmm9,xmm9,xmm14
  704. vmovdqu xmm15,XMMWORD[64+r8]
  705. vpclmulqdq xmm5,xmm8,xmm7,0x10
  706. vmovdqu xmm7,XMMWORD[((80-64))+rdx]
  707. vpshufb xmm15,xmm15,xmm13
  708. vpxor xmm3,xmm3,xmm0
  709. vpclmulqdq xmm0,xmm14,xmm6,0x00
  710. vpxor xmm4,xmm4,xmm1
  711. vpunpckhqdq xmm8,xmm15,xmm15
  712. vpclmulqdq xmm1,xmm14,xmm6,0x11
  713. vmovdqu xmm6,XMMWORD[((64-64))+rdx]
  714. vpxor xmm5,xmm5,xmm2
  715. vpclmulqdq xmm2,xmm9,xmm7,0x00
  716. vpxor xmm8,xmm8,xmm15
  717. vmovdqu xmm14,XMMWORD[48+r8]
  718. vpxor xmm0,xmm0,xmm3
  719. vpclmulqdq xmm3,xmm15,xmm6,0x00
  720. vpxor xmm1,xmm1,xmm4
  721. vpshufb xmm14,xmm14,xmm13
  722. vpclmulqdq xmm4,xmm15,xmm6,0x11
  723. vmovdqu xmm6,XMMWORD[((96-64))+rdx]
  724. vpxor xmm2,xmm2,xmm5
  725. vpunpckhqdq xmm9,xmm14,xmm14
  726. vpclmulqdq xmm5,xmm8,xmm7,0x10
  727. vmovdqu xmm7,XMMWORD[((128-64))+rdx]
  728. vpxor xmm9,xmm9,xmm14
  729. vmovdqu xmm15,XMMWORD[32+r8]
  730. vpxor xmm3,xmm3,xmm0
  731. vpclmulqdq xmm0,xmm14,xmm6,0x00
  732. vpxor xmm4,xmm4,xmm1
  733. vpshufb xmm15,xmm15,xmm13
  734. vpclmulqdq xmm1,xmm14,xmm6,0x11
  735. vmovdqu xmm6,XMMWORD[((112-64))+rdx]
  736. vpxor xmm5,xmm5,xmm2
  737. vpunpckhqdq xmm8,xmm15,xmm15
  738. vpclmulqdq xmm2,xmm9,xmm7,0x00
  739. vpxor xmm8,xmm8,xmm15
  740. vmovdqu xmm14,XMMWORD[16+r8]
  741. vpxor xmm0,xmm0,xmm3
  742. vpclmulqdq xmm3,xmm15,xmm6,0x00
  743. vpxor xmm1,xmm1,xmm4
  744. vpshufb xmm14,xmm14,xmm13
  745. vpclmulqdq xmm4,xmm15,xmm6,0x11
  746. vmovdqu xmm6,XMMWORD[((144-64))+rdx]
  747. vpxor xmm2,xmm2,xmm5
  748. vpunpckhqdq xmm9,xmm14,xmm14
  749. vpclmulqdq xmm5,xmm8,xmm7,0x10
  750. vmovdqu xmm7,XMMWORD[((176-64))+rdx]
  751. vpxor xmm9,xmm9,xmm14
  752. vmovdqu xmm15,XMMWORD[r8]
  753. vpxor xmm3,xmm3,xmm0
  754. vpclmulqdq xmm0,xmm14,xmm6,0x00
  755. vpxor xmm4,xmm4,xmm1
  756. vpshufb xmm15,xmm15,xmm13
  757. vpclmulqdq xmm1,xmm14,xmm6,0x11
  758. vmovdqu xmm6,XMMWORD[((160-64))+rdx]
  759. vpxor xmm5,xmm5,xmm2
  760. vpclmulqdq xmm2,xmm9,xmm7,0x10
  761. lea r8,[128+r8]
  762. cmp r9,0x80
  763. jb NEAR $L$tail_avx
  ; More than one full group: fold the accumulator into block 0 now and
  ; enter the steady-state loop.
  764. vpxor xmm15,xmm15,xmm10
  765. sub r9,0x80
  766. jmp NEAR $L$oop8x_avx
  767. ALIGN 32
  ; ---- 8-block main loop: completes the previous group (block 0
  ; product, summation, two-step reduction through [r10]) while starting
  ; the seven leading products of the next group.
  768. $L$oop8x_avx:
  769. vpunpckhqdq xmm8,xmm15,xmm15
  770. vmovdqu xmm14,XMMWORD[112+r8]
  771. vpxor xmm3,xmm3,xmm0
  772. vpxor xmm8,xmm8,xmm15
  773. vpclmulqdq xmm10,xmm15,xmm6,0x00
  774. vpshufb xmm14,xmm14,xmm13
  775. vpxor xmm4,xmm4,xmm1
  776. vpclmulqdq xmm11,xmm15,xmm6,0x11
  777. vmovdqu xmm6,XMMWORD[((0-64))+rdx]
  778. vpunpckhqdq xmm9,xmm14,xmm14
  779. vpxor xmm5,xmm5,xmm2
  780. vpclmulqdq xmm12,xmm8,xmm7,0x00
  781. vmovdqu xmm7,XMMWORD[((32-64))+rdx]
  782. vpxor xmm9,xmm9,xmm14
  783. vmovdqu xmm15,XMMWORD[96+r8]
  784. vpclmulqdq xmm0,xmm14,xmm6,0x00
  785. vpxor xmm10,xmm10,xmm3
  786. vpshufb xmm15,xmm15,xmm13
  787. vpclmulqdq xmm1,xmm14,xmm6,0x11
  788. vxorps xmm11,xmm11,xmm4
  789. vmovdqu xmm6,XMMWORD[((16-64))+rdx]
  790. vpunpckhqdq xmm8,xmm15,xmm15
  791. vpclmulqdq xmm2,xmm9,xmm7,0x00
  792. vpxor xmm12,xmm12,xmm5
  793. vxorps xmm8,xmm8,xmm15
  794. vmovdqu xmm14,XMMWORD[80+r8]
  795. vpxor xmm12,xmm12,xmm10
  796. vpclmulqdq xmm3,xmm15,xmm6,0x00
  797. vpxor xmm12,xmm12,xmm11
  798. vpslldq xmm9,xmm12,8
  799. vpxor xmm3,xmm3,xmm0
  800. vpclmulqdq xmm4,xmm15,xmm6,0x11
  801. vpsrldq xmm12,xmm12,8
  802. vpxor xmm10,xmm10,xmm9
  803. vmovdqu xmm6,XMMWORD[((48-64))+rdx]
  804. vpshufb xmm14,xmm14,xmm13
  805. vxorps xmm11,xmm11,xmm12
  806. vpxor xmm4,xmm4,xmm1
  807. vpunpckhqdq xmm9,xmm14,xmm14
  808. vpclmulqdq xmm5,xmm8,xmm7,0x10
  809. vmovdqu xmm7,XMMWORD[((80-64))+rdx]
  810. vpxor xmm9,xmm9,xmm14
  811. vpxor xmm5,xmm5,xmm2
  812. vmovdqu xmm15,XMMWORD[64+r8]
  813. vpalignr xmm12,xmm10,xmm10,8
  814. vpclmulqdq xmm0,xmm14,xmm6,0x00
  815. vpshufb xmm15,xmm15,xmm13
  816. vpxor xmm0,xmm0,xmm3
  817. vpclmulqdq xmm1,xmm14,xmm6,0x11
  818. vmovdqu xmm6,XMMWORD[((64-64))+rdx]
  819. vpunpckhqdq xmm8,xmm15,xmm15
  820. vpxor xmm1,xmm1,xmm4
  821. vpclmulqdq xmm2,xmm9,xmm7,0x00
  822. vxorps xmm8,xmm8,xmm15
  823. vpxor xmm2,xmm2,xmm5
  824. vmovdqu xmm14,XMMWORD[48+r8]
  825. vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 ; reduction step 1
  826. vpclmulqdq xmm3,xmm15,xmm6,0x00
  827. vpshufb xmm14,xmm14,xmm13
  828. vpxor xmm3,xmm3,xmm0
  829. vpclmulqdq xmm4,xmm15,xmm6,0x11
  830. vmovdqu xmm6,XMMWORD[((96-64))+rdx]
  831. vpunpckhqdq xmm9,xmm14,xmm14
  832. vpxor xmm4,xmm4,xmm1
  833. vpclmulqdq xmm5,xmm8,xmm7,0x10
  834. vmovdqu xmm7,XMMWORD[((128-64))+rdx]
  835. vpxor xmm9,xmm9,xmm14
  836. vpxor xmm5,xmm5,xmm2
  837. vmovdqu xmm15,XMMWORD[32+r8]
  838. vpclmulqdq xmm0,xmm14,xmm6,0x00
  839. vpshufb xmm15,xmm15,xmm13
  840. vpxor xmm0,xmm0,xmm3
  841. vpclmulqdq xmm1,xmm14,xmm6,0x11
  842. vmovdqu xmm6,XMMWORD[((112-64))+rdx]
  843. vpunpckhqdq xmm8,xmm15,xmm15
  844. vpxor xmm1,xmm1,xmm4
  845. vpclmulqdq xmm2,xmm9,xmm7,0x00
  846. vpxor xmm8,xmm8,xmm15
  847. vpxor xmm2,xmm2,xmm5
  848. vxorps xmm10,xmm10,xmm12
  849. vmovdqu xmm14,XMMWORD[16+r8]
  850. vpalignr xmm12,xmm10,xmm10,8
  851. vpclmulqdq xmm3,xmm15,xmm6,0x00
  852. vpshufb xmm14,xmm14,xmm13
  853. vpxor xmm3,xmm3,xmm0
  854. vpclmulqdq xmm4,xmm15,xmm6,0x11
  855. vmovdqu xmm6,XMMWORD[((144-64))+rdx]
  856. vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 ; reduction step 2
  857. vxorps xmm12,xmm12,xmm11
  858. vpunpckhqdq xmm9,xmm14,xmm14
  859. vpxor xmm4,xmm4,xmm1
  860. vpclmulqdq xmm5,xmm8,xmm7,0x10
  861. vmovdqu xmm7,XMMWORD[((176-64))+rdx]
  862. vpxor xmm9,xmm9,xmm14
  863. vpxor xmm5,xmm5,xmm2
  864. vmovdqu xmm15,XMMWORD[r8]
  865. vpclmulqdq xmm0,xmm14,xmm6,0x00
  866. vpshufb xmm15,xmm15,xmm13
  867. vpclmulqdq xmm1,xmm14,xmm6,0x11
  868. vmovdqu xmm6,XMMWORD[((160-64))+rdx]
  ; Fold the freshly reduced accumulator (xmm12 ^ xmm10) into block 0
  ; of the next group.
  869. vpxor xmm15,xmm15,xmm12
  870. vpclmulqdq xmm2,xmm9,xmm7,0x10
  871. vpxor xmm15,xmm15,xmm10
  872. lea r8,[128+r8]
  873. sub r9,0x80
  874. jnc NEAR $L$oop8x_avx
  ; Loop exit: undo the last subtraction. The accumulator is already
  ; folded into xmm15, so skip the XOR at $L$tail_avx.
  875. add r9,0x80
  876. jmp NEAR $L$tail_no_xor_avx
  877. ALIGN 32
  ; ---- short path: 1..7 blocks, processed from the END of the input
  ; backwards. The last block pairs with table offset 0, the one before
  ; it with offset 16, and so on. Packed middle-product entries (offsets
  ; 32, 80, 128) each serve two blocks: the second use shifts the entry
  ; right by 8 bytes first (vpsrldq).
  878. $L$short_avx:
  879. vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8]
  880. lea r8,[r9*1+r8]
  881. vmovdqu xmm6,XMMWORD[((0-64))+rdx]
  882. vmovdqu xmm7,XMMWORD[((32-64))+rdx]
  883. vpshufb xmm15,xmm14,xmm13
  ; Copy xmm0..2 into xmm3..5 so that the next "vpxor xmm3,xmm3,xmm0"
  ; (and the xmm4/xmm5 equivalents) cancels to zero: this clears the
  ; partial sums without needing them to be initialised.
  884. vmovdqa xmm3,xmm0
  885. vmovdqa xmm4,xmm1
  886. vmovdqa xmm5,xmm2
  887. sub r9,0x10
  888. jz NEAR $L$tail_avx
  889. vpunpckhqdq xmm8,xmm15,xmm15
  890. vpxor xmm3,xmm3,xmm0
  891. vpclmulqdq xmm0,xmm15,xmm6,0x00
  892. vpxor xmm8,xmm8,xmm15
  893. vmovdqu xmm14,XMMWORD[((-32))+r8]
  894. vpxor xmm4,xmm4,xmm1
  895. vpclmulqdq xmm1,xmm15,xmm6,0x11
  896. vmovdqu xmm6,XMMWORD[((16-64))+rdx]
  897. vpshufb xmm15,xmm14,xmm13
  898. vpxor xmm5,xmm5,xmm2
  899. vpclmulqdq xmm2,xmm8,xmm7,0x00
  900. vpsrldq xmm7,xmm7,8
  901. sub r9,0x10
  902. jz NEAR $L$tail_avx
  903. vpunpckhqdq xmm8,xmm15,xmm15
  904. vpxor xmm3,xmm3,xmm0
  905. vpclmulqdq xmm0,xmm15,xmm6,0x00
  906. vpxor xmm8,xmm8,xmm15
  907. vmovdqu xmm14,XMMWORD[((-48))+r8]
  908. vpxor xmm4,xmm4,xmm1
  909. vpclmulqdq xmm1,xmm15,xmm6,0x11
  910. vmovdqu xmm6,XMMWORD[((48-64))+rdx]
  911. vpshufb xmm15,xmm14,xmm13
  912. vpxor xmm5,xmm5,xmm2
  913. vpclmulqdq xmm2,xmm8,xmm7,0x00
  914. vmovdqu xmm7,XMMWORD[((80-64))+rdx]
  915. sub r9,0x10
  916. jz NEAR $L$tail_avx
  917. vpunpckhqdq xmm8,xmm15,xmm15
  918. vpxor xmm3,xmm3,xmm0
  919. vpclmulqdq xmm0,xmm15,xmm6,0x00
  920. vpxor xmm8,xmm8,xmm15
  921. vmovdqu xmm14,XMMWORD[((-64))+r8]
  922. vpxor xmm4,xmm4,xmm1
  923. vpclmulqdq xmm1,xmm15,xmm6,0x11
  924. vmovdqu xmm6,XMMWORD[((64-64))+rdx]
  925. vpshufb xmm15,xmm14,xmm13
  926. vpxor xmm5,xmm5,xmm2
  927. vpclmulqdq xmm2,xmm8,xmm7,0x00
  928. vpsrldq xmm7,xmm7,8
  929. sub r9,0x10
  930. jz NEAR $L$tail_avx
  931. vpunpckhqdq xmm8,xmm15,xmm15
  932. vpxor xmm3,xmm3,xmm0
  933. vpclmulqdq xmm0,xmm15,xmm6,0x00
  934. vpxor xmm8,xmm8,xmm15
  935. vmovdqu xmm14,XMMWORD[((-80))+r8]
  936. vpxor xmm4,xmm4,xmm1
  937. vpclmulqdq xmm1,xmm15,xmm6,0x11
  938. vmovdqu xmm6,XMMWORD[((96-64))+rdx]
  939. vpshufb xmm15,xmm14,xmm13
  940. vpxor xmm5,xmm5,xmm2
  941. vpclmulqdq xmm2,xmm8,xmm7,0x00
  942. vmovdqu xmm7,XMMWORD[((128-64))+rdx]
  943. sub r9,0x10
  944. jz NEAR $L$tail_avx
  945. vpunpckhqdq xmm8,xmm15,xmm15
  946. vpxor xmm3,xmm3,xmm0
  947. vpclmulqdq xmm0,xmm15,xmm6,0x00
  948. vpxor xmm8,xmm8,xmm15
  949. vmovdqu xmm14,XMMWORD[((-96))+r8]
  950. vpxor xmm4,xmm4,xmm1
  951. vpclmulqdq xmm1,xmm15,xmm6,0x11
  952. vmovdqu xmm6,XMMWORD[((112-64))+rdx]
  953. vpshufb xmm15,xmm14,xmm13
  954. vpxor xmm5,xmm5,xmm2
  955. vpclmulqdq xmm2,xmm8,xmm7,0x00
  956. vpsrldq xmm7,xmm7,8
  957. sub r9,0x10
  958. jz NEAR $L$tail_avx
  959. vpunpckhqdq xmm8,xmm15,xmm15
  960. vpxor xmm3,xmm3,xmm0
  961. vpclmulqdq xmm0,xmm15,xmm6,0x00
  962. vpxor xmm8,xmm8,xmm15
  963. vmovdqu xmm14,XMMWORD[((-112))+r8]
  964. vpxor xmm4,xmm4,xmm1
  965. vpclmulqdq xmm1,xmm15,xmm6,0x11
  966. vmovdqu xmm6,XMMWORD[((144-64))+rdx]
  967. vpshufb xmm15,xmm14,xmm13
  968. vpxor xmm5,xmm5,xmm2
  969. vpclmulqdq xmm2,xmm8,xmm7,0x00
  ; Only the upper qword of the entry at table offset 176 is needed for
  ; the seventh block, so load it directly into the low half of xmm7.
  970. vmovq xmm7,QWORD[((184-64))+rdx]
  971. sub r9,0x10
  972. jmp NEAR $L$tail_avx
  973. ALIGN 32
  ; ---- tail: fold the accumulator into the earliest pending block,
  ; finish its product, sum all partial products and reduce.
  974. $L$tail_avx:
  975. vpxor xmm15,xmm15,xmm10
  976. $L$tail_no_xor_avx:
  977. vpunpckhqdq xmm8,xmm15,xmm15
  978. vpxor xmm3,xmm3,xmm0
  979. vpclmulqdq xmm0,xmm15,xmm6,0x00
  980. vpxor xmm8,xmm8,xmm15
  981. vpxor xmm4,xmm4,xmm1
  982. vpclmulqdq xmm1,xmm15,xmm6,0x11
  983. vpxor xmm5,xmm5,xmm2
  984. vpclmulqdq xmm2,xmm8,xmm7,0x00
  985. vmovdqu xmm12,XMMWORD[r10]
  ; xmm10 = low sum, xmm11 = high sum, xmm5 = middle term (Karatsuba).
  986. vpxor xmm10,xmm3,xmm0
  987. vpxor xmm11,xmm4,xmm1
  988. vpxor xmm5,xmm5,xmm2
  989. vpxor xmm5,xmm5,xmm10
  990. vpxor xmm5,xmm5,xmm11
  991. vpslldq xmm9,xmm5,8
  992. vpsrldq xmm5,xmm5,8
  993. vpxor xmm10,xmm10,xmm9
  994. vpxor xmm11,xmm11,xmm5
  ; Two-step reduction using a carry-less multiply by the constant in
  ; xmm12 (loaded from [r10]) and a 64-bit rotate of xmm10 each time.
  995. vpclmulqdq xmm9,xmm10,xmm12,0x10
  996. vpalignr xmm10,xmm10,xmm10,8
  997. vpxor xmm10,xmm10,xmm9
  998. vpclmulqdq xmm9,xmm10,xmm12,0x10
  999. vpalignr xmm10,xmm10,xmm10,8
  1000. vpxor xmm10,xmm10,xmm11
  1001. vpxor xmm10,xmm10,xmm9
  ; Leftover input after the 8-block path (fewer than 0x80 bytes) goes
  ; through $L$short_avx.
  1002. cmp r9,0
  1003. jne NEAR $L$short_avx
  ; Write the accumulator back in memory byte order, restore
  ; xmm6-xmm15 and release the 168-byte frame.
  1004. vpshufb xmm10,xmm10,xmm13
  1005. vmovdqu XMMWORD[rcx],xmm10
  1006. vzeroupper
  1007. movaps xmm6,XMMWORD[rsp]
  1008. movaps xmm7,XMMWORD[16+rsp]
  1009. movaps xmm8,XMMWORD[32+rsp]
  1010. movaps xmm9,XMMWORD[48+rsp]
  1011. movaps xmm10,XMMWORD[64+rsp]
  1012. movaps xmm11,XMMWORD[80+rsp]
  1013. movaps xmm12,XMMWORD[96+rsp]
  1014. movaps xmm13,XMMWORD[112+rsp]
  1015. movaps xmm14,XMMWORD[128+rsp]
  1016. movaps xmm15,XMMWORD[144+rsp]
  1017. lea rsp,[168+rsp]
  1018. $L$SEH_end_gcm_ghash_avx:
  1019. DB 0F3h,0C3h ;repret
  ; Read-only constants referenced RIP-relatively by the routines above.
  ; They are emitted in the current (.text) section, 64-byte aligned.
  1020. ALIGN 64
  ; pshufb control that reverses the order of all 16 bytes.
  1021. $L$bswap_mask:
  1022. DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
  ; 128-bit constant with byte 0 = 0x01 and byte 15 = 0xc2, all others
  ; zero; used for the conditional XOR in the init routines and as the
  ; multiplier in the AVX reduction steps.
  1023. $L$0x1c2_polynomial:
  1024. DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
  ; The value 7 in each 64-bit lane; masks the low three bits used as a
  ; pshufb table index in $L$mod4_loop.
  1025. $L$7_mask:
  1026. DD 7,0,7,0
  1027. ALIGN 64
  ; NUL-terminated ASCII identification string:
  ; "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>"
  1028. DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52
  1029. DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
  1030. DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
  1031. DB 114,103,62,0
  1032. ALIGN 64
  ; Windows x64 function table (.pdata). Each function is described by
  ; three image-relative addresses: start label, end label, and the
  ; unwind-info record in .xdata.
  ; Only the four routines with a stack frame are listed; the gmult
  ; entry points have no entry here.
  1033. section .pdata rdata align=4
  1034. ALIGN 4
  ; gcm_init_clmul
  1035. DD $L$SEH_begin_gcm_init_clmul wrt ..imagebase
  1036. DD $L$SEH_end_gcm_init_clmul wrt ..imagebase
  1037. DD $L$SEH_info_gcm_init_clmul wrt ..imagebase
  ; gcm_ghash_clmul
  1038. DD $L$SEH_begin_gcm_ghash_clmul wrt ..imagebase
  1039. DD $L$SEH_end_gcm_ghash_clmul wrt ..imagebase
  1040. DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase
  ; gcm_init_avx: reuses the gcm_init_clmul unwind record (its raw-byte
  ; prologue is byte-for-byte the same).
  1041. DD $L$SEH_begin_gcm_init_avx wrt ..imagebase
  1042. DD $L$SEH_end_gcm_init_avx wrt ..imagebase
  1043. DD $L$SEH_info_gcm_init_clmul wrt ..imagebase
  ; gcm_ghash_avx: reuses the gcm_ghash_clmul unwind record (its
  ; raw-byte prologue is byte-for-byte the same).
  1044. DD $L$SEH_begin_gcm_ghash_avx wrt ..imagebase
  1045. DD $L$SEH_end_gcm_ghash_avx wrt ..imagebase
  1046. DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase
  ; Windows x64 unwind records (.xdata) for the prologues emitted as raw
  ; bytes above.
  ; NOTE(review): the field-by-field reading below follows the Microsoft
  ; x64 UNWIND_INFO / UNWIND_CODE layout -- confirm against that
  ; specification before changing any prologue.
  ; Record header: version/flags, prologue size in bytes, number of
  ; unwind-code slots, frame register.
  ; Code slot: offset in prologue, then (info << 4 | operation).
  1047. section .xdata rdata align=8
  1048. ALIGN 8
  1049. $L$SEH_info_gcm_init_clmul:
  1050. DB 0x01,0x08,0x03,0x00 ; version 1, 8-byte prologue, 3 code slots
  1051. DB 0x08,0x68,0x00,0x00 ; at +8: save xmm6 at [rsp+0] (two slots)
  1052. DB 0x04,0x22,0x00,0x00 ; at +4: small allocation of 24 bytes; last two bytes pad
  1053. $L$SEH_info_gcm_ghash_clmul:
  1054. DB 0x01,0x33,0x16,0x00 ; version 1, 0x33-byte prologue, 0x16 code slots
  1055. DB 0x33,0xf8,0x09,0x00 ; save xmm15 at [rsp+0x90]
  1056. DB 0x2e,0xe8,0x08,0x00 ; save xmm14 at [rsp+0x80]
  1057. DB 0x29,0xd8,0x07,0x00 ; save xmm13 at [rsp+0x70]
  1058. DB 0x24,0xc8,0x06,0x00 ; save xmm12 at [rsp+0x60]
  1059. DB 0x1f,0xb8,0x05,0x00 ; save xmm11 at [rsp+0x50]
  1060. DB 0x1a,0xa8,0x04,0x00 ; save xmm10 at [rsp+0x40]
  1061. DB 0x15,0x98,0x03,0x00 ; save xmm9 at [rsp+0x30]
  1062. DB 0x10,0x88,0x02,0x00 ; save xmm8 at [rsp+0x20]
  1063. DB 0x0c,0x78,0x01,0x00 ; save xmm7 at [rsp+0x10]
  1064. DB 0x08,0x68,0x00,0x00 ; save xmm6 at [rsp+0x00]
  1065. DB 0x04,0x01,0x15,0x00 ; at +4: large allocation, 0x15*8 = 168 bytes