ghash-x86_64.S

// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif
#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text
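// _gcm_init_clmul: precomputes powers of the GHASH key H (read from %rsi) and
// the combined Karatsuba values, and stores them in the key table at %rdi for
// the PCLMULQDQ routines below.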
.globl _gcm_init_clmul
.private_extern _gcm_init_clmul
.p2align 4
_gcm_init_clmul:
L$_init_clmul:
movdqu (%rsi),%xmm2
pshufd $78,%xmm2,%xmm2
pshufd $255,%xmm2,%xmm4
movdqa %xmm2,%xmm3
psllq $1,%xmm2
pxor %xmm5,%xmm5
psrlq $63,%xmm3
pcmpgtd %xmm4,%xmm5
pslldq $8,%xmm3
por %xmm3,%xmm2
pand L$0x1c2_polynomial(%rip),%xmm5
pxor %xmm5,%xmm2
pshufd $78,%xmm2,%xmm6
movdqa %xmm2,%xmm0
pxor %xmm2,%xmm6
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
pshufd $78,%xmm2,%xmm3
pshufd $78,%xmm0,%xmm4
pxor %xmm2,%xmm3
movdqu %xmm2,0(%rdi)
pxor %xmm0,%xmm4
movdqu %xmm0,16(%rdi)
.byte 102,15,58,15,227,8
movdqu %xmm4,32(%rdi)
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm5
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
pshufd $78,%xmm5,%xmm3
pshufd $78,%xmm0,%xmm4
pxor %xmm5,%xmm3
movdqu %xmm5,48(%rdi)
pxor %xmm0,%xmm4
movdqu %xmm0,64(%rdi)
.byte 102,15,58,15,227,8
movdqu %xmm4,80(%rdi)
.byte 0xf3,0xc3
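// _gcm_gmult_clmul: multiplies the hash value at %rdi by H from the key table
// at %rsi in GF(2^128) using carry-less multiplication, reduces it modulo the
// GHASH polynomial, and writes the result back to %rdi.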
.globl _gcm_gmult_clmul
.private_extern _gcm_gmult_clmul
.p2align 4
_gcm_gmult_clmul:
L$_gmult_clmul:
movdqu (%rdi),%xmm0
movdqa L$bswap_mask(%rip),%xmm5
movdqu (%rsi),%xmm2
movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
.byte 102,15,56,0,197
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
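// _gcm_ghash_clmul: folds %rcx bytes of input at %rdx into the hash value at
// %rdi using the key table at %rsi, aggregating up to four blocks per
// iteration when the _OPENSSL_ia32cap_P feature check allows it.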
.globl _gcm_ghash_clmul
.private_extern _gcm_ghash_clmul
.p2align 5
_gcm_ghash_clmul:
L$_ghash_clmul:
movdqa L$bswap_mask(%rip),%xmm10
movdqu (%rdi),%xmm0
movdqu (%rsi),%xmm2
movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194
subq $0x10,%rcx
jz L$odd_tail
movdqu 16(%rsi),%xmm6
leaq _OPENSSL_ia32cap_P(%rip),%rax
movl 4(%rax),%eax
cmpq $0x30,%rcx
jb L$skip4x
andl $71303168,%eax
cmpl $4194304,%eax
je L$skip4x
subq $0x30,%rcx
movq $0xA040608020C0E000,%rax
movdqu 48(%rsi),%xmm14
movdqu 64(%rsi),%xmm15
movdqu 48(%rdx),%xmm3
movdqu 32(%rdx),%xmm11
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
movdqa %xmm3,%xmm5
pshufd $78,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
.byte 102,68,15,58,68,231,16
xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
xorps %xmm12,%xmm4
movdqu 16(%rdx),%xmm11
movdqu 0(%rdx),%xmm8
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
movdqa %xmm11,%xmm13
pshufd $78,%xmm11,%xmm12
pxor %xmm8,%xmm0
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
xorps %xmm11,%xmm3
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
subq $0x40,%rcx
jc L$tail4x
jmp L$mod4_loop
.p2align 5
L$mod4_loop:
.byte 102,65,15,58,68,199,0
xorps %xmm12,%xmm4
movdqu 48(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
xorps %xmm3,%xmm0
movdqu 32(%rdx),%xmm3
movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
pshufd $78,%xmm11,%xmm12
xorps %xmm5,%xmm1
pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
movups 32(%rsi),%xmm7
xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
pshufd $78,%xmm3,%xmm4
pxor %xmm0,%xmm8
movdqa %xmm3,%xmm5
pxor %xmm1,%xmm8
pxor %xmm3,%xmm4
movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
pslldq $8,%xmm8
psrldq $8,%xmm9
pxor %xmm8,%xmm0
movdqa L$7_mask(%rip),%xmm8
pxor %xmm9,%xmm1
.byte 102,76,15,110,200
pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
pxor %xmm0,%xmm9
.byte 102,68,15,58,68,231,0
psllq $57,%xmm9
movdqa %xmm9,%xmm8
pslldq $8,%xmm9
.byte 102,15,58,68,222,0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
movdqu 0(%rdx),%xmm8
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
.byte 102,15,58,68,238,17
xorps %xmm11,%xmm3
movdqu 16(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,15,58,68,231,16
xorps %xmm13,%xmm5
movups 80(%rsi),%xmm7
.byte 102,69,15,56,0,194
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
movdqa %xmm11,%xmm13
pxor %xmm12,%xmm4
pshufd $78,%xmm11,%xmm12
pxor %xmm9,%xmm0
pxor %xmm8,%xmm1
pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
xorps %xmm11,%xmm3
pshufd $78,%xmm0,%xmm8
pxor %xmm0,%xmm8
.byte 102,68,15,58,68,231,0
xorps %xmm13,%xmm5
leaq 64(%rdx),%rdx
subq $0x40,%rcx
jnc L$mod4_loop
L$tail4x:
.byte 102,65,15,58,68,199,0
.byte 102,65,15,58,68,207,17
.byte 102,68,15,58,68,199,16
xorps %xmm12,%xmm4
xorps %xmm3,%xmm0
xorps %xmm5,%xmm1
pxor %xmm0,%xmm1
pxor %xmm4,%xmm8
pxor %xmm1,%xmm8
pxor %xmm0,%xmm1
movdqa %xmm8,%xmm9
psrldq $8,%xmm8
pslldq $8,%xmm9
pxor %xmm8,%xmm1
pxor %xmm9,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
addq $0x40,%rcx
jz L$done
movdqu 32(%rsi),%xmm7
subq $0x10,%rcx
jz L$odd_tail
L$skip4x:
movdqu (%rdx),%xmm8
movdqu 16(%rdx),%xmm3
.byte 102,69,15,56,0,194
.byte 102,65,15,56,0,218
pxor %xmm8,%xmm0
movdqa %xmm3,%xmm5
pshufd $78,%xmm3,%xmm4
pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0
leaq 32(%rdx),%rdx
nop
subq $0x20,%rcx
jbe L$even_tail
nop
jmp L$mod_loop
.p2align 5
L$mod_loop:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
movdqu (%rdx),%xmm9
pxor %xmm0,%xmm8
.byte 102,69,15,56,0,202
movdqu 16(%rdx),%xmm3
pxor %xmm1,%xmm8
pxor %xmm9,%xmm1
pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
movdqa %xmm4,%xmm8
psrldq $8,%xmm8
pslldq $8,%xmm4
pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm3,%xmm5
movdqa %xmm0,%xmm9
movdqa %xmm0,%xmm8
psllq $5,%xmm0
pxor %xmm0,%xmm8
.byte 102,15,58,68,218,0
psllq $1,%xmm0
pxor %xmm8,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm8
pslldq $8,%xmm0
psrldq $8,%xmm8
pxor %xmm9,%xmm0
pshufd $78,%xmm5,%xmm4
pxor %xmm8,%xmm1
pxor %xmm5,%xmm4
movdqa %xmm0,%xmm9
psrlq $1,%xmm0
.byte 102,15,58,68,234,17
pxor %xmm9,%xmm1
pxor %xmm0,%xmm9
psrlq $5,%xmm0
pxor %xmm9,%xmm0
leaq 32(%rdx),%rdx
psrlq $1,%xmm0
.byte 102,15,58,68,231,0
pxor %xmm1,%xmm0
subq $0x20,%rcx
ja L$mod_loop
L$even_tail:
movdqa %xmm0,%xmm1
movdqa %xmm4,%xmm8
pshufd $78,%xmm0,%xmm4
pxor %xmm0,%xmm4
.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16
pxor %xmm3,%xmm0
pxor %xmm5,%xmm1
pxor %xmm0,%xmm8
pxor %xmm1,%xmm8
pxor %xmm8,%xmm4
movdqa %xmm4,%xmm8
psrldq $8,%xmm8
pslldq $8,%xmm4
pxor %xmm8,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
testq %rcx,%rcx
jnz L$done
L$odd_tail:
movdqu (%rdx),%xmm8
.byte 102,69,15,56,0,194
pxor %xmm8,%xmm0
movdqa %xmm0,%xmm1
pshufd $78,%xmm0,%xmm3
pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,223,0
pxor %xmm0,%xmm3
pxor %xmm1,%xmm3
movdqa %xmm3,%xmm4
psrldq $8,%xmm3
pslldq $8,%xmm4
pxor %xmm3,%xmm1
pxor %xmm4,%xmm0
movdqa %xmm0,%xmm4
movdqa %xmm0,%xmm3
psllq $5,%xmm0
pxor %xmm0,%xmm3
psllq $1,%xmm0
pxor %xmm3,%xmm0
psllq $57,%xmm0
movdqa %xmm0,%xmm3
pslldq $8,%xmm0
psrldq $8,%xmm3
pxor %xmm4,%xmm0
pxor %xmm3,%xmm1
movdqa %xmm0,%xmm4
psrlq $1,%xmm0
pxor %xmm4,%xmm1
pxor %xmm0,%xmm4
psrlq $5,%xmm0
pxor %xmm4,%xmm0
psrlq $1,%xmm0
pxor %xmm1,%xmm0
L$done:
.byte 102,65,15,56,0,194
movdqu %xmm0,(%rdi)
.byte 0xf3,0xc3
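// _gcm_init_avx: AVX counterpart of _gcm_init_clmul; precomputes eight powers
// of H (plus the combined Karatsuba values) for the eight-block loop in
// _gcm_ghash_avx.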
.globl _gcm_init_avx
.private_extern _gcm_init_avx
.p2align 5
_gcm_init_avx:
vzeroupper
vmovdqu (%rsi),%xmm2
vpshufd $78,%xmm2,%xmm2
vpshufd $255,%xmm2,%xmm4
vpsrlq $63,%xmm2,%xmm3
vpsllq $1,%xmm2,%xmm2
vpxor %xmm5,%xmm5,%xmm5
vpcmpgtd %xmm4,%xmm5,%xmm5
vpslldq $8,%xmm3,%xmm3
vpor %xmm3,%xmm2,%xmm2
vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm2,%xmm2,%xmm6
vmovdqa %xmm2,%xmm0
vpxor %xmm2,%xmm6,%xmm6
movq $4,%r10
jmp L$init_start_avx
.p2align 5
L$init_loop_avx:
vpalignr $8,%xmm3,%xmm4,%xmm5
vmovdqu %xmm5,-16(%rdi)
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
L$init_start_avx:
vmovdqa %xmm0,%xmm5
vpunpckhqdq %xmm0,%xmm0,%xmm3
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
vpxor %xmm0,%xmm1,%xmm4
vpxor %xmm4,%xmm3,%xmm3
vpslldq $8,%xmm3,%xmm4
vpsrldq $8,%xmm3,%xmm3
vpxor %xmm4,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
vpsllq $57,%xmm0,%xmm3
vpsllq $62,%xmm0,%xmm4
vpxor %xmm3,%xmm4,%xmm4
vpsllq $63,%xmm0,%xmm3
vpxor %xmm3,%xmm4,%xmm4
vpslldq $8,%xmm4,%xmm3
vpsrldq $8,%xmm4,%xmm4
vpxor %xmm3,%xmm0,%xmm0
vpxor %xmm4,%xmm1,%xmm1
vpsrlq $1,%xmm0,%xmm4
vpxor %xmm0,%xmm1,%xmm1
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $5,%xmm4,%xmm4
vpxor %xmm4,%xmm0,%xmm0
vpsrlq $1,%xmm0,%xmm0
vpxor %xmm1,%xmm0,%xmm0
vpshufd $78,%xmm5,%xmm3
vpshufd $78,%xmm0,%xmm4
vpxor %xmm5,%xmm3,%xmm3
vmovdqu %xmm5,0(%rdi)
vpxor %xmm0,%xmm4,%xmm4
vmovdqu %xmm0,16(%rdi)
leaq 48(%rdi),%rdi
subq $1,%r10
jnz L$init_loop_avx
vpalignr $8,%xmm4,%xmm3,%xmm5
vmovdqu %xmm5,-16(%rdi)
vzeroupper
.byte 0xf3,0xc3
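// _gcm_gmult_avx: single-block path is shared with the CLMUL code above.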
.globl _gcm_gmult_avx
.private_extern _gcm_gmult_avx
.p2align 5
_gcm_gmult_avx:
jmp L$_gmult_clmul
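// _gcm_ghash_avx: AVX bulk loop; hashes 128 bytes (eight blocks) per iteration
// and handles shorter or leftover input via L$short_avx and L$tail_avx.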
.globl _gcm_ghash_avx
.private_extern _gcm_ghash_avx
.p2align 5
_gcm_ghash_avx:
vzeroupper
vmovdqu (%rdi),%xmm10
leaq L$0x1c2_polynomial(%rip),%r10
leaq 64(%rsi),%rsi
vmovdqu L$bswap_mask(%rip),%xmm13
vpshufb %xmm13,%xmm10,%xmm10
cmpq $0x80,%rcx
jb L$short_avx
subq $0x80,%rcx
vmovdqu 112(%rdx),%xmm14
vmovdqu 0-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vmovdqu 32-64(%rsi),%xmm7
vpunpckhqdq %xmm14,%xmm14,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm14,%xmm9,%xmm9
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 80(%rdx),%xmm14
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 48-64(%rsi),%xmm6
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 64(%rdx),%xmm15
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 48(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 32(%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vmovdqu 16(%rdx),%xmm14
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm4,%xmm1,%xmm1
vpshufb %xmm13,%xmm14,%xmm14
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpxor %xmm5,%xmm2,%xmm2
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu (%rdx),%xmm15
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm1,%xmm4,%xmm4
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
leaq 128(%rdx),%rdx
cmpq $0x80,%rcx
jb L$tail_avx
vpxor %xmm10,%xmm15,%xmm15
subq $0x80,%rcx
jmp L$oop8x_avx
.p2align 5
L$oop8x_avx:
vpunpckhqdq %xmm15,%xmm15,%xmm8
vmovdqu 112(%rdx),%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpxor %xmm15,%xmm8,%xmm8
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
vmovdqu 0-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
vmovdqu 32-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vmovdqu 96(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpxor %xmm3,%xmm10,%xmm10
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vxorps %xmm4,%xmm11,%xmm11
vmovdqu 16-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm5,%xmm12,%xmm12
vxorps %xmm15,%xmm8,%xmm8
vmovdqu 80(%rdx),%xmm14
vpxor %xmm10,%xmm12,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpxor %xmm11,%xmm12,%xmm12
vpslldq $8,%xmm12,%xmm9
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vpsrldq $8,%xmm12,%xmm12
vpxor %xmm9,%xmm10,%xmm10
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm14
vxorps %xmm12,%xmm11,%xmm11
vpxor %xmm1,%xmm4,%xmm4
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 80-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 64(%rdx),%xmm15
vpalignr $8,%xmm10,%xmm10,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vxorps %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vmovdqu 48(%rdx),%xmm14
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 96-64(%rsi),%xmm6
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 128-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu 32(%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpxor %xmm3,%xmm0,%xmm0
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm4,%xmm1,%xmm1
vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm5,%xmm2,%xmm2
vxorps %xmm12,%xmm10,%xmm10
vmovdqu 16(%rdx),%xmm14
vpalignr $8,%xmm10,%xmm10,%xmm12
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
vpshufb %xmm13,%xmm14,%xmm14
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
vmovdqu 144-64(%rsi),%xmm6
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
vxorps %xmm11,%xmm12,%xmm12
vpunpckhqdq %xmm14,%xmm14,%xmm9
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
vmovdqu 176-64(%rsi),%xmm7
vpxor %xmm14,%xmm9,%xmm9
vpxor %xmm2,%xmm5,%xmm5
vmovdqu (%rdx),%xmm15
vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
vpshufb %xmm13,%xmm15,%xmm15
vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
vmovdqu 160-64(%rsi),%xmm6
vpxor %xmm12,%xmm15,%xmm15
vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
vpxor %xmm10,%xmm15,%xmm15
leaq 128(%rdx),%rdx
subq $0x80,%rcx
jnc L$oop8x_avx
addq $0x80,%rcx
jmp L$tail_no_xor_avx
.p2align 5
L$short_avx:
vmovdqu -16(%rdx,%rcx,1),%xmm14
leaq (%rdx,%rcx,1),%rdx
vmovdqu 0-64(%rsi),%xmm6
vmovdqu 32-64(%rsi),%xmm7
vpshufb %xmm13,%xmm14,%xmm15
vmovdqa %xmm0,%xmm3
vmovdqa %xmm1,%xmm4
vmovdqa %xmm2,%xmm5
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -32(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 16-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -48(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 48-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 80-64(%rsi),%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -64(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 64-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -80(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 96-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu 128-64(%rsi),%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -96(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 112-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vpsrldq $8,%xmm7,%xmm7
subq $0x10,%rcx
jz L$tail_avx
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vmovdqu -112(%rdx),%xmm14
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vmovdqu 144-64(%rsi),%xmm6
vpshufb %xmm13,%xmm14,%xmm15
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovq 184-64(%rsi),%xmm7
subq $0x10,%rcx
jmp L$tail_avx
.p2align 5
L$tail_avx:
vpxor %xmm10,%xmm15,%xmm15
L$tail_no_xor_avx:
vpunpckhqdq %xmm15,%xmm15,%xmm8
vpxor %xmm0,%xmm3,%xmm3
vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
vpxor %xmm15,%xmm8,%xmm8
vpxor %xmm1,%xmm4,%xmm4
vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
vpxor %xmm2,%xmm5,%xmm5
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
vmovdqu (%r10),%xmm12
vpxor %xmm0,%xmm3,%xmm10
vpxor %xmm1,%xmm4,%xmm11
vpxor %xmm2,%xmm5,%xmm5
vpxor %xmm10,%xmm5,%xmm5
vpxor %xmm11,%xmm5,%xmm5
vpslldq $8,%xmm5,%xmm9
vpsrldq $8,%xmm5,%xmm5
vpxor %xmm9,%xmm10,%xmm10
vpxor %xmm5,%xmm11,%xmm11
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
vpalignr $8,%xmm10,%xmm10,%xmm10
vpxor %xmm11,%xmm10,%xmm10
vpxor %xmm9,%xmm10,%xmm10
cmpq $0,%rcx
jne L$short_avx
vpshufb %xmm13,%xmm10,%xmm10
vmovdqu %xmm10,(%rdi)
vzeroupper
.byte 0xf3,0xc3
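// Constant pool: the byte-swap mask, the 0xc2...01 reduction polynomial, the
// seven-mask used by the 4x reduction, and the CRYPTOGAMS credit string.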
.p2align 6
L$bswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
L$0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
L$7_mask:
.long 7,0,7,0
.p2align 6
.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
#endif