@ chacha-armv4.S
@ NOTE(review): a scraped-page banner ("chacha-armv4.S 29 KB") and a fused run
@ of viewer line numbers stood here; they were not part of the assembly source
@ and have been removed so the file can assemble.
  1. // This file is generated from a similarly-named Perl script in the BoringSSL
  2. // source tree. Do not edit by hand.
  3. #if !defined(__has_feature)
  4. #define __has_feature(x) 0
  5. #endif
  6. #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
  7. #define OPENSSL_NO_ASM
  8. #endif
  9. #if !defined(OPENSSL_NO_ASM)
  10. #if defined(BORINGSSL_PREFIX)
  11. #include <boringssl_prefix_symbols_asm.h>
  12. #endif
  13. #include <openssl/arm_arch.h>
  14. @ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
  15. @ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
  16. .text
  17. #if defined(__thumb2__) || defined(__clang__)
  18. .syntax unified
  19. #endif
  20. #if defined(__thumb2__)
  21. .thumb
  22. #else
  23. .code 32
  24. #endif
  25. #if defined(__thumb2__) || defined(__clang__)
  26. #define ldrhsb ldrbhs
  27. #endif
  28. .align 5
  29. Lsigma:
  30. .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
  31. Lone:
  32. .long 1,0,0,0
  33. #if __ARM_MAX_ARCH__>=7
  34. LOPENSSL_armcap:
  35. .word OPENSSL_armcap_P-LChaCha20_ctr32
  36. #else
  37. .word -1
  38. #endif
  39. .globl _ChaCha20_ctr32
  40. .private_extern _ChaCha20_ctr32
  41. #ifdef __thumb2__
  42. .thumb_func _ChaCha20_ctr32
  43. #endif
  44. .align 5
  45. _ChaCha20_ctr32:
  46. LChaCha20_ctr32:
  47. ldr r12,[sp,#0] @ pull pointer to counter and nonce
  48. stmdb sp!,{r0,r1,r2,r4-r11,lr}
  49. #if __ARM_ARCH__<7 && !defined(__thumb2__)
  50. sub r14,pc,#16 @ _ChaCha20_ctr32
  51. #else
  52. adr r14,LChaCha20_ctr32
  53. #endif
  54. cmp r2,#0 @ len==0?
  55. #ifdef __thumb2__
  56. itt eq
  57. #endif
  58. addeq sp,sp,#4*3
  59. beq Lno_data
  60. #if __ARM_MAX_ARCH__>=7
  61. cmp r2,#192 @ test len
  62. bls Lshort
  63. ldr r4,[r14,#-32]
  64. ldr r4,[r14,r4]
  65. # ifdef __APPLE__
  66. ldr r4,[r4]
  67. # endif
  68. tst r4,#ARMV7_NEON
  69. bne LChaCha20_neon
  70. Lshort:
  71. #endif
  72. ldmia r12,{r4,r5,r6,r7} @ load counter and nonce
  73. sub sp,sp,#4*(16) @ off-load area
  74. sub r14,r14,#64 @ Lsigma
  75. stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce
  76. ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
  77. ldmia r14,{r0,r1,r2,r3} @ load sigma
  78. stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key
  79. stmdb sp!,{r0,r1,r2,r3} @ copy sigma
  80. str r10,[sp,#4*(16+10)] @ off-load "rx"
  81. str r11,[sp,#4*(16+11)] @ off-load "rx"
  82. b Loop_outer_enter
  83. .align 4
  84. Loop_outer:
  85. ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
  86. str r11,[sp,#4*(32+2)] @ save len
  87. str r12, [sp,#4*(32+1)] @ save inp
  88. str r14, [sp,#4*(32+0)] @ save out
  89. Loop_outer_enter:
  90. ldr r11, [sp,#4*(15)]
  91. ldr r12,[sp,#4*(12)] @ modulo-scheduled load
  92. ldr r10, [sp,#4*(13)]
  93. ldr r14,[sp,#4*(14)]
  94. str r11, [sp,#4*(16+15)]
  95. mov r11,#10
  96. b Loop
  97. .align 4
  98. Loop:
  99. subs r11,r11,#1
  100. add r0,r0,r4
  101. mov r12,r12,ror#16
  102. add r1,r1,r5
  103. mov r10,r10,ror#16
  104. eor r12,r12,r0,ror#16
  105. eor r10,r10,r1,ror#16
  106. add r8,r8,r12
  107. mov r4,r4,ror#20
  108. add r9,r9,r10
  109. mov r5,r5,ror#20
  110. eor r4,r4,r8,ror#20
  111. eor r5,r5,r9,ror#20
  112. add r0,r0,r4
  113. mov r12,r12,ror#24
  114. add r1,r1,r5
  115. mov r10,r10,ror#24
  116. eor r12,r12,r0,ror#24
  117. eor r10,r10,r1,ror#24
  118. add r8,r8,r12
  119. mov r4,r4,ror#25
  120. add r9,r9,r10
  121. mov r5,r5,ror#25
  122. str r10,[sp,#4*(16+13)]
  123. ldr r10,[sp,#4*(16+15)]
  124. eor r4,r4,r8,ror#25
  125. eor r5,r5,r9,ror#25
  126. str r8,[sp,#4*(16+8)]
  127. ldr r8,[sp,#4*(16+10)]
  128. add r2,r2,r6
  129. mov r14,r14,ror#16
  130. str r9,[sp,#4*(16+9)]
  131. ldr r9,[sp,#4*(16+11)]
  132. add r3,r3,r7
  133. mov r10,r10,ror#16
  134. eor r14,r14,r2,ror#16
  135. eor r10,r10,r3,ror#16
  136. add r8,r8,r14
  137. mov r6,r6,ror#20
  138. add r9,r9,r10
  139. mov r7,r7,ror#20
  140. eor r6,r6,r8,ror#20
  141. eor r7,r7,r9,ror#20
  142. add r2,r2,r6
  143. mov r14,r14,ror#24
  144. add r3,r3,r7
  145. mov r10,r10,ror#24
  146. eor r14,r14,r2,ror#24
  147. eor r10,r10,r3,ror#24
  148. add r8,r8,r14
  149. mov r6,r6,ror#25
  150. add r9,r9,r10
  151. mov r7,r7,ror#25
  152. eor r6,r6,r8,ror#25
  153. eor r7,r7,r9,ror#25
  154. add r0,r0,r5
  155. mov r10,r10,ror#16
  156. add r1,r1,r6
  157. mov r12,r12,ror#16
  158. eor r10,r10,r0,ror#16
  159. eor r12,r12,r1,ror#16
  160. add r8,r8,r10
  161. mov r5,r5,ror#20
  162. add r9,r9,r12
  163. mov r6,r6,ror#20
  164. eor r5,r5,r8,ror#20
  165. eor r6,r6,r9,ror#20
  166. add r0,r0,r5
  167. mov r10,r10,ror#24
  168. add r1,r1,r6
  169. mov r12,r12,ror#24
  170. eor r10,r10,r0,ror#24
  171. eor r12,r12,r1,ror#24
  172. add r8,r8,r10
  173. mov r5,r5,ror#25
  174. str r10,[sp,#4*(16+15)]
  175. ldr r10,[sp,#4*(16+13)]
  176. add r9,r9,r12
  177. mov r6,r6,ror#25
  178. eor r5,r5,r8,ror#25
  179. eor r6,r6,r9,ror#25
  180. str r8,[sp,#4*(16+10)]
  181. ldr r8,[sp,#4*(16+8)]
  182. add r2,r2,r7
  183. mov r10,r10,ror#16
  184. str r9,[sp,#4*(16+11)]
  185. ldr r9,[sp,#4*(16+9)]
  186. add r3,r3,r4
  187. mov r14,r14,ror#16
  188. eor r10,r10,r2,ror#16
  189. eor r14,r14,r3,ror#16
  190. add r8,r8,r10
  191. mov r7,r7,ror#20
  192. add r9,r9,r14
  193. mov r4,r4,ror#20
  194. eor r7,r7,r8,ror#20
  195. eor r4,r4,r9,ror#20
  196. add r2,r2,r7
  197. mov r10,r10,ror#24
  198. add r3,r3,r4
  199. mov r14,r14,ror#24
  200. eor r10,r10,r2,ror#24
  201. eor r14,r14,r3,ror#24
  202. add r8,r8,r10
  203. mov r7,r7,ror#25
  204. add r9,r9,r14
  205. mov r4,r4,ror#25
  206. eor r7,r7,r8,ror#25
  207. eor r4,r4,r9,ror#25
  208. bne Loop
  209. ldr r11,[sp,#4*(32+2)] @ load len
  210. str r8, [sp,#4*(16+8)] @ modulo-scheduled store
  211. str r9, [sp,#4*(16+9)]
  212. str r12,[sp,#4*(16+12)]
  213. str r10, [sp,#4*(16+13)]
  214. str r14,[sp,#4*(16+14)]
  215. @ at this point we have first half of 512-bit result in
  216. @ rx and second half at sp+4*(16+8)
  217. cmp r11,#64 @ done yet?
  218. #ifdef __thumb2__
  219. itete lo
  220. #endif
  221. addlo r12,sp,#4*(0) @ shortcut or ...
  222. ldrhs r12,[sp,#4*(32+1)] @ ... load inp
  223. addlo r14,sp,#4*(0) @ shortcut or ...
  224. ldrhs r14,[sp,#4*(32+0)] @ ... load out
  225. ldr r8,[sp,#4*(0)] @ load key material
  226. ldr r9,[sp,#4*(1)]
  227. #if __ARM_ARCH__>=6 || !defined(__ARMEB__)
  228. # if __ARM_ARCH__<7
  229. orr r10,r12,r14
  230. tst r10,#3 @ are input and output aligned?
  231. ldr r10,[sp,#4*(2)]
  232. bne Lunaligned
  233. cmp r11,#64 @ restore flags
  234. # else
  235. ldr r10,[sp,#4*(2)]
  236. # endif
  237. ldr r11,[sp,#4*(3)]
  238. add r0,r0,r8 @ accumulate key material
  239. add r1,r1,r9
  240. # ifdef __thumb2__
  241. itt hs
  242. # endif
  243. ldrhs r8,[r12],#16 @ load input
  244. ldrhs r9,[r12,#-12]
  245. add r2,r2,r10
  246. add r3,r3,r11
  247. # ifdef __thumb2__
  248. itt hs
  249. # endif
  250. ldrhs r10,[r12,#-8]
  251. ldrhs r11,[r12,#-4]
  252. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  253. rev r0,r0
  254. rev r1,r1
  255. rev r2,r2
  256. rev r3,r3
  257. # endif
  258. # ifdef __thumb2__
  259. itt hs
  260. # endif
  261. eorhs r0,r0,r8 @ xor with input
  262. eorhs r1,r1,r9
  263. add r8,sp,#4*(4)
  264. str r0,[r14],#16 @ store output
  265. # ifdef __thumb2__
  266. itt hs
  267. # endif
  268. eorhs r2,r2,r10
  269. eorhs r3,r3,r11
  270. ldmia r8,{r8,r9,r10,r11} @ load key material
  271. str r1,[r14,#-12]
  272. str r2,[r14,#-8]
  273. str r3,[r14,#-4]
  274. add r4,r4,r8 @ accumulate key material
  275. add r5,r5,r9
  276. # ifdef __thumb2__
  277. itt hs
  278. # endif
  279. ldrhs r8,[r12],#16 @ load input
  280. ldrhs r9,[r12,#-12]
  281. add r6,r6,r10
  282. add r7,r7,r11
  283. # ifdef __thumb2__
  284. itt hs
  285. # endif
  286. ldrhs r10,[r12,#-8]
  287. ldrhs r11,[r12,#-4]
  288. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  289. rev r4,r4
  290. rev r5,r5
  291. rev r6,r6
  292. rev r7,r7
  293. # endif
  294. # ifdef __thumb2__
  295. itt hs
  296. # endif
  297. eorhs r4,r4,r8
  298. eorhs r5,r5,r9
  299. add r8,sp,#4*(8)
  300. str r4,[r14],#16 @ store output
  301. # ifdef __thumb2__
  302. itt hs
  303. # endif
  304. eorhs r6,r6,r10
  305. eorhs r7,r7,r11
  306. str r5,[r14,#-12]
  307. ldmia r8,{r8,r9,r10,r11} @ load key material
  308. str r6,[r14,#-8]
  309. add r0,sp,#4*(16+8)
  310. str r7,[r14,#-4]
  311. ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
  312. add r0,r0,r8 @ accumulate key material
  313. add r1,r1,r9
  314. # ifdef __thumb2__
  315. itt hs
  316. # endif
  317. ldrhs r8,[r12],#16 @ load input
  318. ldrhs r9,[r12,#-12]
  319. # ifdef __thumb2__
  320. itt hi
  321. # endif
  322. strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
  323. strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
  324. add r2,r2,r10
  325. add r3,r3,r11
  326. # ifdef __thumb2__
  327. itt hs
  328. # endif
  329. ldrhs r10,[r12,#-8]
  330. ldrhs r11,[r12,#-4]
  331. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  332. rev r0,r0
  333. rev r1,r1
  334. rev r2,r2
  335. rev r3,r3
  336. # endif
  337. # ifdef __thumb2__
  338. itt hs
  339. # endif
  340. eorhs r0,r0,r8
  341. eorhs r1,r1,r9
  342. add r8,sp,#4*(12)
  343. str r0,[r14],#16 @ store output
  344. # ifdef __thumb2__
  345. itt hs
  346. # endif
  347. eorhs r2,r2,r10
  348. eorhs r3,r3,r11
  349. str r1,[r14,#-12]
  350. ldmia r8,{r8,r9,r10,r11} @ load key material
  351. str r2,[r14,#-8]
  352. str r3,[r14,#-4]
  353. add r4,r4,r8 @ accumulate key material
  354. add r5,r5,r9
  355. # ifdef __thumb2__
  356. itt hi
  357. # endif
  358. addhi r8,r8,#1 @ next counter value
  359. strhi r8,[sp,#4*(12)] @ save next counter value
  360. # ifdef __thumb2__
  361. itt hs
  362. # endif
  363. ldrhs r8,[r12],#16 @ load input
  364. ldrhs r9,[r12,#-12]
  365. add r6,r6,r10
  366. add r7,r7,r11
  367. # ifdef __thumb2__
  368. itt hs
  369. # endif
  370. ldrhs r10,[r12,#-8]
  371. ldrhs r11,[r12,#-4]
  372. # if __ARM_ARCH__>=6 && defined(__ARMEB__)
  373. rev r4,r4
  374. rev r5,r5
  375. rev r6,r6
  376. rev r7,r7
  377. # endif
  378. # ifdef __thumb2__
  379. itt hs
  380. # endif
  381. eorhs r4,r4,r8
  382. eorhs r5,r5,r9
  383. # ifdef __thumb2__
  384. it ne
  385. # endif
  386. ldrne r8,[sp,#4*(32+2)] @ re-load len
  387. # ifdef __thumb2__
  388. itt hs
  389. # endif
  390. eorhs r6,r6,r10
  391. eorhs r7,r7,r11
  392. str r4,[r14],#16 @ store output
  393. str r5,[r14,#-12]
  394. # ifdef __thumb2__
  395. it hs
  396. # endif
  397. subhs r11,r8,#64 @ len-=64
  398. str r6,[r14,#-8]
  399. str r7,[r14,#-4]
  400. bhi Loop_outer
  401. beq Ldone
  402. # if __ARM_ARCH__<7
  403. b Ltail
  404. .align 4
  405. Lunaligned:@ unaligned endian-neutral path
  406. cmp r11,#64 @ restore flags
  407. # endif
  408. #endif
  409. #if __ARM_ARCH__<7
  410. ldr r11,[sp,#4*(3)]
  411. add r0,r0,r8 @ accumulate key material
  412. add r1,r1,r9
  413. add r2,r2,r10
  414. # ifdef __thumb2__
  415. itete lo
  416. # endif
  417. eorlo r8,r8,r8 @ zero or ...
  418. ldrhsb r8,[r12],#16 @ ... load input
  419. eorlo r9,r9,r9
  420. ldrhsb r9,[r12,#-12]
  421. add r3,r3,r11
  422. # ifdef __thumb2__
  423. itete lo
  424. # endif
  425. eorlo r10,r10,r10
  426. ldrhsb r10,[r12,#-8]
  427. eorlo r11,r11,r11
  428. ldrhsb r11,[r12,#-4]
  429. eor r0,r8,r0 @ xor with input (or zero)
  430. eor r1,r9,r1
  431. # ifdef __thumb2__
  432. itt hs
  433. # endif
  434. ldrhsb r8,[r12,#-15] @ load more input
  435. ldrhsb r9,[r12,#-11]
  436. eor r2,r10,r2
  437. strb r0,[r14],#16 @ store output
  438. eor r3,r11,r3
  439. # ifdef __thumb2__
  440. itt hs
  441. # endif
  442. ldrhsb r10,[r12,#-7]
  443. ldrhsb r11,[r12,#-3]
  444. strb r1,[r14,#-12]
  445. eor r0,r8,r0,lsr#8
  446. strb r2,[r14,#-8]
  447. eor r1,r9,r1,lsr#8
  448. # ifdef __thumb2__
  449. itt hs
  450. # endif
  451. ldrhsb r8,[r12,#-14] @ load more input
  452. ldrhsb r9,[r12,#-10]
  453. strb r3,[r14,#-4]
  454. eor r2,r10,r2,lsr#8
  455. strb r0,[r14,#-15]
  456. eor r3,r11,r3,lsr#8
  457. # ifdef __thumb2__
  458. itt hs
  459. # endif
  460. ldrhsb r10,[r12,#-6]
  461. ldrhsb r11,[r12,#-2]
  462. strb r1,[r14,#-11]
  463. eor r0,r8,r0,lsr#8
  464. strb r2,[r14,#-7]
  465. eor r1,r9,r1,lsr#8
  466. # ifdef __thumb2__
  467. itt hs
  468. # endif
  469. ldrhsb r8,[r12,#-13] @ load more input
  470. ldrhsb r9,[r12,#-9]
  471. strb r3,[r14,#-3]
  472. eor r2,r10,r2,lsr#8
  473. strb r0,[r14,#-14]
  474. eor r3,r11,r3,lsr#8
  475. # ifdef __thumb2__
  476. itt hs
  477. # endif
  478. ldrhsb r10,[r12,#-5]
  479. ldrhsb r11,[r12,#-1]
  480. strb r1,[r14,#-10]
  481. strb r2,[r14,#-6]
  482. eor r0,r8,r0,lsr#8
  483. strb r3,[r14,#-2]
  484. eor r1,r9,r1,lsr#8
  485. strb r0,[r14,#-13]
  486. eor r2,r10,r2,lsr#8
  487. strb r1,[r14,#-9]
  488. eor r3,r11,r3,lsr#8
  489. strb r2,[r14,#-5]
  490. strb r3,[r14,#-1]
  491. add r8,sp,#4*(4+0)
  492. ldmia r8,{r8,r9,r10,r11} @ load key material
  493. add r0,sp,#4*(16+8)
  494. add r4,r4,r8 @ accumulate key material
  495. add r5,r5,r9
  496. add r6,r6,r10
  497. # ifdef __thumb2__
  498. itete lo
  499. # endif
  500. eorlo r8,r8,r8 @ zero or ...
  501. ldrhsb r8,[r12],#16 @ ... load input
  502. eorlo r9,r9,r9
  503. ldrhsb r9,[r12,#-12]
  504. add r7,r7,r11
  505. # ifdef __thumb2__
  506. itete lo
  507. # endif
  508. eorlo r10,r10,r10
  509. ldrhsb r10,[r12,#-8]
  510. eorlo r11,r11,r11
  511. ldrhsb r11,[r12,#-4]
  512. eor r4,r8,r4 @ xor with input (or zero)
  513. eor r5,r9,r5
  514. # ifdef __thumb2__
  515. itt hs
  516. # endif
  517. ldrhsb r8,[r12,#-15] @ load more input
  518. ldrhsb r9,[r12,#-11]
  519. eor r6,r10,r6
  520. strb r4,[r14],#16 @ store output
  521. eor r7,r11,r7
  522. # ifdef __thumb2__
  523. itt hs
  524. # endif
  525. ldrhsb r10,[r12,#-7]
  526. ldrhsb r11,[r12,#-3]
  527. strb r5,[r14,#-12]
  528. eor r4,r8,r4,lsr#8
  529. strb r6,[r14,#-8]
  530. eor r5,r9,r5,lsr#8
  531. # ifdef __thumb2__
  532. itt hs
  533. # endif
  534. ldrhsb r8,[r12,#-14] @ load more input
  535. ldrhsb r9,[r12,#-10]
  536. strb r7,[r14,#-4]
  537. eor r6,r10,r6,lsr#8
  538. strb r4,[r14,#-15]
  539. eor r7,r11,r7,lsr#8
  540. # ifdef __thumb2__
  541. itt hs
  542. # endif
  543. ldrhsb r10,[r12,#-6]
  544. ldrhsb r11,[r12,#-2]
  545. strb r5,[r14,#-11]
  546. eor r4,r8,r4,lsr#8
  547. strb r6,[r14,#-7]
  548. eor r5,r9,r5,lsr#8
  549. # ifdef __thumb2__
  550. itt hs
  551. # endif
  552. ldrhsb r8,[r12,#-13] @ load more input
  553. ldrhsb r9,[r12,#-9]
  554. strb r7,[r14,#-3]
  555. eor r6,r10,r6,lsr#8
  556. strb r4,[r14,#-14]
  557. eor r7,r11,r7,lsr#8
  558. # ifdef __thumb2__
  559. itt hs
  560. # endif
  561. ldrhsb r10,[r12,#-5]
  562. ldrhsb r11,[r12,#-1]
  563. strb r5,[r14,#-10]
  564. strb r6,[r14,#-6]
  565. eor r4,r8,r4,lsr#8
  566. strb r7,[r14,#-2]
  567. eor r5,r9,r5,lsr#8
  568. strb r4,[r14,#-13]
  569. eor r6,r10,r6,lsr#8
  570. strb r5,[r14,#-9]
  571. eor r7,r11,r7,lsr#8
  572. strb r6,[r14,#-5]
  573. strb r7,[r14,#-1]
  574. add r8,sp,#4*(4+4)
  575. ldmia r8,{r8,r9,r10,r11} @ load key material
  576. ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
  577. # ifdef __thumb2__
  578. itt hi
  579. # endif
  580. strhi r10,[sp,#4*(16+10)] @ copy "rx"
  581. strhi r11,[sp,#4*(16+11)] @ copy "rx"
  582. add r0,r0,r8 @ accumulate key material
  583. add r1,r1,r9
  584. add r2,r2,r10
  585. # ifdef __thumb2__
  586. itete lo
  587. # endif
  588. eorlo r8,r8,r8 @ zero or ...
  589. ldrhsb r8,[r12],#16 @ ... load input
  590. eorlo r9,r9,r9
  591. ldrhsb r9,[r12,#-12]
  592. add r3,r3,r11
  593. # ifdef __thumb2__
  594. itete lo
  595. # endif
  596. eorlo r10,r10,r10
  597. ldrhsb r10,[r12,#-8]
  598. eorlo r11,r11,r11
  599. ldrhsb r11,[r12,#-4]
  600. eor r0,r8,r0 @ xor with input (or zero)
  601. eor r1,r9,r1
  602. # ifdef __thumb2__
  603. itt hs
  604. # endif
  605. ldrhsb r8,[r12,#-15] @ load more input
  606. ldrhsb r9,[r12,#-11]
  607. eor r2,r10,r2
  608. strb r0,[r14],#16 @ store output
  609. eor r3,r11,r3
  610. # ifdef __thumb2__
  611. itt hs
  612. # endif
  613. ldrhsb r10,[r12,#-7]
  614. ldrhsb r11,[r12,#-3]
  615. strb r1,[r14,#-12]
  616. eor r0,r8,r0,lsr#8
  617. strb r2,[r14,#-8]
  618. eor r1,r9,r1,lsr#8
  619. # ifdef __thumb2__
  620. itt hs
  621. # endif
  622. ldrhsb r8,[r12,#-14] @ load more input
  623. ldrhsb r9,[r12,#-10]
  624. strb r3,[r14,#-4]
  625. eor r2,r10,r2,lsr#8
  626. strb r0,[r14,#-15]
  627. eor r3,r11,r3,lsr#8
  628. # ifdef __thumb2__
  629. itt hs
  630. # endif
  631. ldrhsb r10,[r12,#-6]
  632. ldrhsb r11,[r12,#-2]
  633. strb r1,[r14,#-11]
  634. eor r0,r8,r0,lsr#8
  635. strb r2,[r14,#-7]
  636. eor r1,r9,r1,lsr#8
  637. # ifdef __thumb2__
  638. itt hs
  639. # endif
  640. ldrhsb r8,[r12,#-13] @ load more input
  641. ldrhsb r9,[r12,#-9]
  642. strb r3,[r14,#-3]
  643. eor r2,r10,r2,lsr#8
  644. strb r0,[r14,#-14]
  645. eor r3,r11,r3,lsr#8
  646. # ifdef __thumb2__
  647. itt hs
  648. # endif
  649. ldrhsb r10,[r12,#-5]
  650. ldrhsb r11,[r12,#-1]
  651. strb r1,[r14,#-10]
  652. strb r2,[r14,#-6]
  653. eor r0,r8,r0,lsr#8
  654. strb r3,[r14,#-2]
  655. eor r1,r9,r1,lsr#8
  656. strb r0,[r14,#-13]
  657. eor r2,r10,r2,lsr#8
  658. strb r1,[r14,#-9]
  659. eor r3,r11,r3,lsr#8
  660. strb r2,[r14,#-5]
  661. strb r3,[r14,#-1]
  662. add r8,sp,#4*(4+8)
  663. ldmia r8,{r8,r9,r10,r11} @ load key material
  664. add r4,r4,r8 @ accumulate key material
  665. # ifdef __thumb2__
  666. itt hi
  667. # endif
  668. addhi r8,r8,#1 @ next counter value
  669. strhi r8,[sp,#4*(12)] @ save next counter value
  670. add r5,r5,r9
  671. add r6,r6,r10
  672. # ifdef __thumb2__
  673. itete lo
  674. # endif
  675. eorlo r8,r8,r8 @ zero or ...
  676. ldrhsb r8,[r12],#16 @ ... load input
  677. eorlo r9,r9,r9
  678. ldrhsb r9,[r12,#-12]
  679. add r7,r7,r11
  680. # ifdef __thumb2__
  681. itete lo
  682. # endif
  683. eorlo r10,r10,r10
  684. ldrhsb r10,[r12,#-8]
  685. eorlo r11,r11,r11
  686. ldrhsb r11,[r12,#-4]
  687. eor r4,r8,r4 @ xor with input (or zero)
  688. eor r5,r9,r5
  689. # ifdef __thumb2__
  690. itt hs
  691. # endif
  692. ldrhsb r8,[r12,#-15] @ load more input
  693. ldrhsb r9,[r12,#-11]
  694. eor r6,r10,r6
  695. strb r4,[r14],#16 @ store output
  696. eor r7,r11,r7
  697. # ifdef __thumb2__
  698. itt hs
  699. # endif
  700. ldrhsb r10,[r12,#-7]
  701. ldrhsb r11,[r12,#-3]
  702. strb r5,[r14,#-12]
  703. eor r4,r8,r4,lsr#8
  704. strb r6,[r14,#-8]
  705. eor r5,r9,r5,lsr#8
  706. # ifdef __thumb2__
  707. itt hs
  708. # endif
  709. ldrhsb r8,[r12,#-14] @ load more input
  710. ldrhsb r9,[r12,#-10]
  711. strb r7,[r14,#-4]
  712. eor r6,r10,r6,lsr#8
  713. strb r4,[r14,#-15]
  714. eor r7,r11,r7,lsr#8
  715. # ifdef __thumb2__
  716. itt hs
  717. # endif
  718. ldrhsb r10,[r12,#-6]
  719. ldrhsb r11,[r12,#-2]
  720. strb r5,[r14,#-11]
  721. eor r4,r8,r4,lsr#8
  722. strb r6,[r14,#-7]
  723. eor r5,r9,r5,lsr#8
  724. # ifdef __thumb2__
  725. itt hs
  726. # endif
  727. ldrhsb r8,[r12,#-13] @ load more input
  728. ldrhsb r9,[r12,#-9]
  729. strb r7,[r14,#-3]
  730. eor r6,r10,r6,lsr#8
  731. strb r4,[r14,#-14]
  732. eor r7,r11,r7,lsr#8
  733. # ifdef __thumb2__
  734. itt hs
  735. # endif
  736. ldrhsb r10,[r12,#-5]
  737. ldrhsb r11,[r12,#-1]
  738. strb r5,[r14,#-10]
  739. strb r6,[r14,#-6]
  740. eor r4,r8,r4,lsr#8
  741. strb r7,[r14,#-2]
  742. eor r5,r9,r5,lsr#8
  743. strb r4,[r14,#-13]
  744. eor r6,r10,r6,lsr#8
  745. strb r5,[r14,#-9]
  746. eor r7,r11,r7,lsr#8
  747. strb r6,[r14,#-5]
  748. strb r7,[r14,#-1]
  749. # ifdef __thumb2__
  750. it ne
  751. # endif
  752. ldrne r8,[sp,#4*(32+2)] @ re-load len
  753. # ifdef __thumb2__
  754. it hs
  755. # endif
  756. subhs r11,r8,#64 @ len-=64
  757. bhi Loop_outer
  758. beq Ldone
  759. #endif
  760. Ltail:
  761. ldr r12,[sp,#4*(32+1)] @ load inp
  762. add r9,sp,#4*(0)
  763. ldr r14,[sp,#4*(32+0)] @ load out
  764. Loop_tail:
  765. ldrb r10,[r9],#1 @ read buffer on stack
  766. ldrb r11,[r12],#1 @ read input
  767. subs r8,r8,#1
  768. eor r11,r11,r10
  769. strb r11,[r14],#1 @ store output
  770. bne Loop_tail
  771. Ldone:
  772. add sp,sp,#4*(32+3)
  773. Lno_data:
  774. ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
  775. #if __ARM_MAX_ARCH__>=7
  776. #ifdef __thumb2__
  777. .thumb_func ChaCha20_neon
  778. #endif
  779. .align 5
  780. ChaCha20_neon:
  781. ldr r12,[sp,#0] @ pull pointer to counter and nonce
  782. stmdb sp!,{r0,r1,r2,r4-r11,lr}
  783. LChaCha20_neon:
  784. adr r14,Lsigma
  785. vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so
  786. stmdb sp!,{r0,r1,r2,r3}
  787. vld1.32 {q1,q2},[r3] @ load key
  788. ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
  789. sub sp,sp,#4*(16+16)
  790. vld1.32 {q3},[r12] @ load counter and nonce
  791. add r12,sp,#4*8
  792. ldmia r14,{r0,r1,r2,r3} @ load sigma
  793. vld1.32 {q0},[r14]! @ load sigma
  794. vld1.32 {q12},[r14] @ one
  795. vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce
  796. vst1.32 {q0,q1},[sp] @ copy sigma|1/2key
  797. str r10,[sp,#4*(16+10)] @ off-load "rx"
  798. str r11,[sp,#4*(16+11)] @ off-load "rx"
  799. vshl.i32 d26,d24,#1 @ two
  800. vstr d24,[sp,#4*(16+0)]
  801. vshl.i32 d28,d24,#2 @ four
  802. vstr d26,[sp,#4*(16+2)]
  803. vmov q4,q0
  804. vstr d28,[sp,#4*(16+4)]
  805. vmov q8,q0
  806. vmov q5,q1
  807. vmov q9,q1
  808. b Loop_neon_enter
  809. .align 4
  810. Loop_neon_outer:
  811. ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
  812. cmp r11,#64*2 @ if len<=64*2
  813. bls Lbreak_neon @ switch to integer-only
  814. vmov q4,q0
  815. str r11,[sp,#4*(32+2)] @ save len
  816. vmov q8,q0
  817. str r12, [sp,#4*(32+1)] @ save inp
  818. vmov q5,q1
  819. str r14, [sp,#4*(32+0)] @ save out
  820. vmov q9,q1
  821. Loop_neon_enter:
  822. ldr r11, [sp,#4*(15)]
  823. vadd.i32 q7,q3,q12 @ counter+1
  824. ldr r12,[sp,#4*(12)] @ modulo-scheduled load
  825. vmov q6,q2
  826. ldr r10, [sp,#4*(13)]
  827. vmov q10,q2
  828. ldr r14,[sp,#4*(14)]
  829. vadd.i32 q11,q7,q12 @ counter+2
  830. str r11, [sp,#4*(16+15)]
  831. mov r11,#10
  832. add r12,r12,#3 @ counter+3
  833. b Loop_neon
  834. .align 4
  835. Loop_neon:
  836. subs r11,r11,#1
  837. vadd.i32 q0,q0,q1
  838. add r0,r0,r4
  839. vadd.i32 q4,q4,q5
  840. mov r12,r12,ror#16
  841. vadd.i32 q8,q8,q9
  842. add r1,r1,r5
  843. veor q3,q3,q0
  844. mov r10,r10,ror#16
  845. veor q7,q7,q4
  846. eor r12,r12,r0,ror#16
  847. veor q11,q11,q8
  848. eor r10,r10,r1,ror#16
  849. vrev32.16 q3,q3
  850. add r8,r8,r12
  851. vrev32.16 q7,q7
  852. mov r4,r4,ror#20
  853. vrev32.16 q11,q11
  854. add r9,r9,r10
  855. vadd.i32 q2,q2,q3
  856. mov r5,r5,ror#20
  857. vadd.i32 q6,q6,q7
  858. eor r4,r4,r8,ror#20
  859. vadd.i32 q10,q10,q11
  860. eor r5,r5,r9,ror#20
  861. veor q12,q1,q2
  862. add r0,r0,r4
  863. veor q13,q5,q6
  864. mov r12,r12,ror#24
  865. veor q14,q9,q10
  866. add r1,r1,r5
  867. vshr.u32 q1,q12,#20
  868. mov r10,r10,ror#24
  869. vshr.u32 q5,q13,#20
  870. eor r12,r12,r0,ror#24
  871. vshr.u32 q9,q14,#20
  872. eor r10,r10,r1,ror#24
  873. vsli.32 q1,q12,#12
  874. add r8,r8,r12
  875. vsli.32 q5,q13,#12
  876. mov r4,r4,ror#25
  877. vsli.32 q9,q14,#12
  878. add r9,r9,r10
  879. vadd.i32 q0,q0,q1
  880. mov r5,r5,ror#25
  881. vadd.i32 q4,q4,q5
  882. str r10,[sp,#4*(16+13)]
  883. vadd.i32 q8,q8,q9
  884. ldr r10,[sp,#4*(16+15)]
  885. veor q12,q3,q0
  886. eor r4,r4,r8,ror#25
  887. veor q13,q7,q4
  888. eor r5,r5,r9,ror#25
  889. veor q14,q11,q8
  890. str r8,[sp,#4*(16+8)]
  891. vshr.u32 q3,q12,#24
  892. ldr r8,[sp,#4*(16+10)]
  893. vshr.u32 q7,q13,#24
  894. add r2,r2,r6
  895. vshr.u32 q11,q14,#24
  896. mov r14,r14,ror#16
  897. vsli.32 q3,q12,#8
  898. str r9,[sp,#4*(16+9)]
  899. vsli.32 q7,q13,#8
  900. ldr r9,[sp,#4*(16+11)]
  901. vsli.32 q11,q14,#8
  902. add r3,r3,r7
  903. vadd.i32 q2,q2,q3
  904. mov r10,r10,ror#16
  905. vadd.i32 q6,q6,q7
  906. eor r14,r14,r2,ror#16
  907. vadd.i32 q10,q10,q11
  908. eor r10,r10,r3,ror#16
  909. veor q12,q1,q2
  910. add r8,r8,r14
  911. veor q13,q5,q6
  912. mov r6,r6,ror#20
  913. veor q14,q9,q10
  914. add r9,r9,r10
  915. vshr.u32 q1,q12,#25
  916. mov r7,r7,ror#20
  917. vshr.u32 q5,q13,#25
  918. eor r6,r6,r8,ror#20
  919. vshr.u32 q9,q14,#25
  920. eor r7,r7,r9,ror#20
  921. vsli.32 q1,q12,#7
  922. add r2,r2,r6
  923. vsli.32 q5,q13,#7
  924. mov r14,r14,ror#24
  925. vsli.32 q9,q14,#7
  926. add r3,r3,r7
  927. vext.8 q2,q2,q2,#8
  928. mov r10,r10,ror#24
  929. vext.8 q6,q6,q6,#8
  930. eor r14,r14,r2,ror#24
  931. vext.8 q10,q10,q10,#8
  932. eor r10,r10,r3,ror#24
  933. vext.8 q1,q1,q1,#4
  934. add r8,r8,r14
  935. vext.8 q5,q5,q5,#4
  936. mov r6,r6,ror#25
  937. vext.8 q9,q9,q9,#4
  938. add r9,r9,r10
  939. vext.8 q3,q3,q3,#12
  940. mov r7,r7,ror#25
  941. vext.8 q7,q7,q7,#12
  942. eor r6,r6,r8,ror#25
  943. vext.8 q11,q11,q11,#12
  944. eor r7,r7,r9,ror#25
  945. vadd.i32 q0,q0,q1
  946. add r0,r0,r5
  947. vadd.i32 q4,q4,q5
  @ ------------------------------------------------------------------
  @ Tail of Loop_neon (the loop head and round-counter decrement are
  @ above this chunk).  Two instruction streams are interleaved for
  @ dual-issue: scalar quarter-rounds for a fourth ChaCha block kept in
  @ r0-r14 plus stack slots, and the NEON "diagonal" round for three
  @ blocks held in q0-q3, q4-q7 and q8-q11 (q12-q14 are scratch).
  @
  @ NEON 32-bit rotates are synthesized from instruction pairs:
  @   vrev32.16 x           -> rotate-left 16 (halfword swap)
  @   vshr #20 + vsli #12   -> rotate-left 12
  @   vshr #24 + vsli #8    -> rotate-left 8
  @   vshr #25 + vsli #7    -> rotate-left 7
  @
  @ Scalar rotations are deferred: a value is left un-rotated and the
  @ pending rotate amount is folded into the ror-shifted operand of the
  @ instruction that next consumes it.
  @ ------------------------------------------------------------------
  948. mov r10,r10,ror#16
  949. vadd.i32 q8,q8,q9
  950. add r1,r1,r6
  951. veor q3,q3,q0
  952. mov r12,r12,ror#16
  953. veor q7,q7,q4
  954. eor r10,r10,r0,ror#16
  955. veor q11,q11,q8
  956. eor r12,r12,r1,ror#16
  957. vrev32.16 q3,q3 @ d ^= a; d = rotl(d,16) for each NEON block
  958. add r8,r8,r10
  959. vrev32.16 q7,q7
  960. mov r5,r5,ror#20
  961. vrev32.16 q11,q11
  962. add r9,r9,r12
  963. vadd.i32 q2,q2,q3 @ c += d
  964. mov r6,r6,ror#20
  965. vadd.i32 q6,q6,q7
  966. eor r5,r5,r8,ror#20
  967. vadd.i32 q10,q10,q11
  968. eor r6,r6,r9,ror#20
  969. veor q12,q1,q2 @ b ^= c, rotl 12 via shift pair below
  970. add r0,r0,r5
  971. veor q13,q5,q6
  972. mov r10,r10,ror#24
  973. veor q14,q9,q10
  974. add r1,r1,r6
  975. vshr.u32 q1,q12,#20
  976. mov r12,r12,ror#24
  977. vshr.u32 q5,q13,#20
  978. eor r10,r10,r0,ror#24
  979. vshr.u32 q9,q14,#20
  980. eor r12,r12,r1,ror#24
  981. vsli.32 q1,q12,#12
  982. add r8,r8,r10
  983. vsli.32 q5,q13,#12
  984. mov r5,r5,ror#25
  985. vsli.32 q9,q14,#12
  986. str r10,[sp,#4*(16+15)] @ spill scalar state word; scalar block is
  987. vadd.i32 q0,q0,q1      @ wider than the register file, so words
  988. ldr r10,[sp,#4*(16+13)] @ rotate through sp+4*16.. slots
  989. vadd.i32 q4,q4,q5
  990. add r9,r9,r12
  991. vadd.i32 q8,q8,q9
  992. mov r6,r6,ror#25
  993. veor q12,q3,q0 @ d ^= a, rotl 8 via shift pair below
  994. eor r5,r5,r8,ror#25
  995. veor q13,q7,q4
  996. eor r6,r6,r9,ror#25
  997. veor q14,q11,q8
  998. str r8,[sp,#4*(16+10)]
  999. vshr.u32 q3,q12,#24
  1000. ldr r8,[sp,#4*(16+8)]
  1001. vshr.u32 q7,q13,#24
  1002. add r2,r2,r7
  1003. vshr.u32 q11,q14,#24
  1004. mov r10,r10,ror#16
  1005. vsli.32 q3,q12,#8
  1006. str r9,[sp,#4*(16+11)]
  1007. vsli.32 q7,q13,#8
  1008. ldr r9,[sp,#4*(16+9)]
  1009. vsli.32 q11,q14,#8
  1010. add r3,r3,r4
  1011. vadd.i32 q2,q2,q3 @ c += d
  1012. mov r14,r14,ror#16
  1013. vadd.i32 q6,q6,q7
  1014. eor r10,r10,r2,ror#16
  1015. vadd.i32 q10,q10,q11
  1016. eor r14,r14,r3,ror#16
  1017. veor q12,q1,q2 @ b ^= c, rotl 7 via shift pair below
  1018. add r8,r8,r10
  1019. veor q13,q5,q6
  1020. mov r7,r7,ror#20
  1021. veor q14,q9,q10
  1022. add r9,r9,r14
  1023. vshr.u32 q1,q12,#25
  1024. mov r4,r4,ror#20
  1025. vshr.u32 q5,q13,#25
  1026. eor r7,r7,r8,ror#20
  1027. vshr.u32 q9,q14,#25
  1028. eor r4,r4,r9,ror#20
  1029. vsli.32 q1,q12,#7
  1030. add r2,r2,r7
  1031. vsli.32 q5,q13,#7
  1032. mov r10,r10,ror#24
  1033. vsli.32 q9,q14,#7
  1034. add r3,r3,r4
  @ vext lane rotations below move the diagonals back into column
  @ positions so the next iteration's column round operates in place.
  1035. vext.8 q2,q2,q2,#8
  1036. mov r14,r14,ror#24
  1037. vext.8 q6,q6,q6,#8
  1038. eor r10,r10,r2,ror#24
  1039. vext.8 q10,q10,q10,#8
  1040. eor r14,r14,r3,ror#24
  1041. vext.8 q1,q1,q1,#12
  1042. add r8,r8,r10
  1043. vext.8 q5,q5,q5,#12
  1044. mov r7,r7,ror#25
  1045. vext.8 q9,q9,q9,#12
  1046. add r9,r9,r14
  1047. vext.8 q3,q3,q3,#4
  1048. mov r4,r4,ror#25
  1049. vext.8 q7,q7,q7,#4
  1050. eor r7,r7,r8,ror#25
  1051. vext.8 q11,q11,q11,#4
  1052. eor r4,r4,r9,ror#25
  1053. bne Loop_neon @ flags set by round-counter subs above this chunk — TODO confirm
  @ ------------------------------------------------------------------
  @ Rounds complete.  Re-load the saved key/input material from the
  @ stack, add it back into the three NEON states (feed-forward), bump
  @ the per-block counters, and — when at least 64*4 bytes remain —
  @ XOR all four 64-byte blocks (3 NEON + 1 scalar) with the input and
  @ store them, then decide whether to run another outer iteration.
  @ ------------------------------------------------------------------
  1054. add r11,sp,#32
  1055. vld1.32 {q12,q13},[sp] @ load key material
  1056. vld1.32 {q14,q15},[r11]
  1057. ldr r11,[sp,#4*(32+2)] @ load len
  1058. str r8, [sp,#4*(16+8)] @ modulo-scheduled store
  1059. str r9, [sp,#4*(16+9)]
  1060. str r12,[sp,#4*(16+12)]
  1061. str r10, [sp,#4*(16+13)]
  1062. str r14,[sp,#4*(16+14)]
  1063. @ at this point we have first half of 512-bit result in
  1064. @ rx and second half at sp+4*(16+8)
  1065. ldr r12,[sp,#4*(32+1)] @ load inp
  1066. ldr r14,[sp,#4*(32+0)] @ load out
  1067. vadd.i32 q0,q0,q12 @ accumulate key material
  1068. vadd.i32 q4,q4,q12
  1069. vadd.i32 q8,q8,q12
  1070. vldr d24,[sp,#4*(16+0)] @ one
  1071. vadd.i32 q1,q1,q13
  1072. vadd.i32 q5,q5,q13
  1073. vadd.i32 q9,q9,q13
  1074. vldr d26,[sp,#4*(16+2)] @ two
  1075. vadd.i32 q2,q2,q14
  1076. vadd.i32 q6,q6,q14
  1077. vadd.i32 q10,q10,q14
  1078. vadd.i32 d14,d14,d24 @ counter+1
  1079. vadd.i32 d22,d22,d26 @ counter+2
  1080. vadd.i32 q3,q3,q15
  1081. vadd.i32 q7,q7,q15
  1082. vadd.i32 q11,q11,q15
  1083. cmp r11,#64*4 @ full 256-byte batch left?
  1084. blo Ltail_neon @ no: partial-output path
  @ Fast path: stream 3x64 bytes of NEON keystream against the input,
  @ overlapping the loads/XORs/stores with reloading q0-q3 for the
  @ next outer iteration.
  1085. vld1.8 {q12,q13},[r12]! @ load input
  1086. mov r11,sp
  1087. vld1.8 {q14,q15},[r12]!
  1088. veor q0,q0,q12 @ xor with input
  1089. veor q1,q1,q13
  1090. vld1.8 {q12,q13},[r12]!
  1091. veor q2,q2,q14
  1092. veor q3,q3,q15
  1093. vld1.8 {q14,q15},[r12]!
  1094. veor q4,q4,q12
  1095. vst1.8 {q0,q1},[r14]! @ store output
  1096. veor q5,q5,q13
  1097. vld1.8 {q12,q13},[r12]!
  1098. veor q6,q6,q14
  1099. vst1.8 {q2,q3},[r14]!
  1100. veor q7,q7,q15
  1101. vld1.8 {q14,q15},[r12]!
  1102. veor q8,q8,q12
  1103. vld1.32 {q0,q1},[r11]! @ load for next iteration
  1104. veor d25,d25,d25
  1105. vldr d24,[sp,#4*(16+4)] @ four
  1106. veor q9,q9,q13
  1107. vld1.32 {q2,q3},[r11]
  1108. veor q10,q10,q14
  1109. vst1.8 {q4,q5},[r14]!
  1110. veor q11,q11,q15
  1111. vst1.8 {q6,q7},[r14]!
  1112. vadd.i32 d6,d6,d24 @ next counter value
  1113. vldr d24,[sp,#4*(16+0)] @ one
  @ Scalar block: add key material back into r0-r7, then the second
  @ half from the stack, XORing with input words and storing as we go.
  1114. ldmia sp,{r8,r9,r10,r11} @ load key material
  1115. add r0,r0,r8 @ accumulate key material
  1116. ldr r8,[r12],#16 @ load input
  1117. vst1.8 {q8,q9},[r14]!
  1118. add r1,r1,r9
  1119. ldr r9,[r12,#-12]
  1120. vst1.8 {q10,q11},[r14]!
  1121. add r2,r2,r10
  1122. ldr r10,[r12,#-8]
  1123. add r3,r3,r11
  1124. ldr r11,[r12,#-4]
  1125. # ifdef __ARMEB__
  @ Big-endian: byte-swap state words so XOR matches little-endian
  @ ChaCha keystream layout.
  1126. rev r0,r0
  1127. rev r1,r1
  1128. rev r2,r2
  1129. rev r3,r3
  1130. # endif
  1131. eor r0,r0,r8 @ xor with input
  1132. add r8,sp,#4*(4)
  1133. eor r1,r1,r9
  1134. str r0,[r14],#16 @ store output
  1135. eor r2,r2,r10
  1136. str r1,[r14,#-12]
  1137. eor r3,r3,r11
  1138. ldmia r8,{r8,r9,r10,r11} @ load key material
  1139. str r2,[r14,#-8]
  1140. str r3,[r14,#-4]
  1141. add r4,r4,r8 @ accumulate key material
  1142. ldr r8,[r12],#16 @ load input
  1143. add r5,r5,r9
  1144. ldr r9,[r12,#-12]
  1145. add r6,r6,r10
  1146. ldr r10,[r12,#-8]
  1147. add r7,r7,r11
  1148. ldr r11,[r12,#-4]
  1149. # ifdef __ARMEB__
  1150. rev r4,r4
  1151. rev r5,r5
  1152. rev r6,r6
  1153. rev r7,r7
  1154. # endif
  1155. eor r4,r4,r8
  1156. add r8,sp,#4*(8)
  1157. eor r5,r5,r9
  1158. str r4,[r14],#16 @ store output
  1159. eor r6,r6,r10
  1160. str r5,[r14,#-12]
  1161. eor r7,r7,r11
  1162. ldmia r8,{r8,r9,r10,r11} @ load key material
  1163. str r6,[r14,#-8]
  1164. add r0,sp,#4*(16+8)
  1165. str r7,[r14,#-4]
  1166. ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
  1167. add r0,r0,r8 @ accumulate key material
  1168. ldr r8,[r12],#16 @ load input
  1169. add r1,r1,r9
  1170. ldr r9,[r12,#-12]
  1171. # ifdef __thumb2__
  1172. it hi
  1173. # endif
  @ "hi" still reflects cmp r11,#64*4 above: more data after this
  @ batch, so preserve these words for the next outer iteration.
  1174. strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
  1175. add r2,r2,r10
  1176. ldr r10,[r12,#-8]
  1177. # ifdef __thumb2__
  1178. it hi
  1179. # endif
  1180. strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
  1181. add r3,r3,r11
  1182. ldr r11,[r12,#-4]
  1183. # ifdef __ARMEB__
  1184. rev r0,r0
  1185. rev r1,r1
  1186. rev r2,r2
  1187. rev r3,r3
  1188. # endif
  1189. eor r0,r0,r8
  1190. add r8,sp,#4*(12)
  1191. eor r1,r1,r9
  1192. str r0,[r14],#16 @ store output
  1193. eor r2,r2,r10
  1194. str r1,[r14,#-12]
  1195. eor r3,r3,r11
  1196. ldmia r8,{r8,r9,r10,r11} @ load key material
  1197. str r2,[r14,#-8]
  1198. str r3,[r14,#-4]
  1199. add r4,r4,r8 @ accumulate key material
  1200. add r8,r8,#4 @ next counter value
  1201. add r5,r5,r9
  1202. str r8,[sp,#4*(12)] @ save next counter value
  1203. ldr r8,[r12],#16 @ load input
  1204. add r6,r6,r10
  1205. add r4,r4,#3 @ counter+3
  1206. ldr r9,[r12,#-12]
  1207. add r7,r7,r11
  1208. ldr r10,[r12,#-8]
  1209. ldr r11,[r12,#-4]
  1210. # ifdef __ARMEB__
  1211. rev r4,r4
  1212. rev r5,r5
  1213. rev r6,r6
  1214. rev r7,r7
  1215. # endif
  1216. eor r4,r4,r8
  1217. # ifdef __thumb2__
  1218. it hi
  1219. # endif
  1220. ldrhi r8,[sp,#4*(32+2)] @ re-load len
  1221. eor r5,r5,r9
  1222. eor r6,r6,r10
  1223. str r4,[r14],#16 @ store output
  1224. eor r7,r7,r11
  1225. str r5,[r14,#-12]
  1226. sub r11,r8,#64*4 @ len-=64*4
  1227. str r6,[r14,#-8]
  1228. str r7,[r14,#-4]
  1229. bhi Loop_neon_outer @ len was > 64*4: another 256-byte batch
  1230. b Ldone_neon
  1231. .align 4
  @ Entered from code above this chunk when the remaining work no
  @ longer justifies the NEON path.  Migrates state/arguments from the
  @ NEON stack frame into the layout the integer-only Loop expects,
  @ and restores the callee-saved d8-d15 before integer code runs.
  1232. Lbreak_neon:
  1233. @ harmonize NEON and integer-only stack frames: load data
  1234. @ from NEON frame, but save to integer-only one; distance
  1235. @ between the two is 4*(32+4+16-32)=4*(20).
  1236. str r11, [sp,#4*(20+32+2)] @ save len
  1237. add r11,sp,#4*(32+4)
  1238. str r12, [sp,#4*(20+32+1)] @ save inp
  1239. str r14, [sp,#4*(20+32+0)] @ save out
  1240. ldr r12,[sp,#4*(16+10)]
  1241. ldr r14,[sp,#4*(16+11)]
  1242. vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement
  1243. str r12,[sp,#4*(20+16+10)] @ copy "rx"
  1244. str r14,[sp,#4*(20+16+11)] @ copy "rx"
  1245. ldr r11, [sp,#4*(15)]
  1246. ldr r12,[sp,#4*(12)] @ modulo-scheduled load
  1247. ldr r10, [sp,#4*(13)]
  1248. ldr r14,[sp,#4*(14)]
  1249. str r11, [sp,#4*(20+16+15)]
  1250. add r11,sp,#4*(20)
  1251. vst1.32 {q0,q1},[r11]! @ copy key
  1252. add sp,sp,#4*(20) @ switch frame
  1253. vst1.32 {q2,q3},[r11]
  1254. mov r11,#10 @ round-pair counter for integer Loop
  1255. b Loop @ go integer-only
  1256. .align 4
  @ Fewer than 64*4 bytes remain (r11 = len).  Dispatch on how many
  @ whole 64-byte blocks of the already-computed keystream are needed;
  @ leftovers are XORed byte-by-byte in Loop_tail_neon from a stack
  @ copy of the keystream.
  1257. Ltail_neon:
  1258. cmp r11,#64*3
  1259. bhs L192_or_more_neon
  1260. cmp r11,#64*2
  1261. bhs L128_or_more_neon
  1262. cmp r11,#64*1
  1263. bhs L64_or_more_neon
  @ < 64 bytes: spill first block's keystream (q0-q3) to the stack
  @ buffer at sp+0 and go to the byte loop.
  1264. add r8,sp,#4*(8)
  1265. vst1.8 {q0,q1},[sp]
  1266. add r10,sp,#4*(0) @ r10 = byte-loop read pointer
  1267. vst1.8 {q2,q3},[r8]
  1268. b Loop_tail_neon
  1269. .align 4
  @ 64 <= len < 128: emit one full NEON block (q0-q3), then either
  @ finish (len == 64) or spill the second block (q4-q7) for the
  @ byte-wise tail.
  1270. L64_or_more_neon:
  1271. vld1.8 {q12,q13},[r12]!
  1272. vld1.8 {q14,q15},[r12]!
  1273. veor q0,q0,q12
  1274. veor q1,q1,q13
  1275. veor q2,q2,q14
  1276. veor q3,q3,q15
  1277. vst1.8 {q0,q1},[r14]!
  1278. vst1.8 {q2,q3},[r14]!
  1279. beq Ldone_neon @ len was exactly 64
  1280. add r8,sp,#4*(8)
  1281. vst1.8 {q4,q5},[sp] @ stage next block's keystream on stack
  1282. add r10,sp,#4*(0)
  1283. vst1.8 {q6,q7},[r8]
  1284. sub r11,r11,#64*1 @ len-=64*1
  1285. b Loop_tail_neon
  1286. .align 4
  @ 128 <= len < 192: emit two full NEON blocks (q0-q3, q4-q7), then
  @ either finish (len == 128) or spill the third block (q8-q11) for
  @ the byte-wise tail.
  1287. L128_or_more_neon:
  1288. vld1.8 {q12,q13},[r12]!
  1289. vld1.8 {q14,q15},[r12]!
  1290. veor q0,q0,q12
  1291. veor q1,q1,q13
  1292. vld1.8 {q12,q13},[r12]!
  1293. veor q2,q2,q14
  1294. veor q3,q3,q15
  1295. vld1.8 {q14,q15},[r12]!
  1296. veor q4,q4,q12
  1297. veor q5,q5,q13
  1298. vst1.8 {q0,q1},[r14]!
  1299. veor q6,q6,q14
  1300. vst1.8 {q2,q3},[r14]!
  1301. veor q7,q7,q15
  1302. vst1.8 {q4,q5},[r14]!
  1303. vst1.8 {q6,q7},[r14]!
  1304. beq Ldone_neon @ len was exactly 128
  1305. add r8,sp,#4*(8)
  1306. vst1.8 {q8,q9},[sp] @ stage next block's keystream on stack
  1307. add r10,sp,#4*(0)
  1308. vst1.8 {q10,q11},[r8]
  1309. sub r11,r11,#64*2 @ len-=64*2
  1310. b Loop_tail_neon
  1311. .align 4
  @ 192 <= len < 256: emit all three NEON blocks, then either finish
  @ (len == 192) or build the fourth (scalar) block's keystream on the
  @ stack for the byte-wise tail: add key material into both halves of
  @ the scalar state and store them to sp+0..63.
  1312. L192_or_more_neon:
  1313. vld1.8 {q12,q13},[r12]!
  1314. vld1.8 {q14,q15},[r12]!
  1315. veor q0,q0,q12
  1316. veor q1,q1,q13
  1317. vld1.8 {q12,q13},[r12]!
  1318. veor q2,q2,q14
  1319. veor q3,q3,q15
  1320. vld1.8 {q14,q15},[r12]!
  1321. veor q4,q4,q12
  1322. veor q5,q5,q13
  1323. vld1.8 {q12,q13},[r12]!
  1324. veor q6,q6,q14
  1325. vst1.8 {q0,q1},[r14]!
  1326. veor q7,q7,q15
  1327. vld1.8 {q14,q15},[r12]!
  1328. veor q8,q8,q12
  1329. vst1.8 {q2,q3},[r14]!
  1330. veor q9,q9,q13
  1331. vst1.8 {q4,q5},[r14]!
  1332. veor q10,q10,q14
  1333. vst1.8 {q6,q7},[r14]!
  1334. veor q11,q11,q15
  1335. vst1.8 {q8,q9},[r14]!
  1336. vst1.8 {q10,q11},[r14]!
  1337. beq Ldone_neon @ len was exactly 192
  1338. ldmia sp,{r8,r9,r10,r11} @ load key material
  1339. add r0,r0,r8 @ accumulate key material
  1340. add r8,sp,#4*(4)
  1341. add r1,r1,r9
  1342. add r2,r2,r10
  1343. add r3,r3,r11
  1344. ldmia r8,{r8,r9,r10,r11} @ load key material
  1345. add r4,r4,r8 @ accumulate key material
  1346. add r8,sp,#4*(8)
  1347. add r5,r5,r9
  1348. add r6,r6,r10
  1349. add r7,r7,r11
  1350. ldmia r8,{r8,r9,r10,r11} @ load key material
  1351. # ifdef __ARMEB__
  @ Big-endian: byte-swap so the staged keystream bytes are in
  @ little-endian ChaCha order before the byte-wise XOR.
  1352. rev r0,r0
  1353. rev r1,r1
  1354. rev r2,r2
  1355. rev r3,r3
  1356. rev r4,r4
  1357. rev r5,r5
  1358. rev r6,r6
  1359. rev r7,r7
  1360. # endif
  1361. stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} @ first half of keystream -> sp+0
  1362. add r0,sp,#4*(16+8)
  1363. ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
  1364. add r0,r0,r8 @ accumulate key material
  1365. add r8,sp,#4*(12)
  1366. add r1,r1,r9
  1367. add r2,r2,r10
  1368. add r3,r3,r11
  1369. ldmia r8,{r8,r9,r10,r11} @ load key material
  1370. add r4,r4,r8 @ accumulate key material
  1371. add r8,sp,#4*(8)
  1372. add r5,r5,r9
  1373. add r4,r4,#3 @ counter+3
  1374. add r6,r6,r10
  1375. add r7,r7,r11
  1376. ldr r11,[sp,#4*(32+2)] @ re-load len
  1377. # ifdef __ARMEB__
  1378. rev r0,r0
  1379. rev r1,r1
  1380. rev r2,r2
  1381. rev r3,r3
  1382. rev r4,r4
  1383. rev r5,r5
  1384. rev r6,r6
  1385. rev r7,r7
  1386. # endif
  1387. stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} @ second half -> sp+32
  1388. add r10,sp,#4*(0) @ r10 = byte-loop read pointer
  1389. sub r11,r11,#64*3 @ len-=64*3
  @ Byte-wise tail: XOR the remaining r11 (< 64) input bytes with the
  @ keystream staged on the stack at r10.
  1390. Loop_tail_neon:
  1391. ldrb r8,[r10],#1 @ read buffer on stack
  1392. ldrb r9,[r12],#1 @ read input
  1393. subs r11,r11,#1
  1394. eor r8,r8,r9
  1395. strb r8,[r14],#1 @ store output
  1396. bne Loop_tail_neon
  @ Epilogue: unwind the NEON frame, restore callee-saved d8-d15
  @ (AAPCS requirement), then pop r4-r11 and return via pc.
  1397. Ldone_neon:
  1398. add sp,sp,#4*(32+4)
  1399. vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15}
  1400. add sp,sp,#4*(16+3)
  1401. ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
  1402. .comm _OPENSSL_armcap_P,4
  1403. .non_lazy_symbol_pointer
  1404. OPENSSL_armcap_P:
  1405. .indirect_symbol _OPENSSL_armcap_P
  1406. .long 0
  1407. #endif
  1408. #endif // !OPENSSL_NO_ASM