cleanup.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. # Copyright 2021 gRPC authors.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
"""Clean up resources created by the tests.

This is intended as a tool to delete leaked resources from old tests.

Typical usage examples:

python3 tools/run_tests/xds_k8s_test_driver/bin/cleanup/cleanup.py\
    --project=grpc-testing\
    --network=default-vpc\
    --kube_context=gke_grpc-testing_us-central1-a_interop-test-psm-sec-v2-us-central1-a\
    --resource_prefix='required-but-does-not-matter'\
    --td_bootstrap_image='required-but-does-not-matter' --server_image='required-but-does-not-matter' --client_image='required-but-does-not-matter'
"""
  24. import datetime
  25. import functools
  26. import json
  27. import logging
  28. import os
  29. import re
  30. import subprocess
  31. from typing import Any, List
  32. from absl import app
  33. from absl import flags
  34. import dateutil
  35. from framework import xds_flags
  36. from framework import xds_k8s_flags
  37. from framework.infrastructure import gcp
  38. from framework.infrastructure import k8s
  39. from framework.infrastructure import traffic_director
  40. from framework.test_app import client_app
  41. from framework.test_app import server_app
  42. logger = logging.getLogger(__name__)
  43. Json = Any
  44. KubernetesClientRunner = client_app.KubernetesClientRunner
  45. KubernetesServerRunner = server_app.KubernetesServerRunner
  46. GCLOUD = os.environ.get('GCLOUD', 'gcloud')
  47. GCLOUD_CMD_TIMEOUT_S = datetime.timedelta(seconds=5).total_seconds()
  48. ZONE = 'us-central1-a'
  49. SECONDARY_ZONE = 'us-west1-b'
  50. PSM_SECURITY_PREFIX = 'xds-k8s-security' # Prefix for gke resources to delete.
  51. URL_MAP_TEST_PREFIX = 'interop-psm-url-map' # Prefix for url-map test resources to delete.
  52. KEEP_PERIOD_HOURS = flags.DEFINE_integer(
  53. "keep_hours",
  54. default=168,
  55. help=
  56. "number of hours for a resource to keep. Resources older than this will be deleted. Default is 168 (7 days)"
  57. )
  58. DRY_RUN = flags.DEFINE_bool(
  59. "dry_run",
  60. default=False,
  61. help="dry run, print resources but do not perform deletion")
  62. TD_RESOURCE_PREFIXES = flags.DEFINE_list(
  63. "td_resource_prefixes",
  64. default=[PSM_SECURITY_PREFIX],
  65. help=
  66. "a comma-separated list of prefixes for which the leaked TD resources will be deleted",
  67. )
  68. SERVER_PREFIXES = flags.DEFINE_list(
  69. "server_prefixes",
  70. default=[PSM_SECURITY_PREFIX],
  71. help=
  72. "a comma-separated list of prefixes for which the leaked servers will be deleted",
  73. )
  74. CLIENT_PREFIXES = flags.DEFINE_list(
  75. "client_prefixes",
  76. default=[PSM_SECURITY_PREFIX, URL_MAP_TEST_PREFIX],
  77. help=
  78. "a comma-separated list of prefixes for which the leaked clients will be deleted",
  79. )
  80. def load_keep_config() -> None:
  81. global KEEP_CONFIG
  82. json_path = os.path.realpath(
  83. os.path.join(os.path.dirname(os.path.abspath(__file__)),
  84. 'keep_xds_interop_resources.json'))
  85. with open(json_path, 'r') as f:
  86. KEEP_CONFIG = json.load(f)
  87. logging.debug('Resource keep config loaded: %s',
  88. json.dumps(KEEP_CONFIG, indent=2))
  89. def is_marked_as_keep_gce(suffix: str) -> bool:
  90. return suffix in KEEP_CONFIG["gce_framework"]["suffix"]
  91. def is_marked_as_keep_gke(suffix: str) -> bool:
  92. return suffix in KEEP_CONFIG["gke_framework"]["suffix"]
  93. @functools.lru_cache()
  94. def get_expire_timestamp() -> datetime.datetime:
  95. return datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
  96. hours=KEEP_PERIOD_HOURS.value)
  97. def exec_gcloud(project: str, *cmds: List[str]) -> Json:
  98. cmds = [GCLOUD, '--project', project, '--quiet'] + list(cmds)
  99. if 'list' in cmds:
  100. # Add arguments to shape the list output
  101. cmds.extend([
  102. '--format', 'json', '--filter',
  103. f'creationTimestamp <= {get_expire_timestamp().isoformat()}'
  104. ])
  105. # Executing the gcloud command
  106. logging.debug('Executing: %s', " ".join(cmds))
  107. proc = subprocess.Popen(cmds,
  108. stdout=subprocess.PIPE,
  109. stderr=subprocess.PIPE)
  110. # NOTE(lidiz) the gcloud subprocess won't return unless its output is read
  111. stdout = proc.stdout.read()
  112. stderr = proc.stderr.read()
  113. try:
  114. returncode = proc.wait(timeout=GCLOUD_CMD_TIMEOUT_S)
  115. except subprocess.TimeoutExpired:
  116. logging.error('> Timeout executing cmd [%s]', " ".join(cmds))
  117. return None
  118. if returncode:
  119. logging.error('> Failed to execute cmd [%s], returned %d, stderr: %s',
  120. " ".join(cmds), returncode, stderr)
  121. return None
  122. if stdout:
  123. return json.loads(stdout)
  124. return None
  125. def remove_relative_resources_run_xds_tests(project: str, network: str,
  126. prefix: str, suffix: str):
  127. """Removing GCP resources created by run_xds_tests.py."""
  128. logging.info('----- Removing run_xds_tests.py resources with suffix [%s]',
  129. suffix)
  130. exec_gcloud(project, 'compute', 'forwarding-rules', 'delete',
  131. f'test-forwarding-rule{suffix}', '--global')
  132. exec_gcloud(project, 'compute', 'target-http-proxies', 'delete',
  133. f'test-target-proxy{suffix}')
  134. exec_gcloud(project, 'alpha', 'compute', 'target-grpc-proxies', 'delete',
  135. f'test-target-proxy{suffix}')
  136. exec_gcloud(project, 'compute', 'url-maps', 'delete', f'test-map{suffix}')
  137. exec_gcloud(project, 'compute', 'backend-services', 'delete',
  138. f'test-backend-service{suffix}', '--global')
  139. exec_gcloud(project, 'compute', 'backend-services', 'delete',
  140. f'test-backend-service-alternate{suffix}', '--global')
  141. exec_gcloud(project, 'compute', 'backend-services', 'delete',
  142. f'test-backend-service-extra{suffix}', '--global')
  143. exec_gcloud(project, 'compute', 'backend-services', 'delete',
  144. f'test-backend-service-more-extra{suffix}', '--global')
  145. exec_gcloud(project, 'compute', 'firewall-rules', 'delete',
  146. f'test-fw-rule{suffix}')
  147. exec_gcloud(project, 'compute', 'health-checks', 'delete',
  148. f'test-hc{suffix}')
  149. exec_gcloud(project, 'compute', 'instance-groups', 'managed', 'delete',
  150. f'test-ig{suffix}', '--zone', ZONE)
  151. exec_gcloud(project, 'compute', 'instance-groups', 'managed', 'delete',
  152. f'test-ig-same-zone{suffix}', '--zone', ZONE)
  153. exec_gcloud(project, 'compute', 'instance-groups', 'managed', 'delete',
  154. f'test-ig-secondary-zone{suffix}', '--zone', SECONDARY_ZONE)
  155. exec_gcloud(project, 'compute', 'instance-templates', 'delete',
  156. f'test-template{suffix}')
  157. # cleanup_td creates TrafficDirectorManager (and its varients for security and
  158. # AppNet), and then calls the cleanup() methods.
  159. #
  160. # Note that the varients are all based on the basic TrafficDirectorManager, so
  161. # their `cleanup()` might do duplicate work. But deleting an non-exist resource
  162. # returns 404, and is OK.
  163. def cleanup_td_for_gke(project, network, resource_prefix, resource_suffix):
  164. gcp_api_manager = gcp.api.GcpApiManager()
  165. plain_td = traffic_director.TrafficDirectorManager(
  166. gcp_api_manager,
  167. project=project,
  168. network=network,
  169. resource_prefix=resource_prefix,
  170. resource_suffix=resource_suffix)
  171. security_td = traffic_director.TrafficDirectorSecureManager(
  172. gcp_api_manager,
  173. project=project,
  174. network=network,
  175. resource_prefix=resource_prefix,
  176. resource_suffix=resource_suffix)
  177. # TODO: cleanup appnet resources.
  178. # appnet_td = traffic_director.TrafficDirectorAppNetManager(
  179. # gcp_api_manager,
  180. # project=project,
  181. # network=network,
  182. # resource_prefix=resource_prefix,
  183. # resource_suffix=resource_suffix)
  184. logger.info('----- Removing traffic director for gke, prefix %s, suffix %s',
  185. resource_prefix, resource_suffix)
  186. security_td.cleanup(force=True)
  187. # appnet_td.cleanup(force=True)
  188. plain_td.cleanup(force=True)
  189. # cleanup_client creates a client runner, and calls its cleanup() method.
  190. def cleanup_client(project, network, k8s_api_manager, resource_prefix,
  191. resource_suffix, gcp_service_account):
  192. runner_kwargs = dict(
  193. deployment_name=xds_flags.CLIENT_NAME.value,
  194. image_name=xds_k8s_flags.CLIENT_IMAGE.value,
  195. td_bootstrap_image=xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
  196. gcp_project=project,
  197. gcp_api_manager=gcp.api.GcpApiManager(),
  198. gcp_service_account=gcp_service_account,
  199. xds_server_uri=xds_flags.XDS_SERVER_URI.value,
  200. network=network,
  201. stats_port=xds_flags.CLIENT_PORT.value)
  202. client_namespace = KubernetesClientRunner.make_namespace_name(
  203. resource_prefix, resource_suffix)
  204. client_runner = KubernetesClientRunner(
  205. k8s.KubernetesNamespace(k8s_api_manager, client_namespace),
  206. **runner_kwargs)
  207. logger.info('Cleanup client')
  208. client_runner.cleanup(force=True, force_namespace=True)
  209. # cleanup_server creates a server runner, and calls its cleanup() method.
  210. def cleanup_server(project, network, k8s_api_manager, resource_prefix,
  211. resource_suffix, gcp_service_account):
  212. runner_kwargs = dict(
  213. deployment_name=xds_flags.SERVER_NAME.value,
  214. image_name=xds_k8s_flags.SERVER_IMAGE.value,
  215. td_bootstrap_image=xds_k8s_flags.TD_BOOTSTRAP_IMAGE.value,
  216. gcp_project=project,
  217. gcp_api_manager=gcp.api.GcpApiManager(),
  218. gcp_service_account=gcp_service_account,
  219. network=network)
  220. server_namespace = KubernetesServerRunner.make_namespace_name(
  221. resource_prefix, resource_suffix)
  222. server_runner = KubernetesServerRunner(
  223. k8s.KubernetesNamespace(k8s_api_manager, server_namespace),
  224. **runner_kwargs)
  225. logger.info('Cleanup server')
  226. server_runner.cleanup(force=True, force_namespace=True)
  227. def delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
  228. resources):
  229. for resource in resources:
  230. logger.info('-----')
  231. logger.info('----- Cleaning up resource %s', resource['name'])
  232. if dry_run:
  233. # Skip deletion for dry-runs
  234. logging.info('----- Skipped [Dry Run]: %s', resource['name'])
  235. continue
  236. matched = False
  237. for (regex, resource_prefix, keep, remove) in td_resource_rules:
  238. result = re.search(regex, resource['name'])
  239. if result is not None:
  240. matched = True
  241. if keep(result.group(1)):
  242. logging.info('Skipped [keep]:')
  243. break # break inner loop, continue outer loop
  244. remove(project, network, resource_prefix, result.group(1))
  245. break
  246. if not matched:
  247. logging.info(
  248. '----- Skipped [does not matching resource name templates]')
  249. def delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
  250. k8s_api_manager, gcp_service_account, namespaces):
  251. for ns in namespaces:
  252. logger.info('-----')
  253. logger.info('----- Cleaning up k8s namespaces %s', ns.metadata.name)
  254. if ns.metadata.creation_timestamp <= get_expire_timestamp():
  255. if dry_run:
  256. # Skip deletion for dry-runs
  257. logging.info('----- Skipped [Dry Run]: %s', ns.metadata.name)
  258. continue
  259. matched = False
  260. for (regex, resource_prefix, remove) in k8s_resource_rules:
  261. result = re.search(regex, ns.metadata.name)
  262. if result is not None:
  263. matched = True
  264. remove(project, network, k8s_api_manager, resource_prefix,
  265. result.group(1), gcp_service_account)
  266. break
  267. if not matched:
  268. logging.info(
  269. '----- Skipped [does not matching resource name templates]')
  270. else:
  271. logging.info('----- Skipped [resource is within expiry date]')
  272. def find_and_remove_leaked_k8s_resources(dry_run, project, network,
  273. gcp_service_account):
  274. k8s_resource_rules = [
  275. # items in each tuple, in order
  276. # - regex to match
  277. # - prefix of the resources
  278. # - function to delete the resource
  279. ]
  280. for prefix in CLIENT_PREFIXES.value:
  281. k8s_resource_rules.append(
  282. (f'{prefix}-client-(.*)', prefix, cleanup_client),)
  283. for prefix in SERVER_PREFIXES.value:
  284. k8s_resource_rules.append(
  285. (f'{prefix}-server-(.*)', prefix, cleanup_server),)
  286. # Delete leaked k8s namespaces, those usually mean there are leaked testing
  287. # client/servers from the gke framework.
  288. k8s_api_manager = k8s.KubernetesApiManager(xds_k8s_flags.KUBE_CONTEXT.value)
  289. nss = k8s_api_manager.core.list_namespace()
  290. delete_k8s_resources(dry_run, k8s_resource_rules, project, network,
  291. k8s_api_manager, gcp_service_account, nss.items)
  292. def main(argv):
  293. if len(argv) > 1:
  294. raise app.UsageError('Too many command-line arguments.')
  295. load_keep_config()
  296. project: str = xds_flags.PROJECT.value
  297. network: str = xds_flags.NETWORK.value
  298. gcp_service_account: str = xds_k8s_flags.GCP_SERVICE_ACCOUNT.value
  299. dry_run: bool = DRY_RUN.value
  300. td_resource_rules = [
  301. # itmes in each tuple, in order
  302. # - regex to match
  303. # - prefix of the resource (only used by gke resources)
  304. # - function to check of the resource should be kept
  305. # - function to delete the resource
  306. (r'test-hc(.*)', '', is_marked_as_keep_gce,
  307. remove_relative_resources_run_xds_tests),
  308. (r'test-template(.*)', '', is_marked_as_keep_gce,
  309. remove_relative_resources_run_xds_tests),
  310. ]
  311. for prefix in TD_RESOURCE_PREFIXES.value:
  312. td_resource_rules.append((f'{prefix}-health-check-(.*)', prefix,
  313. is_marked_as_keep_gke, cleanup_td_for_gke),)
  314. # List resources older than KEEP_PERIOD. We only list health-checks and
  315. # instance templates because these are leaves in the resource dependency tree.
  316. #
  317. # E.g. forwarding-rule depends on the target-proxy. So leaked
  318. # forwarding-rule indicates there's a leaked target-proxy (because this
  319. # target proxy cannot deleted unless the forwarding rule is deleted). The
  320. # leaked target-proxy is guaranteed to be a super set of leaked
  321. # forwarding-rule.
  322. compute = gcp.compute.ComputeV1(gcp.api.GcpApiManager(), project)
  323. leakedHealthChecks = []
  324. for item in compute.list_health_check()['items']:
  325. if dateutil.parser.isoparse(
  326. item['creationTimestamp']) <= get_expire_timestamp():
  327. leakedHealthChecks.append(item)
  328. delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
  329. leakedHealthChecks)
  330. # Delete leaked instance templates, those usually mean there are leaked VMs
  331. # from the gce framework. Also note that this is only needed for the gce
  332. # resources.
  333. leakedInstanceTemplates = exec_gcloud(project, 'compute',
  334. 'instance-templates', 'list')
  335. delete_leaked_td_resources(dry_run, td_resource_rules, project, network,
  336. leakedInstanceTemplates)
  337. find_and_remove_leaked_k8s_resources(dry_run, project, network,
  338. gcp_service_account)
  339. if __name__ == '__main__':
  340. app.run(main)