run_xds_tests.py 150 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538
  1. #!/usr/bin/env python3
  2. # Copyright 2020 gRPC authors.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Run xDS integration tests on GCP using Traffic Director."""
  16. import argparse
  17. import datetime
  18. import json
  19. import logging
  20. import os
  21. import random
  22. import re
  23. import shlex
  24. import socket
  25. import subprocess
  26. import sys
  27. import tempfile
  28. import time
  29. import uuid
  30. from google.protobuf import json_format
  31. import googleapiclient.discovery
  32. import grpc
  33. from oauth2client.client import GoogleCredentials
  34. import python_utils.jobset as jobset
  35. import python_utils.report_utils as report_utils
  36. from src.proto.grpc.health.v1 import health_pb2
  37. from src.proto.grpc.health.v1 import health_pb2_grpc
  38. from src.proto.grpc.testing import empty_pb2
  39. from src.proto.grpc.testing import messages_pb2
  40. from src.proto.grpc.testing import test_pb2_grpc
  41. # Envoy protos provided by PyPI package xds-protos
  42. # Needs to import the generated Python file to load descriptors
  43. try:
  44. from envoy.extensions.filters.common.fault.v3 import fault_pb2
  45. from envoy.extensions.filters.http.fault.v3 import fault_pb2
  46. from envoy.extensions.filters.http.router.v3 import router_pb2
  47. from envoy.extensions.filters.network.http_connection_manager.v3 import \
  48. http_connection_manager_pb2
  49. from envoy.service.status.v3 import csds_pb2
  50. from envoy.service.status.v3 import csds_pb2_grpc
  51. except ImportError:
  52. # These protos are required by CSDS test. We should not fail the entire
  53. # script for one test case.
  54. pass
# Root logger: replace any pre-existing handlers with a single console
# handler so all test output shares the "<time>: <level> <message>" format.
logger = logging.getLogger()
console_handler = logging.StreamHandler()
formatter = logging.Formatter(fmt='%(asctime)s: %(levelname)-8s %(message)s')
console_handler.setFormatter(formatter)
logger.handlers = []
logger.addHandler(console_handler)
logger.setLevel(logging.WARNING)
# Suppress excessive logs for gRPC Python
original_grpc_trace = os.environ.pop('GRPC_TRACE', None)
original_grpc_verbosity = os.environ.pop('GRPC_VERBOSITY', None)
# Suppress not-essential logs for GCP clients
logging.getLogger('google_auth_httplib2').setLevel(logging.WARNING)
logging.getLogger('googleapiclient.discovery').setLevel(logging.WARNING)

# Test cases enabled for every language; selected by --test_case=all.
_TEST_CASES = [
    'backends_restart',
    'change_backend_service',
    'gentle_failover',
    'load_report_based_failover',
    'ping_pong',
    'remove_instance_group',
    'round_robin',
    'secondary_locality_gets_no_requests_on_partial_primary_failure',
    'secondary_locality_gets_requests_on_primary_failure',
    'traffic_splitting',
    'path_matching',
    'header_matching',
    'api_listener',
    'forwarding_rule_port_match',
    'forwarding_rule_default_port',
    'metadata_filter',
]
# Valid test cases, but not in all. So the tests can only run manually, and
# aren't enabled automatically for all languages.
#
# TODO: Move them into _TEST_CASES when support is ready in all languages.
_ADDITIONAL_TEST_CASES = [
    'circuit_breaking',
    'timeout',
    'fault_injection',
    'csds',
]
# Test cases that require the V3 API. Skipped in older runs.
_V3_TEST_CASES = frozenset(['timeout', 'fault_injection', 'csds'])
# Test cases that require the alpha API. Skipped for stable API runs.
_ALPHA_TEST_CASES = frozenset(['timeout'])
  100. def parse_test_cases(arg):
  101. if arg == '':
  102. return []
  103. arg_split = arg.split(',')
  104. test_cases = set()
  105. all_test_cases = _TEST_CASES + _ADDITIONAL_TEST_CASES
  106. for arg in arg_split:
  107. if arg == "all":
  108. test_cases = test_cases.union(_TEST_CASES)
  109. else:
  110. test_cases = test_cases.union([arg])
  111. if not all([test_case in all_test_cases for test_case in test_cases]):
  112. raise Exception('Failed to parse test cases %s' % arg)
  113. # Perserve order.
  114. return [x for x in all_test_cases if x in test_cases]
  115. def parse_port_range(port_arg):
  116. try:
  117. port = int(port_arg)
  118. return list(range(port, port + 1))
  119. except:
  120. port_min, port_max = port_arg.split(':')
  121. return list(range(int(port_min), int(port_max) + 1))
# Command-line interface. All GCP resource names, ports and timeouts used by
# the test driver are configured here; `args` is parsed right after this block.
argp = argparse.ArgumentParser(description='Run xDS interop tests on GCP')
# TODO(zdapeng): remove default value of project_id and project_num
argp.add_argument('--project_id', default='grpc-testing', help='GCP project id')
argp.add_argument('--project_num',
                  default='830293263384',
                  help='GCP project number')
argp.add_argument(
    '--gcp_suffix',
    default='',
    help='Optional suffix for all generated GCP resource names. Useful to '
    'ensure distinct names across test runs.')
argp.add_argument(
    '--test_case',
    default='ping_pong',
    type=parse_test_cases,
    help='Comma-separated list of test cases to run. Available tests: %s, '
    '(or \'all\' to run every test). '
    'Alternative tests not included in \'all\': %s' %
    (','.join(_TEST_CASES), ','.join(_ADDITIONAL_TEST_CASES)))
argp.add_argument(
    '--bootstrap_file',
    default='',
    help='File to reference via GRPC_XDS_BOOTSTRAP. Disables built-in '
    'bootstrap generation')
argp.add_argument(
    '--xds_v3_support',
    default=False,
    action='store_true',
    help='Support xDS v3 via GRPC_XDS_EXPERIMENTAL_V3_SUPPORT. '
    'If a pre-created bootstrap file is provided via the --bootstrap_file '
    'parameter, it should include xds_v3 in its server_features field.')
argp.add_argument(
    '--client_cmd',
    default=None,
    help='Command to launch xDS test client. {server_uri}, {stats_port} and '
    '{qps} references will be replaced using str.format(). GRPC_XDS_BOOTSTRAP '
    'will be set for the command')
argp.add_argument(
    '--client_hosts',
    default=None,
    help='Comma-separated list of hosts running client processes. If set, '
    '--client_cmd is ignored and client processes are assumed to be running on '
    'the specified hosts.')
argp.add_argument('--zone', default='us-central1-a')
argp.add_argument('--secondary_zone',
                  default='us-west1-b',
                  help='Zone to use for secondary TD locality tests')
argp.add_argument('--qps', default=100, type=int, help='Client QPS')
argp.add_argument(
    '--wait_for_backend_sec',
    default=1200,
    type=int,
    help='Time limit for waiting for created backend services to report '
    'healthy when launching or updated GCP resources')
argp.add_argument(
    '--use_existing_gcp_resources',
    default=False,
    action='store_true',
    help=
    'If set, find and use already created GCP resources instead of creating new'
    ' ones.')
argp.add_argument(
    '--keep_gcp_resources',
    default=False,
    action='store_true',
    help=
    'Leave GCP VMs and configuration running after test. Default behavior is '
    'to delete when tests complete.')
argp.add_argument('--halt_after_fail',
                  action='store_true',
                  help='Halt and save the resources when test failed.')
argp.add_argument(
    '--compute_discovery_document',
    default=None,
    type=str,
    help=
    'If provided, uses this file instead of retrieving via the GCP discovery '
    'API')
argp.add_argument(
    '--alpha_compute_discovery_document',
    default=None,
    type=str,
    help='If provided, uses this file instead of retrieving via the alpha GCP '
    'discovery API')
argp.add_argument('--network',
                  default='global/networks/default',
                  help='GCP network to use')
_DEFAULT_PORT_RANGE = '8080:8280'
argp.add_argument('--service_port_range',
                  default=_DEFAULT_PORT_RANGE,
                  type=parse_port_range,
                  help='Listening port for created gRPC backends. Specified as '
                  'either a single int or as a range in the format min:max, in '
                  'which case an available port p will be chosen s.t. min <= p '
                  '<= max')
argp.add_argument(
    '--stats_port',
    default=8079,
    type=int,
    help='Local port for the client process to expose the LB stats service')
argp.add_argument('--xds_server',
                  default='trafficdirector.googleapis.com:443',
                  help='xDS server')
argp.add_argument('--source_image',
                  default='projects/debian-cloud/global/images/family/debian-9',
                  help='Source image for VMs created during the test')
argp.add_argument('--path_to_server_binary',
                  default=None,
                  type=str,
                  help='If set, the server binary must already be pre-built on '
                  'the specified source image')
argp.add_argument('--machine_type',
                  default='e2-standard-2',
                  help='Machine type for VMs created during the test')
argp.add_argument(
    '--instance_group_size',
    default=2,
    type=int,
    help='Number of VMs to create per instance group. Certain test cases (e.g., '
    'round_robin) may not give meaningful results if this is set to a value '
    'less than 2.')
argp.add_argument('--verbose',
                  help='verbose log output',
                  default=False,
                  action='store_true')
# TODO(ericgribkoff) Remove this param once the sponge-formatted log files are
# visible in all test environments.
argp.add_argument('--log_client_output',
                  help='Log captured client output',
                  default=False,
                  action='store_true')
# TODO(ericgribkoff) Remove this flag once all test environments are verified to
# have access to the alpha compute APIs.
argp.add_argument('--only_stable_gcp_apis',
                  help='Do not use alpha compute APIs. Some tests may be '
                  'incompatible with this option (gRPC health checks are '
                  'currently alpha and required for simulating server failure',
                  default=False,
                  action='store_true')
args = argp.parse_args()

if args.verbose:
    logger.setLevel(logging.DEBUG)

# Hosts that run the test client; empty means a locally-launched client.
CLIENT_HOSTS = []
if args.client_hosts:
    CLIENT_HOSTS = args.client_hosts.split(',')

# Each of the config propagation in the control plane should finish within 600s.
# Otherwise, it indicates a bug in the control plane. The config propagation
# includes all kinds of traffic config update, like updating urlMap, creating
# the resources for the first time, updating BackendService, and changing the
# status of endpoints in BackendService.
_WAIT_FOR_URL_MAP_PATCH_SEC = 600
# In general, fetching load balancing stats only takes ~10s. However, slow
# config update could lead to empty EDS or similar symptoms causing the
# connection to hang for a long period of time. So, we want to extend the stats
# wait time to be the same as urlMap patch time.
_WAIT_FOR_STATS_SEC = _WAIT_FOR_URL_MAP_PATCH_SEC
_DEFAULT_SERVICE_PORT = 80
_WAIT_FOR_BACKEND_SEC = args.wait_for_backend_sec
_WAIT_FOR_OPERATION_SEC = 1200
_INSTANCE_GROUP_SIZE = args.instance_group_size
# Send 10x the QPS worth of RPCs per verification round.
_NUM_TEST_RPCS = 10 * args.qps
_CONNECTION_TIMEOUT_SEC = 60
_GCP_API_RETRIES = 5
# GRPC_XDS_BOOTSTRAP file contents. Doubled braces are literal JSON braces;
# {node_id} and {server_features} are filled in later via str.format, while
# network name, zone and xDS server come from flags via %-interpolation here.
_BOOTSTRAP_TEMPLATE = """
{{
  "node": {{
    "id": "{node_id}",
    "metadata": {{
      "TRAFFICDIRECTOR_NETWORK_NAME": "%s",
      "com.googleapis.trafficdirector.config_time_trace": "TRUE"
    }},
    "locality": {{
      "zone": "%s"
    }}
  }},
  "xds_servers": [{{
    "server_uri": "%s",
    "channel_creds": [
      {{
        "type": "google_default",
        "config": {{}}
      }}
    ],
    "server_features": {server_features}
  }}]
}}""" % (args.network.split('/')[-1], args.zone, args.xds_server)
# TODO(ericgribkoff) Add change_backend_service to this list once TD no longer
# sends an update with no localities when adding the MIG to the backend service
# can race with the URL map patch.
_TESTS_TO_FAIL_ON_RPC_FAILURE = ['ping_pong', 'round_robin']
# Tests that run UnaryCall and EmptyCall.
_TESTS_TO_RUN_MULTIPLE_RPCS = ['path_matching', 'header_matching']
# Tests that make UnaryCall with test metadata.
_TESTS_TO_SEND_METADATA = ['header_matching']
_TEST_METADATA_KEY = 'xds_md'
_TEST_METADATA_VALUE_UNARY = 'unary_yranu'
_TEST_METADATA_VALUE_EMPTY = 'empty_ytpme'
# Extra RPC metadata whose value is a number, sent with UnaryCall only.
_TEST_METADATA_NUMERIC_KEY = 'xds_md_numeric'
_TEST_METADATA_NUMERIC_VALUE = '159'
# Base names for generated GCP resources; --gcp_suffix is appended elsewhere.
_PATH_MATCHER_NAME = 'path-matcher'
_BASE_TEMPLATE_NAME = 'test-template'
_BASE_INSTANCE_GROUP_NAME = 'test-ig'
_BASE_HEALTH_CHECK_NAME = 'test-hc'
_BASE_FIREWALL_RULE_NAME = 'test-fw-rule'
_BASE_BACKEND_SERVICE_NAME = 'test-backend-service'
_BASE_URL_MAP_NAME = 'test-map'
_BASE_SERVICE_HOST = 'grpc-test'
_BASE_TARGET_PROXY_NAME = 'test-target-proxy'
_BASE_FORWARDING_RULE_NAME = 'test-forwarding-rule'
_TEST_LOG_BASE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                  '../../reports')
_SPONGE_LOG_NAME = 'sponge_log.log'
_SPONGE_XML_NAME = 'sponge_log.xml'
  336. def get_client_stats(num_rpcs, timeout_sec):
  337. if CLIENT_HOSTS:
  338. hosts = CLIENT_HOSTS
  339. else:
  340. hosts = ['localhost']
  341. for host in hosts:
  342. with grpc.insecure_channel('%s:%d' %
  343. (host, args.stats_port)) as channel:
  344. stub = test_pb2_grpc.LoadBalancerStatsServiceStub(channel)
  345. request = messages_pb2.LoadBalancerStatsRequest()
  346. request.num_rpcs = num_rpcs
  347. request.timeout_sec = timeout_sec
  348. rpc_timeout = timeout_sec + _CONNECTION_TIMEOUT_SEC
  349. logger.debug('Invoking GetClientStats RPC to %s:%d:', host,
  350. args.stats_port)
  351. response = stub.GetClientStats(request,
  352. wait_for_ready=True,
  353. timeout=rpc_timeout)
  354. logger.debug('Invoked GetClientStats RPC to %s: %s', host,
  355. json_format.MessageToJson(response))
  356. return response
  357. def get_client_accumulated_stats():
  358. if CLIENT_HOSTS:
  359. hosts = CLIENT_HOSTS
  360. else:
  361. hosts = ['localhost']
  362. for host in hosts:
  363. with grpc.insecure_channel('%s:%d' %
  364. (host, args.stats_port)) as channel:
  365. stub = test_pb2_grpc.LoadBalancerStatsServiceStub(channel)
  366. request = messages_pb2.LoadBalancerAccumulatedStatsRequest()
  367. logger.debug('Invoking GetClientAccumulatedStats RPC to %s:%d:',
  368. host, args.stats_port)
  369. response = stub.GetClientAccumulatedStats(
  370. request, wait_for_ready=True, timeout=_CONNECTION_TIMEOUT_SEC)
  371. logger.debug('Invoked GetClientAccumulatedStats RPC to %s: %s',
  372. host, response)
  373. return response
def get_client_xds_config_dump():
    """Fetch the client's xDS config via CSDS and return it as a dict.

    Returns the single ClientConfig converted to a JSON-style dict, or None
    if the response does not contain exactly one ClientConfig.

    NOTE: relies on csds_pb2/csds_pb2_grpc from the optional xds-protos
    import at the top of the file; only the CSDS test path calls this.
    """
    if CLIENT_HOSTS:
        hosts = CLIENT_HOSTS
    else:
        hosts = ['localhost']
    for host in hosts:
        server_address = '%s:%d' % (host, args.stats_port)
        with grpc.insecure_channel(server_address) as channel:
            stub = csds_pb2_grpc.ClientStatusDiscoveryServiceStub(channel)
            logger.debug('Fetching xDS config dump from %s', server_address)
            response = stub.FetchClientStatus(csds_pb2.ClientStatusRequest(),
                                              wait_for_ready=True,
                                              timeout=_CONNECTION_TIMEOUT_SEC)
            logger.debug('Fetched xDS config dump from %s', server_address)
            if len(response.config) != 1:
                logger.error('Unexpected number of ClientConfigs %d: %s',
                             len(response.config), response)
                return None
            else:
                # Converting the ClientStatusResponse into JSON, because many
                # fields are packed in google.protobuf.Any. It will require many
                # duplicated code to unpack proto message and inspect values.
                return json_format.MessageToDict(
                    response.config[0], preserving_proto_field_name=True)
  398. def configure_client(rpc_types, metadata=[], timeout_sec=None):
  399. if CLIENT_HOSTS:
  400. hosts = CLIENT_HOSTS
  401. else:
  402. hosts = ['localhost']
  403. for host in hosts:
  404. with grpc.insecure_channel('%s:%d' %
  405. (host, args.stats_port)) as channel:
  406. stub = test_pb2_grpc.XdsUpdateClientConfigureServiceStub(channel)
  407. request = messages_pb2.ClientConfigureRequest()
  408. request.types.extend(rpc_types)
  409. for rpc_type, md_key, md_value in metadata:
  410. md = request.metadata.add()
  411. md.type = rpc_type
  412. md.key = md_key
  413. md.value = md_value
  414. if timeout_sec:
  415. request.timeout_sec = timeout_sec
  416. logger.debug(
  417. 'Invoking XdsUpdateClientConfigureService RPC to %s:%d: %s',
  418. host, args.stats_port, request)
  419. stub.Configure(request,
  420. wait_for_ready=True,
  421. timeout=_CONNECTION_TIMEOUT_SEC)
  422. logger.debug('Invoked XdsUpdateClientConfigureService RPC to %s',
  423. host)
class RpcDistributionError(Exception):
    """Raised when RPCs fail to reach the expected backend distribution."""
    pass
  426. def _verify_rpcs_to_given_backends(backends, timeout_sec, num_rpcs,
  427. allow_failures):
  428. start_time = time.time()
  429. error_msg = None
  430. logger.debug('Waiting for %d sec until backends %s receive load' %
  431. (timeout_sec, backends))
  432. while time.time() - start_time <= timeout_sec:
  433. error_msg = None
  434. stats = get_client_stats(num_rpcs, timeout_sec)
  435. rpcs_by_peer = stats.rpcs_by_peer
  436. for backend in backends:
  437. if backend not in rpcs_by_peer:
  438. error_msg = 'Backend %s did not receive load' % backend
  439. break
  440. if not error_msg and len(rpcs_by_peer) > len(backends):
  441. error_msg = 'Unexpected backend received load: %s' % rpcs_by_peer
  442. if not allow_failures and stats.num_failures > 0:
  443. error_msg = '%d RPCs failed' % stats.num_failures
  444. if not error_msg:
  445. return
  446. raise RpcDistributionError(error_msg)
  447. def wait_until_all_rpcs_go_to_given_backends_or_fail(backends,
  448. timeout_sec,
  449. num_rpcs=_NUM_TEST_RPCS):
  450. _verify_rpcs_to_given_backends(backends,
  451. timeout_sec,
  452. num_rpcs,
  453. allow_failures=True)
  454. def wait_until_all_rpcs_go_to_given_backends(backends,
  455. timeout_sec,
  456. num_rpcs=_NUM_TEST_RPCS):
  457. _verify_rpcs_to_given_backends(backends,
  458. timeout_sec,
  459. num_rpcs,
  460. allow_failures=False)
  461. def wait_until_no_rpcs_go_to_given_backends(backends, timeout_sec):
  462. start_time = time.time()
  463. while time.time() - start_time <= timeout_sec:
  464. stats = get_client_stats(_NUM_TEST_RPCS, timeout_sec)
  465. error_msg = None
  466. rpcs_by_peer = stats.rpcs_by_peer
  467. for backend in backends:
  468. if backend in rpcs_by_peer:
  469. error_msg = 'Unexpected backend %s receives load' % backend
  470. break
  471. if not error_msg:
  472. return
  473. raise Exception('Unexpected RPCs going to given backends')
  474. def wait_until_rpcs_in_flight(rpc_type, timeout_sec, num_rpcs, threshold):
  475. '''Block until the test client reaches the state with the given number
  476. of RPCs being outstanding stably.
  477. Args:
  478. rpc_type: A string indicating the RPC method to check for. Either
  479. 'UnaryCall' or 'EmptyCall'.
  480. timeout_sec: Maximum number of seconds to wait until the desired state
  481. is reached.
  482. num_rpcs: Expected number of RPCs to be in-flight.
  483. threshold: Number within [0,100], the tolerable percentage by which
  484. the actual number of RPCs in-flight can differ from the expected number.
  485. '''
  486. if threshold < 0 or threshold > 100:
  487. raise ValueError('Value error: Threshold should be between 0 to 100')
  488. threshold_fraction = threshold / 100.0
  489. start_time = time.time()
  490. error_msg = None
  491. logger.debug(
  492. 'Waiting for %d sec until %d %s RPCs (with %d%% tolerance) in-flight' %
  493. (timeout_sec, num_rpcs, rpc_type, threshold))
  494. while time.time() - start_time <= timeout_sec:
  495. error_msg = _check_rpcs_in_flight(rpc_type, num_rpcs, threshold,
  496. threshold_fraction)
  497. if error_msg:
  498. logger.debug('Progress: %s', error_msg)
  499. time.sleep(2)
  500. else:
  501. break
  502. # Ensure the number of outstanding RPCs is stable.
  503. if not error_msg:
  504. time.sleep(5)
  505. error_msg = _check_rpcs_in_flight(rpc_type, num_rpcs, threshold,
  506. threshold_fraction)
  507. if error_msg:
  508. raise Exception("Wrong number of %s RPCs in-flight: %s" %
  509. (rpc_type, error_msg))
  510. def _check_rpcs_in_flight(rpc_type, num_rpcs, threshold, threshold_fraction):
  511. error_msg = None
  512. stats = get_client_accumulated_stats()
  513. rpcs_started = stats.num_rpcs_started_by_method[rpc_type]
  514. rpcs_succeeded = stats.num_rpcs_succeeded_by_method[rpc_type]
  515. rpcs_failed = stats.num_rpcs_failed_by_method[rpc_type]
  516. rpcs_in_flight = rpcs_started - rpcs_succeeded - rpcs_failed
  517. if rpcs_in_flight < (num_rpcs * (1 - threshold_fraction)):
  518. error_msg = ('actual(%d) < expected(%d - %d%%)' %
  519. (rpcs_in_flight, num_rpcs, threshold))
  520. elif rpcs_in_flight > (num_rpcs * (1 + threshold_fraction)):
  521. error_msg = ('actual(%d) > expected(%d + %d%%)' %
  522. (rpcs_in_flight, num_rpcs, threshold))
  523. return error_msg
  524. def compare_distributions(actual_distribution, expected_distribution,
  525. threshold):
  526. """Compare if two distributions are similar.
  527. Args:
  528. actual_distribution: A list of floats, contains the actual distribution.
  529. expected_distribution: A list of floats, contains the expected distribution.
  530. threshold: Number within [0,100], the threshold percentage by which the
  531. actual distribution can differ from the expected distribution.
  532. Returns:
  533. The similarity between the distributions as a boolean. Returns true if the
  534. actual distribution lies within the threshold of the expected
  535. distribution, false otherwise.
  536. Raises:
  537. ValueError: if threshold is not with in [0,100].
  538. Exception: containing detailed error messages.
  539. """
  540. if len(expected_distribution) != len(actual_distribution):
  541. raise Exception(
  542. 'Error: expected and actual distributions have different size (%d vs %d)'
  543. % (len(expected_distribution), len(actual_distribution)))
  544. if threshold < 0 or threshold > 100:
  545. raise ValueError('Value error: Threshold should be between 0 to 100')
  546. threshold_fraction = threshold / 100.0
  547. for expected, actual in zip(expected_distribution, actual_distribution):
  548. if actual < (expected * (1 - threshold_fraction)):
  549. raise Exception("actual(%f) < expected(%f-%d%%)" %
  550. (actual, expected, threshold))
  551. if actual > (expected * (1 + threshold_fraction)):
  552. raise Exception("actual(%f) > expected(%f+%d%%)" %
  553. (actual, expected, threshold))
  554. return True
  555. def compare_expected_instances(stats, expected_instances):
  556. """Compare if stats have expected instances for each type of RPC.
  557. Args:
  558. stats: LoadBalancerStatsResponse reported by interop client.
  559. expected_instances: a dict with key as the RPC type (string), value as
  560. the expected backend instances (list of strings).
  561. Returns:
  562. Returns true if the instances are expected. False if not.
  563. """
  564. for rpc_type, expected_peers in list(expected_instances.items()):
  565. rpcs_by_peer_for_type = stats.rpcs_by_method[rpc_type]
  566. rpcs_by_peer = rpcs_by_peer_for_type.rpcs_by_peer if rpcs_by_peer_for_type else None
  567. logger.debug('rpc: %s, by_peer: %s', rpc_type, rpcs_by_peer)
  568. peers = list(rpcs_by_peer.keys())
  569. if set(peers) != set(expected_peers):
  570. logger.info('unexpected peers for %s, got %s, want %s', rpc_type,
  571. peers, expected_peers)
  572. return False
  573. return True
def test_backends_restart(gcp, backend_service, instance_group):
    """Verify the client recovers after all backends go away and come back.

    Resizes the instance group to zero (expecting no backend to receive
    load), then restores the original size and checks that traffic returns
    to the (possibly re-created) instances.
    """
    logger.info('Running test_backends_restart')
    instance_names = get_instance_names(gcp, instance_group)
    num_instances = len(instance_names)
    start_time = time.time()  # NOTE(review): recorded but never read here
    wait_until_all_rpcs_go_to_given_backends(instance_names,
                                             _WAIT_FOR_STATS_SEC)
    try:
        resize_instance_group(gcp, instance_group, 0)
        # With zero instances, RPC failures are expected; [] means no backend
        # should be receiving load.
        wait_until_all_rpcs_go_to_given_backends_or_fail([],
                                                         _WAIT_FOR_BACKEND_SEC)
    finally:
        resize_instance_group(gcp, instance_group, num_instances)
        wait_for_healthy_backends(gcp, backend_service, instance_group)
        # Resizing may create brand-new instances, so re-read their names.
        new_instance_names = get_instance_names(gcp, instance_group)
        wait_until_all_rpcs_go_to_given_backends(new_instance_names,
                                                 _WAIT_FOR_BACKEND_SEC)
def test_change_backend_service(gcp, original_backend_service, instance_group,
                                alternate_backend_service,
                                same_zone_instance_group):
    """Verify traffic follows a URL-map switch to a different backend service.

    Starts with all traffic on the original service, patches the URL map to
    point at the alternate service, and waits for traffic to move to the
    alternate backends. The URL map and alternate service are restored in
    the finally block (unless the test failed and --halt_after_fail is set).
    """
    logger.info('Running test_change_backend_service')
    original_backend_instances = get_instance_names(gcp, instance_group)
    alternate_backend_instances = get_instance_names(gcp,
                                                     same_zone_instance_group)
    patch_backend_service(gcp, alternate_backend_service,
                          [same_zone_instance_group])
    wait_for_healthy_backends(gcp, original_backend_service, instance_group)
    wait_for_healthy_backends(gcp, alternate_backend_service,
                              same_zone_instance_group)
    wait_until_all_rpcs_go_to_given_backends(original_backend_instances,
                                             _WAIT_FOR_STATS_SEC)
    passed = True
    try:
        patch_url_map_backend_service(gcp, alternate_backend_service)
        wait_until_all_rpcs_go_to_given_backends(alternate_backend_instances,
                                                 _WAIT_FOR_URL_MAP_PATCH_SEC)
    except Exception:
        passed = False
        raise
    finally:
        # Restore the original configuration, unless the operator asked to
        # halt after a failure for debugging.
        if passed or not args.halt_after_fail:
            patch_url_map_backend_service(gcp, original_backend_service)
            patch_backend_service(gcp, alternate_backend_service, [])
def test_gentle_failover(gcp,
                         backend_service,
                         primary_instance_group,
                         secondary_instance_group,
                         swapped_primary_and_secondary=False):
    """Verify partial primary failure shifts load to the secondary locality.

    Stops serving on all but one primary instance, then expects traffic on
    the remaining primary instance plus the secondary instances. If the
    distribution check fails and Traffic Director actually considers the
    "secondary" group primary, the test retries once with the groups swapped.

    Args:
        gcp: GcpState instance with project resources.
        backend_service: backend service to patch with both instance groups.
        primary_instance_group: instance group expected to be primary.
        secondary_instance_group: instance group expected to be secondary.
        swapped_primary_and_secondary: True on the internal retry with the
            two groups exchanged; prevents infinite recursion.
    """
    logger.info('Running test_gentle_failover')
    num_primary_instances = len(get_instance_names(gcp, primary_instance_group))
    min_instances_for_gentle_failover = 3  # Need >50% failure to start failover
    passed = True
    try:
        # Ensure enough primary instances so stopping all-but-one crosses
        # the >50% unhealthy threshold that triggers failover.
        if num_primary_instances < min_instances_for_gentle_failover:
            resize_instance_group(gcp, primary_instance_group,
                                  min_instances_for_gentle_failover)
        patch_backend_service(
            gcp, backend_service,
            [primary_instance_group, secondary_instance_group])
        primary_instance_names = get_instance_names(gcp, primary_instance_group)
        secondary_instance_names = get_instance_names(gcp,
                                                      secondary_instance_group)
        wait_for_healthy_backends(gcp, backend_service, primary_instance_group)
        wait_for_healthy_backends(gcp, backend_service,
                                  secondary_instance_group)
        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
                                                 _WAIT_FOR_STATS_SEC)
        # Stop serving on all primary instances except the last one.
        instances_to_stop = primary_instance_names[:-1]
        remaining_instances = primary_instance_names[-1:]
        try:
            set_serving_status(instances_to_stop,
                               gcp.service_port,
                               serving=False)
            wait_until_all_rpcs_go_to_given_backends(
                remaining_instances + secondary_instance_names,
                _WAIT_FOR_BACKEND_SEC)
        finally:
            set_serving_status(primary_instance_names,
                               gcp.service_port,
                               serving=True)
    except RpcDistributionError as e:
        if not swapped_primary_and_secondary and is_primary_instance_group(
                gcp, secondary_instance_group):
            # Swap expectation of primary and secondary instance groups.
            test_gentle_failover(gcp,
                                 backend_service,
                                 secondary_instance_group,
                                 primary_instance_group,
                                 swapped_primary_and_secondary=True)
        else:
            passed = False
            raise e
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            # Restore the backend service and the original group size.
            patch_backend_service(gcp, backend_service,
                                  [primary_instance_group])
            resize_instance_group(gcp, primary_instance_group,
                                  num_primary_instances)
            instance_names = get_instance_names(gcp, primary_instance_group)
            wait_until_all_rpcs_go_to_given_backends(instance_names,
                                                     _WAIT_FOR_BACKEND_SEC)
def test_load_report_based_failover(gcp, backend_service,
                                    primary_instance_group,
                                    secondary_instance_group):
    """Verify failover driven by RATE balancing-mode capacity limits.

    With the primary locality capped at 20% of the client QPS, traffic must
    spill over to the secondary locality; with the cap raised to 120%, all
    traffic must return to the primary locality alone.
    """
    logger.info('Running test_load_report_based_failover')
    passed = True
    try:
        patch_backend_service(
            gcp, backend_service,
            [primary_instance_group, secondary_instance_group])
        primary_instance_names = get_instance_names(gcp, primary_instance_group)
        secondary_instance_names = get_instance_names(gcp,
                                                      secondary_instance_group)
        wait_for_healthy_backends(gcp, backend_service, primary_instance_group)
        wait_for_healthy_backends(gcp, backend_service,
                                  secondary_instance_group)
        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
                                                 _WAIT_FOR_STATS_SEC)
        # Set primary locality's balance mode to RATE, and RPS to 20% of the
        # client's QPS. The secondary locality will be used.
        max_rate = int(args.qps * 1 / 5)
        logger.info('Patching backend service to RATE with %d max_rate',
                    max_rate)
        patch_backend_service(
            gcp,
            backend_service, [primary_instance_group, secondary_instance_group],
            balancing_mode='RATE',
            max_rate=max_rate)
        wait_until_all_rpcs_go_to_given_backends(
            primary_instance_names + secondary_instance_names,
            _WAIT_FOR_BACKEND_SEC)
        # Set primary locality's balance mode to RATE, and RPS to 120% of the
        # client's QPS. Only the primary locality will be used.
        max_rate = int(args.qps * 6 / 5)
        logger.info('Patching backend service to RATE with %d max_rate',
                    max_rate)
        patch_backend_service(
            gcp,
            backend_service, [primary_instance_group, secondary_instance_group],
            balancing_mode='RATE',
            max_rate=max_rate)
        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
                                                 _WAIT_FOR_BACKEND_SEC)
        logger.info("success")
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            # Restore: primary group only, default balancing mode.
            patch_backend_service(gcp, backend_service,
                                  [primary_instance_group])
            instance_names = get_instance_names(gcp, primary_instance_group)
            wait_until_all_rpcs_go_to_given_backends(instance_names,
                                                     _WAIT_FOR_BACKEND_SEC)
  731. def test_ping_pong(gcp, backend_service, instance_group):
  732. logger.info('Running test_ping_pong')
  733. wait_for_healthy_backends(gcp, backend_service, instance_group)
  734. instance_names = get_instance_names(gcp, instance_group)
  735. wait_until_all_rpcs_go_to_given_backends(instance_names,
  736. _WAIT_FOR_STATS_SEC)
def test_remove_instance_group(gcp, backend_service, instance_group,
                               same_zone_instance_group):
    """Verify traffic moves to the remaining group after one group is removed.

    Adds both instance groups to the backend service, determines which group
    will remain after removal (traffic may already be pinned to one group if
    the client connected to Traffic Director in a different zone), removes
    the other group, and checks all traffic lands on the remaining group.
    """
    logger.info('Running test_remove_instance_group')
    passed = True
    try:
        patch_backend_service(gcp,
                              backend_service,
                              [instance_group, same_zone_instance_group],
                              balancing_mode='RATE')
        wait_for_healthy_backends(gcp, backend_service, instance_group)
        wait_for_healthy_backends(gcp, backend_service,
                                  same_zone_instance_group)
        instance_names = get_instance_names(gcp, instance_group)
        same_zone_instance_names = get_instance_names(gcp,
                                                      same_zone_instance_group)
        try:
            # Normal case: both groups receive load; plan to remove
            # instance_group and keep same_zone_instance_group.
            wait_until_all_rpcs_go_to_given_backends(
                instance_names + same_zone_instance_names,
                _WAIT_FOR_OPERATION_SEC)
            remaining_instance_group = same_zone_instance_group
            remaining_instance_names = same_zone_instance_names
        except RpcDistributionError as e:
            # If connected to TD in a different zone, we may route traffic to
            # only one instance group. Determine which group that is to continue
            # with the remainder of the test case.
            try:
                wait_until_all_rpcs_go_to_given_backends(
                    instance_names, _WAIT_FOR_STATS_SEC)
                remaining_instance_group = same_zone_instance_group
                remaining_instance_names = same_zone_instance_names
            except RpcDistributionError as e:
                wait_until_all_rpcs_go_to_given_backends(
                    same_zone_instance_names, _WAIT_FOR_STATS_SEC)
                remaining_instance_group = instance_group
                remaining_instance_names = instance_names
        # Remove the non-remaining group and expect all traffic on the
        # remaining one.
        patch_backend_service(gcp,
                              backend_service, [remaining_instance_group],
                              balancing_mode='RATE')
        wait_until_all_rpcs_go_to_given_backends(remaining_instance_names,
                                                 _WAIT_FOR_BACKEND_SEC)
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            patch_backend_service(gcp, backend_service, [instance_group])
            wait_until_all_rpcs_go_to_given_backends(instance_names,
                                                     _WAIT_FOR_BACKEND_SEC)
def test_round_robin(gcp, backend_service, instance_group):
    """Verify RPCs are distributed (near-)evenly across all backends.

    Each backend's request count must be within `threshold` of the ideal
    per-backend share of _NUM_TEST_RPCS.
    """
    logger.info('Running test_round_robin')
    wait_for_healthy_backends(gcp, backend_service, instance_group)
    instance_names = get_instance_names(gcp, instance_group)
    threshold = 1  # max allowed deviation, in RPCs, from the even share
    wait_until_all_rpcs_go_to_given_backends(instance_names,
                                             _WAIT_FOR_STATS_SEC)
    # TODO(ericgribkoff) Delayed config propagation from earlier tests
    # may result in briefly receiving an empty EDS update, resulting in failed
    # RPCs. Retry distribution validation if this occurs; long-term fix is
    # creating new backend resources for each individual test case.
    # Each attempt takes 10 seconds. Config propagation can take several
    # minutes.
    max_attempts = 40
    for i in range(max_attempts):
        stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
        requests_received = [stats.rpcs_by_peer[x] for x in stats.rpcs_by_peer]
        total_requests_received = sum(requests_received)
        if total_requests_received != _NUM_TEST_RPCS:
            # Some RPCs failed (see TODO above); sample again.
            logger.info('Unexpected RPC failures, retrying: %s', stats)
            continue
        expected_requests = total_requests_received / len(instance_names)
        for instance in instance_names:
            if abs(stats.rpcs_by_peer[instance] -
                   expected_requests) > threshold:
                raise Exception(
                    'RPC peer distribution differs from expected by more than %d '
                    'for instance %s (%s)' % (threshold, instance, stats))
        return
    raise Exception('RPC failures persisted through %d retries' % max_attempts)
def test_secondary_locality_gets_no_requests_on_partial_primary_failure(
        gcp,
        backend_service,
        primary_instance_group,
        secondary_instance_group,
        swapped_primary_and_secondary=False):
    """Verify a partial primary outage does NOT shift load to the secondary.

    Stops serving on a single primary instance and expects all traffic to
    stay on the remaining primary instances. If the check fails and the
    "secondary" group is actually primary, retries once with the groups
    swapped (guarded by swapped_primary_and_secondary).
    """
    logger.info(
        'Running secondary_locality_gets_no_requests_on_partial_primary_failure'
    )
    passed = True
    try:
        patch_backend_service(
            gcp, backend_service,
            [primary_instance_group, secondary_instance_group])
        wait_for_healthy_backends(gcp, backend_service, primary_instance_group)
        wait_for_healthy_backends(gcp, backend_service,
                                  secondary_instance_group)
        primary_instance_names = get_instance_names(gcp, primary_instance_group)
        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
                                                 _WAIT_FOR_STATS_SEC)
        # Stop just one primary instance; the rest must absorb the load.
        instances_to_stop = primary_instance_names[:1]
        remaining_instances = primary_instance_names[1:]
        try:
            set_serving_status(instances_to_stop,
                               gcp.service_port,
                               serving=False)
            wait_until_all_rpcs_go_to_given_backends(remaining_instances,
                                                     _WAIT_FOR_BACKEND_SEC)
        finally:
            set_serving_status(primary_instance_names,
                               gcp.service_port,
                               serving=True)
    except RpcDistributionError as e:
        if not swapped_primary_and_secondary and is_primary_instance_group(
                gcp, secondary_instance_group):
            # Swap expectation of primary and secondary instance groups.
            test_secondary_locality_gets_no_requests_on_partial_primary_failure(
                gcp,
                backend_service,
                secondary_instance_group,
                primary_instance_group,
                swapped_primary_and_secondary=True)
        else:
            passed = False
            raise e
    finally:
        if passed or not args.halt_after_fail:
            patch_backend_service(gcp, backend_service,
                                  [primary_instance_group])
def test_secondary_locality_gets_requests_on_primary_failure(
        gcp,
        backend_service,
        primary_instance_group,
        secondary_instance_group,
        swapped_primary_and_secondary=False):
    """Verify a full primary outage shifts all load to the secondary locality.

    Stops serving on every primary instance and expects all traffic on the
    secondary instances. If the check fails and the "secondary" group is
    actually primary, retries once with the groups swapped (guarded by
    swapped_primary_and_secondary).
    """
    logger.info('Running secondary_locality_gets_requests_on_primary_failure')
    passed = True
    try:
        patch_backend_service(
            gcp, backend_service,
            [primary_instance_group, secondary_instance_group])
        wait_for_healthy_backends(gcp, backend_service, primary_instance_group)
        wait_for_healthy_backends(gcp, backend_service,
                                  secondary_instance_group)
        primary_instance_names = get_instance_names(gcp, primary_instance_group)
        secondary_instance_names = get_instance_names(gcp,
                                                      secondary_instance_group)
        wait_until_all_rpcs_go_to_given_backends(primary_instance_names,
                                                 _WAIT_FOR_STATS_SEC)
        try:
            # Take down the entire primary locality.
            set_serving_status(primary_instance_names,
                               gcp.service_port,
                               serving=False)
            wait_until_all_rpcs_go_to_given_backends(secondary_instance_names,
                                                     _WAIT_FOR_BACKEND_SEC)
        finally:
            set_serving_status(primary_instance_names,
                               gcp.service_port,
                               serving=True)
    except RpcDistributionError as e:
        if not swapped_primary_and_secondary and is_primary_instance_group(
                gcp, secondary_instance_group):
            # Swap expectation of primary and secondary instance groups.
            test_secondary_locality_gets_requests_on_primary_failure(
                gcp,
                backend_service,
                secondary_instance_group,
                primary_instance_group,
                swapped_primary_and_secondary=True)
        else:
            passed = False
            raise e
    finally:
        if passed or not args.halt_after_fail:
            patch_backend_service(gcp, backend_service,
                                  [primary_instance_group])
def prepare_services_for_urlmap_tests(gcp, original_backend_service,
                                      instance_group, alternate_backend_service,
                                      same_zone_instance_group):
    """Prepare both backend services so URL-map tests can start cleanly.

    Attaches the alternate instance group to the alternate backend service,
    waits for both services' backends to become healthy, and waits for all
    traffic to flow to the original backends.

    Returns:
        A (original_backend_instances, alternate_backend_instances) tuple of
        instance-name lists (strings).
    """
    logger.info('waiting for original backends to become healthy')
    wait_for_healthy_backends(gcp, original_backend_service, instance_group)

    patch_backend_service(gcp, alternate_backend_service,
                          [same_zone_instance_group])
    logger.info('waiting for alternate to become healthy')
    wait_for_healthy_backends(gcp, alternate_backend_service,
                              same_zone_instance_group)

    original_backend_instances = get_instance_names(gcp, instance_group)
    logger.info('original backends instances: %s', original_backend_instances)

    alternate_backend_instances = get_instance_names(gcp,
                                                     same_zone_instance_group)
    logger.info('alternate backends instances: %s', alternate_backend_instances)

    # Start with all traffic going to original_backend_service.
    logger.info('waiting for traffic to all go to original backends')
    wait_until_all_rpcs_go_to_given_backends(original_backend_instances,
                                             _WAIT_FOR_STATS_SEC)
    return original_backend_instances, alternate_backend_instances
def test_metadata_filter(gcp, original_backend_service, instance_group,
                         alternate_backend_service, same_zone_instance_group):
    """Verify URL-map metadataFilters (MATCH_ALL / MATCH_ANY) route correctly.

    Builds match labels from the client's xDS node metadata (read from the
    bootstrap file) plus a deliberately non-matching label, then applies four
    route-rule configurations. In each configuration the non-matching rule
    points at the original service and the matching rule at the alternate
    service, so traffic must move entirely to the alternate backends.
    """
    logger.info("Running test_metadata_filter")
    wait_for_healthy_backends(gcp, original_backend_service, instance_group)
    original_backend_instances = get_instance_names(gcp, instance_group)
    alternate_backend_instances = get_instance_names(gcp,
                                                     same_zone_instance_group)
    patch_backend_service(gcp, alternate_backend_service,
                          [same_zone_instance_group])
    wait_for_healthy_backends(gcp, alternate_backend_service,
                              same_zone_instance_group)
    passed = True
    try:
        # Labels that match this client come straight from the node metadata
        # in its xDS bootstrap file.
        with open(bootstrap_path) as f:
            md = json.load(f)['node']['metadata']
            match_labels = []
            for k, v in list(md.items()):
                match_labels.append({'name': k, 'value': v})

        not_match_labels = [{'name': 'fake', 'value': 'fail'}]
        test_route_rules = [
            # test MATCH_ALL
            [
                {
                    'priority': 0,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ALL',
                            'filterLabels': not_match_labels
                        }]
                    }],
                    'service': original_backend_service.url
                },
                {
                    'priority': 1,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ALL',
                            'filterLabels': match_labels
                        }]
                    }],
                    'service': alternate_backend_service.url
                },
            ],
            # test mixing MATCH_ALL and MATCH_ANY
            # test MATCH_ALL: super set labels won't match
            [
                {
                    'priority': 0,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ALL',
                            'filterLabels': not_match_labels + match_labels
                        }]
                    }],
                    'service': original_backend_service.url
                },
                {
                    'priority': 1,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ANY',
                            'filterLabels': not_match_labels + match_labels
                        }]
                    }],
                    'service': alternate_backend_service.url
                },
            ],
            # test MATCH_ANY
            [
                {
                    'priority': 0,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ANY',
                            'filterLabels': not_match_labels
                        }]
                    }],
                    'service': original_backend_service.url
                },
                {
                    'priority': 1,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ANY',
                            'filterLabels': not_match_labels + match_labels
                        }]
                    }],
                    'service': alternate_backend_service.url
                },
            ],
            # test match multiple route rules
            [
                {
                    'priority': 0,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ANY',
                            'filterLabels': match_labels
                        }]
                    }],
                    'service': alternate_backend_service.url
                },
                {
                    'priority': 1,
                    'matchRules': [{
                        'prefixMatch': '/',
                        'metadataFilters': [{
                            'filterMatchCriteria': 'MATCH_ALL',
                            'filterLabels': match_labels
                        }]
                    }],
                    'service': original_backend_service.url
                },
            ]
        ]
        for route_rules in test_route_rules:
            # Reset: all traffic back on the original backends before
            # applying the next rule set.
            wait_until_all_rpcs_go_to_given_backends(original_backend_instances,
                                                     _WAIT_FOR_STATS_SEC)
            patch_url_map_backend_service(gcp,
                                          original_backend_service,
                                          route_rules=route_rules)
            wait_until_no_rpcs_go_to_given_backends(original_backend_instances,
                                                    _WAIT_FOR_STATS_SEC)
            wait_until_all_rpcs_go_to_given_backends(
                alternate_backend_instances, _WAIT_FOR_STATS_SEC)
            patch_url_map_backend_service(gcp, original_backend_service)
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            patch_backend_service(gcp, alternate_backend_service, [])
  1084. def test_api_listener(gcp, backend_service, instance_group,
  1085. alternate_backend_service):
  1086. logger.info("Running api_listener")
  1087. passed = True
  1088. try:
  1089. wait_for_healthy_backends(gcp, backend_service, instance_group)
  1090. backend_instances = get_instance_names(gcp, instance_group)
  1091. wait_until_all_rpcs_go_to_given_backends(backend_instances,
  1092. _WAIT_FOR_STATS_SEC)
  1093. # create a second suite of map+tp+fr with the same host name in host rule
  1094. # and we have to disable proxyless validation because it needs `0.0.0.0`
  1095. # ip address in fr for proxyless and also we violate ip:port uniqueness
  1096. # for test purpose. See https://github.com/grpc/grpc-java/issues/8009
  1097. new_config_suffix = '2'
  1098. url_map_2 = create_url_map(gcp, url_map_name + new_config_suffix,
  1099. backend_service, service_host_name)
  1100. target_proxy_2 = create_target_proxy(
  1101. gcp, target_proxy_name + new_config_suffix, False, url_map_2)
  1102. if not gcp.service_port:
  1103. raise Exception(
  1104. 'Faied to find a valid port for the forwarding rule')
  1105. potential_ip_addresses = []
  1106. max_attempts = 10
  1107. for i in range(max_attempts):
  1108. potential_ip_addresses.append('10.10.10.%d' %
  1109. (random.randint(0, 255)))
  1110. create_global_forwarding_rule(gcp,
  1111. forwarding_rule_name + new_config_suffix,
  1112. [gcp.service_port],
  1113. potential_ip_addresses, target_proxy_2)
  1114. if gcp.service_port != _DEFAULT_SERVICE_PORT:
  1115. patch_url_map_host_rule_with_port(gcp,
  1116. url_map_name + new_config_suffix,
  1117. backend_service,
  1118. service_host_name)
  1119. wait_until_all_rpcs_go_to_given_backends(backend_instances,
  1120. _WAIT_FOR_STATS_SEC)
  1121. delete_global_forwarding_rule(gcp, gcp.global_forwarding_rules[0])
  1122. delete_target_proxy(gcp, gcp.target_proxies[0])
  1123. delete_url_map(gcp, gcp.url_maps[0])
  1124. verify_attempts = int(_WAIT_FOR_URL_MAP_PATCH_SEC / _NUM_TEST_RPCS *
  1125. args.qps)
  1126. for i in range(verify_attempts):
  1127. wait_until_all_rpcs_go_to_given_backends(backend_instances,
  1128. _WAIT_FOR_STATS_SEC)
  1129. # delete host rule for the original host name
  1130. patch_url_map_backend_service(gcp, alternate_backend_service)
  1131. wait_until_no_rpcs_go_to_given_backends(backend_instances,
  1132. _WAIT_FOR_STATS_SEC)
  1133. except Exception:
  1134. passed = False
  1135. raise
  1136. finally:
  1137. if passed or not args.halt_after_fail:
  1138. delete_global_forwarding_rules(gcp)
  1139. delete_target_proxies(gcp)
  1140. delete_url_maps(gcp)
  1141. create_url_map(gcp, url_map_name, backend_service,
  1142. service_host_name)
  1143. create_target_proxy(gcp, target_proxy_name)
  1144. create_global_forwarding_rule(gcp, forwarding_rule_name,
  1145. potential_service_ports)
  1146. if gcp.service_port != _DEFAULT_SERVICE_PORT:
  1147. patch_url_map_host_rule_with_port(gcp, url_map_name,
  1148. backend_service,
  1149. service_host_name)
  1150. server_uri = service_host_name + ':' + str(gcp.service_port)
  1151. else:
  1152. server_uri = service_host_name
  1153. return server_uri
def test_forwarding_rule_port_match(gcp, backend_service, instance_group):
    """Verify RPCs stop when the forwarding rule's port no longer matches.

    Recreates the forwarding rule on every default-range port EXCEPT the one
    the client targets, then expects the backends to receive no traffic. The
    original forwarding rule is restored in the finally block.

    Returns:
        The server URI (host[:port]) clients should target after restore, or
        None when cleanup is skipped due to --halt_after_fail.
    """
    logger.info("Running test_forwarding_rule_port_match")
    passed = True
    try:
        wait_for_healthy_backends(gcp, backend_service, instance_group)
        backend_instances = get_instance_names(gcp, instance_group)
        wait_until_all_rpcs_go_to_given_backends(backend_instances,
                                                 _WAIT_FOR_STATS_SEC)
        delete_global_forwarding_rules(gcp)
        # Recreate on every port except the one the client is using.
        create_global_forwarding_rule(gcp, forwarding_rule_name, [
            x for x in parse_port_range(_DEFAULT_PORT_RANGE)
            if x != gcp.service_port
        ])
        wait_until_no_rpcs_go_to_given_backends(backend_instances,
                                                _WAIT_FOR_STATS_SEC)
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            delete_global_forwarding_rules(gcp)
            create_global_forwarding_rule(gcp, forwarding_rule_name,
                                          potential_service_ports)
            if gcp.service_port != _DEFAULT_SERVICE_PORT:
                patch_url_map_host_rule_with_port(gcp, url_map_name,
                                                  backend_service,
                                                  service_host_name)
                server_uri = service_host_name + ':' + str(gcp.service_port)
            else:
                server_uri = service_host_name
            return server_uri
def test_forwarding_rule_default_port(gcp, backend_service, instance_group):
    """Verify port-omitted target URIs only work with the default port 80.

    When the client URI has no port: RPCs must fail while the url-map names a
    non-default port, succeed once the forwarding rule listens on port 80
    with no port in the url-map host rule, and fail again when a port is put
    back into the url-map. Resources are recreated in the finally block.

    Returns:
        The server URI (host[:port]) clients should target after restore, or
        None when cleanup is skipped due to --halt_after_fail.
    """
    logger.info("Running test_forwarding_rule_default_port")
    passed = True
    try:
        wait_for_healthy_backends(gcp, backend_service, instance_group)
        backend_instances = get_instance_names(gcp, instance_group)
        if gcp.service_port == _DEFAULT_SERVICE_PORT:
            wait_until_all_rpcs_go_to_given_backends(backend_instances,
                                                     _WAIT_FOR_STATS_SEC)
            delete_global_forwarding_rules(gcp)
            create_global_forwarding_rule(gcp, forwarding_rule_name,
                                          parse_port_range(_DEFAULT_PORT_RANGE))
            patch_url_map_host_rule_with_port(gcp, url_map_name,
                                              backend_service,
                                              service_host_name)
        # NOTE(review): reconstructed as running unconditionally (outside the
        # `if` above) — confirm against upstream indentation.
        wait_until_no_rpcs_go_to_given_backends(backend_instances,
                                                _WAIT_FOR_STATS_SEC)
        # expect success when no port in client request service uri, and no port in url-map
        delete_global_forwarding_rule(gcp, gcp.global_forwarding_rules[0])
        delete_target_proxy(gcp, gcp.target_proxies[0])
        delete_url_map(gcp, gcp.url_maps[0])
        create_url_map(gcp, url_map_name, backend_service, service_host_name)
        create_target_proxy(gcp, target_proxy_name, False)
        potential_ip_addresses = []
        max_attempts = 10
        for i in range(max_attempts):
            potential_ip_addresses.append('10.10.10.%d' %
                                          (random.randint(0, 255)))
        create_global_forwarding_rule(gcp, forwarding_rule_name, [80],
                                      potential_ip_addresses)
        wait_until_all_rpcs_go_to_given_backends(backend_instances,
                                                 _WAIT_FOR_STATS_SEC)

        # expect failure when no port in client request uri, but specify port in url-map
        patch_url_map_host_rule_with_port(gcp, url_map_name, backend_service,
                                          service_host_name)
        wait_until_no_rpcs_go_to_given_backends(backend_instances,
                                                _WAIT_FOR_STATS_SEC)
    except Exception:
        passed = False
        raise
    finally:
        if passed or not args.halt_after_fail:
            delete_global_forwarding_rules(gcp)
            delete_target_proxies(gcp)
            delete_url_maps(gcp)
            create_url_map(gcp, url_map_name, backend_service,
                           service_host_name)
            create_target_proxy(gcp, target_proxy_name)
            create_global_forwarding_rule(gcp, forwarding_rule_name,
                                          potential_service_ports)
            if gcp.service_port != _DEFAULT_SERVICE_PORT:
                patch_url_map_host_rule_with_port(gcp, url_map_name,
                                                  backend_service,
                                                  service_host_name)
                server_uri = service_host_name + ':' + str(gcp.service_port)
            else:
                server_uri = service_host_name
            return server_uri
  1243. def test_traffic_splitting(gcp, original_backend_service, instance_group,
  1244. alternate_backend_service, same_zone_instance_group):
  1245. # This test start with all traffic going to original_backend_service. Then
  1246. # it updates URL-map to set default action to traffic splitting between
  1247. # original and alternate. It waits for all backends in both services to
  1248. # receive traffic, then verifies that weights are expected.
  1249. logger.info('Running test_traffic_splitting')
  1250. original_backend_instances, alternate_backend_instances = prepare_services_for_urlmap_tests(
  1251. gcp, original_backend_service, instance_group,
  1252. alternate_backend_service, same_zone_instance_group)
  1253. passed = True
  1254. try:
  1255. # Patch urlmap, change route action to traffic splitting between
  1256. # original and alternate.
  1257. logger.info('patching url map with traffic splitting')
  1258. original_service_percentage, alternate_service_percentage = 20, 80
  1259. patch_url_map_backend_service(
  1260. gcp,
  1261. services_with_weights={
  1262. original_backend_service: original_service_percentage,
  1263. alternate_backend_service: alternate_service_percentage,
  1264. })
  1265. # Split percentage between instances: [20,80] -> [10,10,40,40].
  1266. expected_instance_percentage = [
  1267. original_service_percentage * 1.0 / len(original_backend_instances)
  1268. ] * len(original_backend_instances) + [
  1269. alternate_service_percentage * 1.0 /
  1270. len(alternate_backend_instances)
  1271. ] * len(alternate_backend_instances)
  1272. # Wait for traffic to go to both services.
  1273. logger.info(
  1274. 'waiting for traffic to go to all backends (including alternate)')
  1275. wait_until_all_rpcs_go_to_given_backends(
  1276. original_backend_instances + alternate_backend_instances,
  1277. _WAIT_FOR_STATS_SEC)
  1278. # Verify that weights between two services are expected.
  1279. retry_count = 10
  1280. # Each attempt takes about 10 seconds, 10 retries is equivalent to 100
  1281. # seconds timeout.
  1282. for i in range(retry_count):
  1283. stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
  1284. got_instance_count = [
  1285. stats.rpcs_by_peer[i] for i in original_backend_instances
  1286. ] + [stats.rpcs_by_peer[i] for i in alternate_backend_instances]
  1287. total_count = sum(got_instance_count)
  1288. got_instance_percentage = [
  1289. x * 100.0 / total_count for x in got_instance_count
  1290. ]
  1291. try:
  1292. compare_distributions(got_instance_percentage,
  1293. expected_instance_percentage, 5)
  1294. except Exception as e:
  1295. logger.info('attempt %d', i)
  1296. logger.info('got percentage: %s', got_instance_percentage)
  1297. logger.info('expected percentage: %s',
  1298. expected_instance_percentage)
  1299. logger.info(e)
  1300. if i == retry_count - 1:
  1301. raise Exception(
  1302. 'RPC distribution (%s) differs from expected (%s)' %
  1303. (got_instance_percentage, expected_instance_percentage))
  1304. else:
  1305. logger.info("success")
  1306. break
  1307. except Exception:
  1308. passed = False
  1309. raise
  1310. finally:
  1311. if passed or not args.halt_after_fail:
  1312. patch_url_map_backend_service(gcp, original_backend_service)
  1313. patch_backend_service(gcp, alternate_backend_service, [])
  1314. def test_path_matching(gcp, original_backend_service, instance_group,
  1315. alternate_backend_service, same_zone_instance_group):
  1316. # This test start with all traffic (UnaryCall and EmptyCall) going to
  1317. # original_backend_service.
  1318. #
  1319. # Then it updates URL-map to add routes, to make UnaryCall and EmptyCall to
  1320. # go different backends. It waits for all backends in both services to
  1321. # receive traffic, then verifies that traffic goes to the expected
  1322. # backends.
  1323. logger.info('Running test_path_matching')
  1324. original_backend_instances, alternate_backend_instances = prepare_services_for_urlmap_tests(
  1325. gcp, original_backend_service, instance_group,
  1326. alternate_backend_service, same_zone_instance_group)
  1327. passed = True
  1328. try:
  1329. # A list of tuples (route_rules, expected_instances).
  1330. test_cases = [
  1331. (
  1332. [{
  1333. 'priority': 0,
  1334. # FullPath EmptyCall -> alternate_backend_service.
  1335. 'matchRules': [{
  1336. 'fullPathMatch': '/grpc.testing.TestService/EmptyCall'
  1337. }],
  1338. 'service': alternate_backend_service.url
  1339. }],
  1340. {
  1341. "EmptyCall": alternate_backend_instances,
  1342. "UnaryCall": original_backend_instances
  1343. }),
  1344. (
  1345. [{
  1346. 'priority': 0,
  1347. # Prefix UnaryCall -> alternate_backend_service.
  1348. 'matchRules': [{
  1349. 'prefixMatch': '/grpc.testing.TestService/Unary'
  1350. }],
  1351. 'service': alternate_backend_service.url
  1352. }],
  1353. {
  1354. "UnaryCall": alternate_backend_instances,
  1355. "EmptyCall": original_backend_instances
  1356. }),
  1357. (
  1358. # This test case is similar to the one above (but with route
  1359. # services swapped). This test has two routes (full_path and
  1360. # the default) to match EmptyCall, and both routes set
  1361. # alternative_backend_service as the action. This forces the
  1362. # client to handle duplicate Clusters in the RDS response.
  1363. [
  1364. {
  1365. 'priority': 0,
  1366. # Prefix UnaryCall -> original_backend_service.
  1367. 'matchRules': [{
  1368. 'prefixMatch': '/grpc.testing.TestService/Unary'
  1369. }],
  1370. 'service': original_backend_service.url
  1371. },
  1372. {
  1373. 'priority': 1,
  1374. # FullPath EmptyCall -> alternate_backend_service.
  1375. 'matchRules': [{
  1376. 'fullPathMatch':
  1377. '/grpc.testing.TestService/EmptyCall'
  1378. }],
  1379. 'service': alternate_backend_service.url
  1380. }
  1381. ],
  1382. {
  1383. "UnaryCall": original_backend_instances,
  1384. "EmptyCall": alternate_backend_instances
  1385. }),
  1386. (
  1387. [{
  1388. 'priority': 0,
  1389. # Regex UnaryCall -> alternate_backend_service.
  1390. 'matchRules': [{
  1391. 'regexMatch':
  1392. '^\/.*\/UnaryCall$' # Unary methods with any services.
  1393. }],
  1394. 'service': alternate_backend_service.url
  1395. }],
  1396. {
  1397. "UnaryCall": alternate_backend_instances,
  1398. "EmptyCall": original_backend_instances
  1399. }),
  1400. (
  1401. [{
  1402. 'priority': 0,
  1403. # ignoreCase EmptyCall -> alternate_backend_service.
  1404. 'matchRules': [{
  1405. # Case insensitive matching.
  1406. 'fullPathMatch': '/gRpC.tEsTinG.tEstseRvice/empTycaLl',
  1407. 'ignoreCase': True,
  1408. }],
  1409. 'service': alternate_backend_service.url
  1410. }],
  1411. {
  1412. "UnaryCall": original_backend_instances,
  1413. "EmptyCall": alternate_backend_instances
  1414. }),
  1415. ]
  1416. for (route_rules, expected_instances) in test_cases:
  1417. logger.info('patching url map with %s', route_rules)
  1418. patch_url_map_backend_service(gcp,
  1419. original_backend_service,
  1420. route_rules=route_rules)
  1421. # Wait for traffic to go to both services.
  1422. logger.info(
  1423. 'waiting for traffic to go to all backends (including alternate)'
  1424. )
  1425. wait_until_all_rpcs_go_to_given_backends(
  1426. original_backend_instances + alternate_backend_instances,
  1427. _WAIT_FOR_STATS_SEC)
  1428. retry_count = 80
  1429. # Each attempt takes about 5 seconds, 80 retries is equivalent to 400
  1430. # seconds timeout.
  1431. for i in range(retry_count):
  1432. stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
  1433. if not stats.rpcs_by_method:
  1434. raise ValueError(
  1435. 'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
  1436. )
  1437. logger.info('attempt %d', i)
  1438. if compare_expected_instances(stats, expected_instances):
  1439. logger.info("success")
  1440. break
  1441. elif i == retry_count - 1:
  1442. raise Exception(
  1443. 'timeout waiting for RPCs to the expected instances: %s'
  1444. % expected_instances)
  1445. except Exception:
  1446. passed = False
  1447. raise
  1448. finally:
  1449. if passed or not args.halt_after_fail:
  1450. patch_url_map_backend_service(gcp, original_backend_service)
  1451. patch_backend_service(gcp, alternate_backend_service, [])
def test_header_matching(gcp, original_backend_service, instance_group,
                         alternate_backend_service, same_zone_instance_group):
    # This test starts with all traffic (UnaryCall and EmptyCall) going to
    # original_backend_service.
    #
    # Then it updates URL-map to add routes, to make RPCs with test headers
    # go to different backends. It waits for all backends in both services to
    # receive traffic, then verifies that traffic goes to the expected
    # backends.
    #
    # The per-case comments assume the test client sends _TEST_METADATA_KEY /
    # _TEST_METADATA_NUMERIC_KEY metadata on the stated methods — TODO
    # confirm against the client's metadata configuration.
    logger.info('Running test_header_matching')
    original_backend_instances, alternate_backend_instances = prepare_services_for_urlmap_tests(
        gcp, original_backend_service, instance_group,
        alternate_backend_service, same_zone_instance_group)
    passed = True
    try:
        # A list of tuples (route_rules, expected_instances).
        test_cases = [
            (
                [{
                    'priority': 0,
                    # Header ExactMatch -> alternate_backend_service.
                    # EmptyCall is sent with the metadata.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName': _TEST_METADATA_KEY,
                            'exactMatch': _TEST_METADATA_VALUE_EMPTY
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": alternate_backend_instances,
                    "UnaryCall": original_backend_instances
                }),
            (
                [{
                    'priority': 0,
                    # Header PrefixMatch -> alternate_backend_service.
                    # UnaryCall is sent with the metadata.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName': _TEST_METADATA_KEY,
                            'prefixMatch': _TEST_METADATA_VALUE_UNARY[:2]
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": original_backend_instances,
                    "UnaryCall": alternate_backend_instances
                }),
            (
                [{
                    'priority': 0,
                    # Header SuffixMatch -> alternate_backend_service.
                    # EmptyCall is sent with the metadata.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName': _TEST_METADATA_KEY,
                            'suffixMatch': _TEST_METADATA_VALUE_EMPTY[-2:]
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": alternate_backend_instances,
                    "UnaryCall": original_backend_instances
                }),
            (
                [{
                    'priority': 0,
                    # Header 'xds_md_numeric' present -> alternate_backend_service.
                    # UnaryCall is sent with the metadata, so will be sent to alternative.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName': _TEST_METADATA_NUMERIC_KEY,
                            'presentMatch': True
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": original_backend_instances,
                    "UnaryCall": alternate_backend_instances
                }),
            (
                [{
                    'priority': 0,
                    # Header invert ExactMatch -> alternate_backend_service.
                    # UnaryCall is sent with the metadata, so will be sent to
                    # original. EmptyCall will be sent to alternative.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName': _TEST_METADATA_KEY,
                            'exactMatch': _TEST_METADATA_VALUE_UNARY,
                            'invertMatch': True
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": alternate_backend_instances,
                    "UnaryCall": original_backend_instances
                }),
            (
                [{
                    'priority': 0,
                    # Header 'xds_md_numeric' range [100,200] -> alternate_backend_service.
                    # UnaryCall is sent with the metadata in range.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName': _TEST_METADATA_NUMERIC_KEY,
                            'rangeMatch': {
                                'rangeStart': '100',
                                'rangeEnd': '200'
                            }
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": original_backend_instances,
                    "UnaryCall": alternate_backend_instances
                }),
            (
                [{
                    'priority': 0,
                    # Header RegexMatch -> alternate_backend_service.
                    # EmptyCall is sent with the metadata.
                    'matchRules': [{
                        'prefixMatch':
                            '/',
                        'headerMatches': [{
                            'headerName':
                                _TEST_METADATA_KEY,
                            'regexMatch':
                                "^%s.*%s$" % (_TEST_METADATA_VALUE_EMPTY[:2],
                                              _TEST_METADATA_VALUE_EMPTY[-2:])
                        }]
                    }],
                    'service': alternate_backend_service.url
                }],
                {
                    "EmptyCall": alternate_backend_instances,
                    "UnaryCall": original_backend_instances
                }),
        ]
        for (route_rules, expected_instances) in test_cases:
            logger.info('patching url map with %s -> alternative',
                        route_rules[0]['matchRules'])
            patch_url_map_backend_service(gcp,
                                          original_backend_service,
                                          route_rules=route_rules)
            # Wait for traffic to go to both services.
            logger.info(
                'waiting for traffic to go to all backends (including alternate)'
            )
            wait_until_all_rpcs_go_to_given_backends(
                original_backend_instances + alternate_backend_instances,
                _WAIT_FOR_STATS_SEC)
            retry_count = 80
            # Each attempt takes about 5 seconds, 80 retries is equivalent to 400
            # seconds timeout.
            for i in range(retry_count):
                stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
                if not stats.rpcs_by_method:
                    raise ValueError(
                        'stats.rpcs_by_method is None, the interop client stats service does not support this test case'
                    )
                logger.info('attempt %d', i)
                if compare_expected_instances(stats, expected_instances):
                    logger.info("success")
                    break
                elif i == retry_count - 1:
                    raise Exception(
                        'timeout waiting for RPCs to the expected instances: %s'
                        % expected_instances)
    except Exception:
        passed = False
        raise
    finally:
        # Restore the URL map default route and drain the alternate service
        # unless the run is configured to halt immediately after a failure.
        if passed or not args.halt_after_fail:
            patch_url_map_backend_service(gcp, original_backend_service)
            patch_backend_service(gcp, alternate_backend_service, [])
def test_circuit_breaking(gcp, original_backend_service, instance_group,
                          same_zone_instance_group):
    '''
    Since backend service circuit_breakers configuration cannot be unset,
    which causes trouble for restoring validate_for_proxy flag in target
    proxy/global forwarding rule, this test uses dedicated backend services.
    The url_map and backend services undergo the following state changes:

    Before test:
       original_backend_service -> [instance_group]
       extra_backend_service -> []
       more_extra_backend_service -> []
       url_map -> [original_backend_service]

    In test:
       extra_backend_service (with circuit_breakers) -> [instance_group]
       more_extra_backend_service (with circuit_breakers) -> [same_zone_instance_group]
       url_map -> [extra_backend_service, more_extra_backend_service]

    After test:
       original_backend_service -> [instance_group]
       extra_backend_service (with circuit_breakers) -> []
       more_extra_backend_service (with circuit_breakers) -> []
       url_map -> [original_backend_service]
    '''
    logger.info('Running test_circuit_breaking')
    # Backend services created during the test; deleted in the finally block.
    additional_backend_services = []
    passed = True
    try:
        # TODO(chengyuanzhang): Dedicated backend services created for circuit
        # breaking test. Once the issue for unsetting backend service circuit
        # breakers is resolved or configuring backend service circuit breakers is
        # enabled for config validation, these dedicated backend services can be
        # eliminated.
        extra_backend_service_name = _BASE_BACKEND_SERVICE_NAME + '-extra' + gcp_suffix
        more_extra_backend_service_name = _BASE_BACKEND_SERVICE_NAME + '-more-extra' + gcp_suffix
        extra_backend_service = add_backend_service(gcp,
                                                    extra_backend_service_name)
        additional_backend_services.append(extra_backend_service)
        more_extra_backend_service = add_backend_service(
            gcp, more_extra_backend_service_name)
        additional_backend_services.append(more_extra_backend_service)
        # The config validation for proxyless doesn't allow setting
        # circuit_breakers. Disable validate_for_proxyless
        # for this test. This can be removed when validation
        # accepts circuit_breakers.
        logger.info('disabling validate_for_proxyless in target proxy')
        set_validate_for_proxyless(gcp, False)
        # Per-service maxRequests thresholds under test.
        extra_backend_service_max_requests = 500
        more_extra_backend_service_max_requests = 1000
        patch_backend_service(gcp,
                              extra_backend_service, [instance_group],
                              circuit_breakers={
                                  'maxRequests':
                                      extra_backend_service_max_requests
                              })
        logger.info('Waiting for extra backends to become healthy')
        wait_for_healthy_backends(gcp, extra_backend_service, instance_group)
        patch_backend_service(gcp,
                              more_extra_backend_service,
                              [same_zone_instance_group],
                              circuit_breakers={
                                  'maxRequests':
                                      more_extra_backend_service_max_requests
                              })
        logger.info('Waiting for more extra backend to become healthy')
        wait_for_healthy_backends(gcp, more_extra_backend_service,
                                  same_zone_instance_group)
        extra_backend_instances = get_instance_names(gcp, instance_group)
        more_extra_backend_instances = get_instance_names(
            gcp, same_zone_instance_group)
        # Route each RPC method to its own circuit-broken backend service so
        # the two maxRequests limits can be observed independently.
        route_rules = [
            {
                'priority': 0,
                # UnaryCall -> extra_backend_service
                'matchRules': [{
                    'fullPathMatch': '/grpc.testing.TestService/UnaryCall'
                }],
                'service': extra_backend_service.url
            },
            {
                'priority': 1,
                # EmptyCall -> more_extra_backend_service
                'matchRules': [{
                    'fullPathMatch': '/grpc.testing.TestService/EmptyCall'
                }],
                'service': more_extra_backend_service.url
            },
        ]
        # Make client send UNARY_CALL and EMPTY_CALL.
        configure_client([
            messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
            messages_pb2.ClientConfigureRequest.RpcType.EMPTY_CALL
        ])
        logger.info('Patching url map with %s', route_rules)
        patch_url_map_backend_service(gcp,
                                      extra_backend_service,
                                      route_rules=route_rules)
        logger.info('Waiting for traffic to go to all backends')
        wait_until_all_rpcs_go_to_given_backends(
            extra_backend_instances + more_extra_backend_instances,
            _WAIT_FOR_STATS_SEC)
        # Make all calls keep-open, so in-flight RPCs accumulate until the
        # circuit breaker caps them at maxRequests.
        configure_client([
            messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
            messages_pb2.ClientConfigureRequest.RpcType.EMPTY_CALL
        ], [(messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
             'rpc-behavior', 'keep-open'),
            (messages_pb2.ClientConfigureRequest.RpcType.EMPTY_CALL,
             'rpc-behavior', 'keep-open')])
        # Timeouts scale with the threshold: filling N in-flight RPCs at
        # args.qps takes about N/qps seconds.
        wait_until_rpcs_in_flight(
            'UNARY_CALL', (_WAIT_FOR_BACKEND_SEC +
                           int(extra_backend_service_max_requests / args.qps)),
            extra_backend_service_max_requests, 1)
        logger.info('UNARY_CALL reached stable state (%d)',
                    extra_backend_service_max_requests)
        wait_until_rpcs_in_flight(
            'EMPTY_CALL',
            (_WAIT_FOR_BACKEND_SEC +
             int(more_extra_backend_service_max_requests / args.qps)),
            more_extra_backend_service_max_requests, 1)
        logger.info('EMPTY_CALL reached stable state (%d)',
                    more_extra_backend_service_max_requests)
        # Increment circuit breakers max_requests threshold.
        extra_backend_service_max_requests = 800
        patch_backend_service(gcp,
                              extra_backend_service, [instance_group],
                              circuit_breakers={
                                  'maxRequests':
                                      extra_backend_service_max_requests
                              })
        wait_until_rpcs_in_flight(
            'UNARY_CALL', (_WAIT_FOR_BACKEND_SEC +
                           int(extra_backend_service_max_requests / args.qps)),
            extra_backend_service_max_requests, 1)
        logger.info('UNARY_CALL reached stable state after increase (%d)',
                    extra_backend_service_max_requests)
        logger.info('success')
        # Avoid new RPCs being outstanding (some test clients create threads
        # for sending RPCs) after restoring backend services.
        configure_client(
            [messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL])
    except Exception:
        passed = False
        raise
    finally:
        # Restore the original routing/backends and clean up the dedicated
        # services unless the run halts immediately after a failure.
        if passed or not args.halt_after_fail:
            patch_url_map_backend_service(gcp, original_backend_service)
            patch_backend_service(gcp, original_backend_service,
                                  [instance_group])
            for backend_service in additional_backend_services:
                delete_backend_service(gcp, backend_service)
            set_validate_for_proxyless(gcp, True)
def test_timeout(gcp, original_backend_service, instance_group):
    """Verify route-level maxStreamDuration and application deadlines.

    Configures a route applying a 3s maxStreamDuration to UnaryCall only,
    then drives client configurations whose RPCs sleep via 'rpc-behavior'
    metadata and checks the accumulated per-status RPC counts:
      * UnaryCall/EmptyCall both sleep-4: UnaryCall hits status 4
        (DEADLINE_EXCEEDED), EmptyCall (different route) completes (status 0).
      * 1s application timeout with sleep-2: DEADLINE_EXCEEDED.
      * No sleep: status 0.
    """
    logger.info('Running test_timeout')
    logger.info('waiting for original backends to become healthy')
    wait_for_healthy_backends(gcp, original_backend_service, instance_group)
    # UnaryCall -> maxStreamDuration:3s
    route_rules = [{
        'priority': 0,
        'matchRules': [{
            'fullPathMatch': '/grpc.testing.TestService/UnaryCall'
        }],
        'service': original_backend_service.url,
        'routeAction': {
            'maxStreamDuration': {
                'seconds': 3,
            },
        },
    }]
    patch_url_map_backend_service(gcp,
                                  original_backend_service,
                                  route_rules=route_rules)
    # A list of tuples (testcase_name, {client_config}, {expected_results})
    test_cases = [
        (
            'timeout_exceeded (UNARY_CALL), timeout_different_route (EMPTY_CALL)',
            # UnaryCall and EmptyCall both sleep-4.
            # UnaryCall timeouts, EmptyCall succeeds.
            {
                'rpc_types': [
                    messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
                    messages_pb2.ClientConfigureRequest.RpcType.EMPTY_CALL,
                ],
                'metadata': [
                    (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
                     'rpc-behavior', 'sleep-4'),
                    (messages_pb2.ClientConfigureRequest.RpcType.EMPTY_CALL,
                     'rpc-behavior', 'sleep-4'),
                ],
            },
            {
                'UNARY_CALL': 4,  # DEADLINE_EXCEEDED
                'EMPTY_CALL': 0,
            },
        ),
        (
            'app_timeout_exceeded',
            # UnaryCall only with sleep-2; timeout=1s; calls timeout.
            {
                'rpc_types': [
                    messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
                ],
                'metadata': [
                    (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
                     'rpc-behavior', 'sleep-2'),
                ],
                'timeout_sec': 1,
            },
            {
                'UNARY_CALL': 4,  # DEADLINE_EXCEEDED
            },
        ),
        (
            'timeout_not_exceeded',
            # UnaryCall only with no sleep; calls succeed.
            {
                'rpc_types': [
                    messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
                ],
            },
            {
                'UNARY_CALL': 0,
            },
        )
    ]
    passed = True
    try:
        first_case = True
        for (testcase_name, client_config, expected_results) in test_cases:
            logger.info('starting case %s', testcase_name)
            configure_client(**client_config)
            # wait a second to help ensure the client stops sending RPCs with
            # the old config. We will make multiple attempts if it is failing,
            # but this improves confidence that the test is valid if the
            # previous client_config would lead to the same results.
            time.sleep(1)
            # Each attempt takes 10 seconds; 20 attempts is equivalent to 200
            # second timeout.
            attempt_count = 20
            if first_case:
                # 120 attempts (~20 minutes) for the first case — presumably
                # to allow for initial xDS config propagation; TODO confirm.
                attempt_count = 120
                first_case = False
            before_stats = get_client_accumulated_stats()
            if not before_stats.stats_per_method:
                raise ValueError(
                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
                )
            for i in range(attempt_count):
                logger.info('%s: attempt %d', testcase_name, i)
                test_runtime_secs = 10
                time.sleep(test_runtime_secs)
                after_stats = get_client_accumulated_stats()
                success = True
                # Compare the delta of accumulated per-status counts over the
                # window against the expected RPC rate.
                for rpc, status in list(expected_results.items()):
                    qty = (after_stats.stats_per_method[rpc].result[status] -
                           before_stats.stats_per_method[rpc].result[status])
                    want = test_runtime_secs * args.qps
                    # Allow 10% deviation from expectation to reduce flakiness
                    if qty < (want * .9) or qty > (want * 1.1):
                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
                                    testcase_name, rpc, status, qty, want)
                        success = False
                if success:
                    logger.info('success')
                    break
                logger.info('%s attempt %d failed', testcase_name, i)
                before_stats = after_stats
            else:
                # for/else: every attempt was exhausted without success.
                raise Exception(
                    '%s: timeout waiting for expected results: %s; got %s' %
                    (testcase_name, expected_results,
                     after_stats.stats_per_method))
    except Exception:
        passed = False
        raise
    finally:
        # Restore the URL map default route unless the run is configured to
        # halt immediately after a failure.
        if passed or not args.halt_after_fail:
            patch_url_map_backend_service(gcp, original_backend_service)
def test_fault_injection(gcp, original_backend_service, instance_group):
    """Verify URL-map fault injection policies (abort and delay).

    A route rule is installed per test case, selected at request time by the
    'fi_testcase' header. Each case then checks that the observed per-status
    RPC counts over a 10s window match the configured abort/delay percentage
    within a 10% tolerance.
    """
    logger.info('Running test_fault_injection')
    logger.info('waiting for original backends to become healthy')
    wait_for_healthy_backends(gcp, original_backend_service, instance_group)
    # Request header used to select which fault-injection route applies.
    testcase_header = 'fi_testcase'

    def _route(pri, name, fi_policy):
        # Route rule at priority `pri` applying `fi_policy` to RPCs whose
        # testcase_header exactly matches `name`.
        return {
            'priority': pri,
            'matchRules': [{
                'prefixMatch':
                    '/',
                'headerMatches': [{
                    'headerName': testcase_header,
                    'exactMatch': name,
                }],
            }],
            'service': original_backend_service.url,
            'routeAction': {
                'faultInjectionPolicy': fi_policy
            },
        }

    def _abort(pct):
        # Abort `pct`% of requests with HTTP 401 (maps to gRPC
        # UNAUTHENTICATED, status 16 — see expected_results below).
        return {
            'abort': {
                'httpStatus': 401,
                'percentage': pct,
            }
        }

    def _delay(pct):
        # Delay `pct`% of requests by a fixed 20 seconds.
        return {
            'delay': {
                'fixedDelay': {
                    'seconds': '20'
                },
                'percentage': pct,
            }
        }

    # A policy with both abort and delay present but at 0% each.
    zero_route = _abort(0)
    zero_route.update(_delay(0))
    route_rules = [
        _route(0, 'zero_percent_fault_injection', zero_route),
        _route(1, 'always_delay', _delay(100)),
        _route(2, 'always_abort', _abort(100)),
        _route(3, 'delay_half', _delay(50)),
        _route(4, 'abort_half', _abort(50)),
        {
            # Catch-all default route with no fault injection.
            'priority': 5,
            'matchRules': [{
                'prefixMatch': '/'
            }],
            'service': original_backend_service.url,
        },
    ]
    set_validate_for_proxyless(gcp, False)
    patch_url_map_backend_service(gcp,
                                  original_backend_service,
                                  route_rules=route_rules)
    # A list of tuples (testcase_name, {client_config}, {code: percent}). Each
    # test case will set the testcase_header with the testcase_name for routing
    # to the appropriate config for the case, defined above.
    test_cases = [
        (
            'always_delay',
            {
                'timeout_sec': 2
            },
            {
                4: 1
            },  # DEADLINE_EXCEEDED
        ),
        (
            'always_abort',
            {},
            {
                16: 1
            },  # UNAUTHENTICATED
        ),
        (
            'delay_half',
            {
                'timeout_sec': 2
            },
            {
                4: .5,
                0: .5
            },  # DEADLINE_EXCEEDED / OK: 50% / 50%
        ),
        (
            'abort_half',
            {},
            {
                16: .5,
                0: .5
            },  # UNAUTHENTICATED / OK: 50% / 50%
        ),
        (
            'zero_percent_fault_injection',
            {},
            {
                0: 1
            },  # OK
        ),
        (
            'non_matching_fault_injection',  # Not in route_rules, above.
            {},
            {
                0: 1
            },  # OK
        ),
    ]
    passed = True
    try:
        first_case = True
        for (testcase_name, client_config, expected_results) in test_cases:
            logger.info('starting case %s', testcase_name)
            # Route selection: send the case name in testcase_header on
            # UNARY_CALL RPCs only.
            client_config['metadata'] = [
                (messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
                 testcase_header, testcase_name)
            ]
            client_config['rpc_types'] = [
                messages_pb2.ClientConfigureRequest.RpcType.UNARY_CALL,
            ]
            configure_client(**client_config)
            # wait a second to help ensure the client stops sending RPCs with
            # the old config. We will make multiple attempts if it is failing,
            # but this improves confidence that the test is valid if the
            # previous client_config would lead to the same results.
            time.sleep(1)
            # Each attempt takes 10 seconds
            if first_case:
                # Give the first test case 600s for xDS config propagation.
                attempt_count = 60
                first_case = False
            else:
                # The accumulated stats might include previous sub-test, running
                # the test multiple times to deflake
                attempt_count = 10
            before_stats = get_client_accumulated_stats()
            if not before_stats.stats_per_method:
                raise ValueError(
                    'stats.stats_per_method is None, the interop client stats service does not support this test case'
                )
            for i in range(attempt_count):
                logger.info('%s: attempt %d', testcase_name, i)
                test_runtime_secs = 10
                time.sleep(test_runtime_secs)
                after_stats = get_client_accumulated_stats()
                success = True
                # Compare the delta of per-status counts over the window with
                # the expected share of the total RPC volume.
                for status, pct in list(expected_results.items()):
                    rpc = 'UNARY_CALL'
                    qty = (after_stats.stats_per_method[rpc].result[status] -
                           before_stats.stats_per_method[rpc].result[status])
                    want = pct * args.qps * test_runtime_secs
                    # Allow 10% deviation from expectation to reduce flakiness
                    VARIANCE_ALLOWED = 0.1
                    if abs(qty - want) > want * VARIANCE_ALLOWED:
                        logger.info('%s: failed due to %s[%s]: got %d want ~%d',
                                    testcase_name, rpc, status, qty, want)
                        success = False
                if success:
                    logger.info('success')
                    break
                logger.info('%s attempt %d failed', testcase_name, i)
                before_stats = after_stats
            else:
                # for/else: every attempt was exhausted without success.
                raise Exception(
                    '%s: timeout waiting for expected results: %s; got %s' %
                    (testcase_name, expected_results,
                     after_stats.stats_per_method))
    except Exception:
        passed = False
        raise
    finally:
        # Restore the URL map default route and re-enable proxyless config
        # validation unless the run halts immediately after a failure.
        if passed or not args.halt_after_fail:
            patch_url_map_backend_service(gcp, original_backend_service)
            set_validate_for_proxyless(gcp, True)
  2100. def test_csds(gcp, original_backend_service, instance_group, server_uri):
  2101. test_csds_timeout_s = datetime.timedelta(minutes=5).total_seconds()
  2102. sleep_interval_between_attempts_s = datetime.timedelta(
  2103. seconds=2).total_seconds()
  2104. logger.info('Running test_csds')
  2105. logger.info('waiting for original backends to become healthy')
  2106. wait_for_healthy_backends(gcp, original_backend_service, instance_group)
  2107. # Test case timeout: 5 minutes
  2108. deadline = time.time() + test_csds_timeout_s
  2109. cnt = 0
  2110. while time.time() <= deadline:
  2111. client_config = get_client_xds_config_dump()
  2112. logger.info('test_csds attempt %d: received xDS config %s', cnt,
  2113. json.dumps(client_config, indent=2))
  2114. if client_config is not None:
  2115. # Got the xDS config dump, now validate it
  2116. ok = True
  2117. try:
  2118. if client_config['node']['locality']['zone'] != args.zone:
  2119. logger.info('Invalid zone %s != %s',
  2120. client_config['node']['locality']['zone'],
  2121. args.zone)
  2122. ok = False
  2123. seen = set()
  2124. for xds_config in client_config.get('xds_config', []):
  2125. if 'listener_config' in xds_config:
  2126. listener_name = xds_config['listener_config'][
  2127. 'dynamic_listeners'][0]['active_state']['listener'][
  2128. 'name']
  2129. if listener_name != server_uri:
  2130. logger.info('Invalid Listener name %s != %s',
  2131. listener_name, server_uri)
  2132. ok = False
  2133. else:
  2134. seen.add('lds')
  2135. elif 'route_config' in xds_config:
  2136. num_vh = len(
  2137. xds_config['route_config']['dynamic_route_configs']
  2138. [0]['route_config']['virtual_hosts'])
  2139. if num_vh <= 0:
  2140. logger.info('Invalid number of VirtualHosts %s',
  2141. num_vh)
  2142. ok = False
  2143. else:
  2144. seen.add('rds')
  2145. elif 'cluster_config' in xds_config:
  2146. cluster_type = xds_config['cluster_config'][
  2147. 'dynamic_active_clusters'][0]['cluster']['type']
  2148. if cluster_type != 'EDS':
  2149. logger.info('Invalid cluster type %s != EDS',
  2150. cluster_type)
  2151. ok = False
  2152. else:
  2153. seen.add('cds')
  2154. elif 'endpoint_config' in xds_config:
  2155. sub_zone = xds_config["endpoint_config"][
  2156. "dynamic_endpoint_configs"][0]["endpoint_config"][
  2157. "endpoints"][0]["locality"]["sub_zone"]
  2158. if args.zone not in sub_zone:
  2159. logger.info('Invalid endpoint sub_zone %s',
  2160. sub_zone)
  2161. ok = False
  2162. else:
  2163. seen.add('eds')
  2164. for generic_xds_config in client_config.get(
  2165. 'generic_xds_configs', []):
  2166. if re.search(r'\.Listener$',
  2167. generic_xds_config['type_url']):
  2168. seen.add('lds')
  2169. listener = generic_xds_config["xds_config"]
  2170. if listener['name'] != server_uri:
  2171. logger.info('Invalid Listener name %s != %s',
  2172. listener_name, server_uri)
  2173. ok = False
  2174. elif re.search(r'\.RouteConfiguration$',
  2175. generic_xds_config['type_url']):
  2176. seen.add('rds')
  2177. route_config = generic_xds_config["xds_config"]
  2178. if not len(route_config['virtual_hosts']):
  2179. logger.info('Invalid number of VirtualHosts %s',
  2180. num_vh)
  2181. ok = False
  2182. elif re.search(r'\.Cluster$',
  2183. generic_xds_config['type_url']):
  2184. seen.add('cds')
  2185. cluster = generic_xds_config["xds_config"]
  2186. if cluster['type'] != 'EDS':
  2187. logger.info('Invalid cluster type %s != EDS',
  2188. cluster_type)
  2189. ok = False
  2190. elif re.search(r'\.ClusterLoadAssignment$',
  2191. generic_xds_config['type_url']):
  2192. seen.add('eds')
  2193. endpoint = generic_xds_config["xds_config"]
  2194. if args.zone not in endpoint["endpoints"][0][
  2195. "locality"]["sub_zone"]:
  2196. logger.info('Invalid endpoint sub_zone %s',
  2197. sub_zone)
  2198. ok = False
  2199. want = {'lds', 'rds', 'cds', 'eds'}
  2200. if seen != want:
  2201. logger.info('Incomplete xDS config dump, seen=%s', seen)
  2202. ok = False
  2203. except:
  2204. logger.exception('Error in xDS config dump:')
  2205. ok = False
  2206. finally:
  2207. if ok:
  2208. # Successfully fetched xDS config, and they looks good.
  2209. logger.info('success')
  2210. return
  2211. logger.info('test_csds attempt %d failed', cnt)
  2212. # Give the client some time to fetch xDS resources
  2213. time.sleep(sleep_interval_between_attempts_s)
  2214. cnt += 1
  2215. raise RuntimeError('failed to receive a valid xDS config in %s seconds' %
  2216. test_csds_timeout_s)
def set_validate_for_proxyless(gcp, validate_for_proxyless):
    """Recreate the target proxy with the given validateForProxyless value.

    No-op unless the alpha compute API is enabled and exactly one global
    forwarding rule, target proxy and URL map are currently tracked.
    """
    if not gcp.alpha_compute:
        logger.debug(
            'Not setting validateForProxy because alpha is not enabled')
        return
    if len(gcp.global_forwarding_rules) != 1 or len(
            gcp.target_proxies) != 1 or len(gcp.url_maps) != 1:
        logger.debug(
            "Global forwarding rule, target proxy or url map not found.")
        return
    # This function deletes global_forwarding_rule and target_proxy, then
    # recreate target_proxy with validateForProxyless=False. This is necessary
    # because patching target_grpc_proxy isn't supported.
    # NOTE(review): target_proxy_name and forwarding_rule_name appear to be
    # module-level globals defined outside this chunk — confirm.
    delete_global_forwarding_rule(gcp, gcp.global_forwarding_rules[0])
    delete_target_proxy(gcp, gcp.target_proxies[0])
    create_target_proxy(gcp, target_proxy_name, validate_for_proxyless)
    create_global_forwarding_rule(gcp, forwarding_rule_name, [gcp.service_port])
  2234. def get_serving_status(instance, service_port):
  2235. with grpc.insecure_channel('%s:%d' % (instance, service_port)) as channel:
  2236. health_stub = health_pb2_grpc.HealthStub(channel)
  2237. return health_stub.Check(health_pb2.HealthCheckRequest())
  2238. def set_serving_status(instances, service_port, serving):
  2239. logger.info('setting %s serving status to %s', instances, serving)
  2240. for instance in instances:
  2241. with grpc.insecure_channel('%s:%d' %
  2242. (instance, service_port)) as channel:
  2243. logger.info('setting %s serving status to %s', instance, serving)
  2244. stub = test_pb2_grpc.XdsUpdateHealthServiceStub(channel)
  2245. retry_count = 5
  2246. for i in range(5):
  2247. if serving:
  2248. stub.SetServing(empty_pb2.Empty())
  2249. else:
  2250. stub.SetNotServing(empty_pb2.Empty())
  2251. serving_status = get_serving_status(instance, service_port)
  2252. logger.info('got instance service status %s', serving_status)
  2253. want_status = health_pb2.HealthCheckResponse.SERVING if serving else health_pb2.HealthCheckResponse.NOT_SERVING
  2254. if serving_status.status == want_status:
  2255. break
  2256. if i == retry_count - 1:
  2257. raise Exception(
  2258. 'failed to set instance service status after %d retries'
  2259. % retry_count)
  2260. def is_primary_instance_group(gcp, instance_group):
  2261. # Clients may connect to a TD instance in a different region than the
  2262. # client, in which case primary/secondary assignments may not be based on
  2263. # the client's actual locality.
  2264. instance_names = get_instance_names(gcp, instance_group)
  2265. stats = get_client_stats(_NUM_TEST_RPCS, _WAIT_FOR_STATS_SEC)
  2266. return all(
  2267. peer in instance_names for peer in list(stats.rpcs_by_peer.keys()))
  2268. def get_startup_script(path_to_server_binary, service_port):
  2269. if path_to_server_binary:
  2270. return 'nohup %s --port=%d 1>/dev/null &' % (path_to_server_binary,
  2271. service_port)
  2272. else:
  2273. return """#!/bin/bash
  2274. sudo apt update
  2275. sudo apt install -y git default-jdk
  2276. mkdir java_server
  2277. pushd java_server
  2278. git clone https://github.com/grpc/grpc-java.git
  2279. pushd grpc-java
  2280. pushd interop-testing
  2281. ../gradlew installDist -x test -PskipCodegen=true -PskipAndroid=true
  2282. nohup build/install/grpc-interop-testing/bin/xds-test-server \
  2283. --port=%d 1>/dev/null &""" % service_port
def create_instance_template(gcp, name, network, source_image, machine_type,
                             startup_script):
    """Create a GCE instance template and store it on gcp.instance_template.

    Instances get the 'allow-health-checks' tag (matching the health-check
    firewall rule created elsewhere in this file) and run startup_script
    via the 'startup-script' metadata key.
    """
    config = {
        'name': name,
        'properties': {
            'tags': {
                'items': ['allow-health-checks']
            },
            'machineType': machine_type,
            'serviceAccounts': [{
                'email': 'default',
                'scopes': ['https://www.googleapis.com/auth/cloud-platform',]
            }],
            'networkInterfaces': [{
                # External NAT IP so the startup script can reach the internet.
                'accessConfigs': [{
                    'type': 'ONE_TO_ONE_NAT'
                }],
                'network': network
            }],
            'disks': [{
                'boot': True,
                'initializeParams': {
                    'sourceImage': source_image
                },
                'autoDelete': True
            }],
            'metadata': {
                'items': [{
                    'key': 'startup-script',
                    'value': startup_script
                }]
            }
        }
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = gcp.compute.instanceTemplates().insert(
        project=gcp.project, body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
    gcp.instance_template = GcpResource(config['name'], result['targetLink'])
def add_instance_group(gcp, zone, name, size):
    """Create a managed instance group of `size` VMs from gcp.instance_template.

    Appends the group to gcp.instance_groups, waits until it reaches the
    requested size, and returns the InstanceGroup wrapper.
    """
    config = {
        'name': name,
        'instanceTemplate': gcp.instance_template.url,
        'targetSize': size,
        # Named port 'grpc' is referenced by the backend services' portName.
        'namedPorts': [{
            'name': 'grpc',
            'port': gcp.service_port
        }]
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = gcp.compute.instanceGroupManagers().insert(
        project=gcp.project, zone=zone,
        body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_zone_operation(gcp, zone, result['name'])
    # Re-fetch the manager to obtain the instanceGroup URL.
    result = gcp.compute.instanceGroupManagers().get(
        project=gcp.project, zone=zone,
        instanceGroupManager=config['name']).execute(
            num_retries=_GCP_API_RETRIES)
    instance_group = InstanceGroup(config['name'], result['instanceGroup'],
                                   zone)
    gcp.instance_groups.append(instance_group)
    wait_for_instance_group_to_reach_expected_size(gcp, instance_group, size,
                                                   _WAIT_FOR_OPERATION_SEC)
    return instance_group
def create_health_check(gcp, name):
    """Create a health check and store it on gcp.health_check.

    Uses a native GRPC health check when the alpha compute API is
    available, otherwise falls back to a TCP check on the 'grpc' named
    port.
    """
    if gcp.alpha_compute:
        config = {
            'name': name,
            'type': 'GRPC',
            'grpcHealthCheck': {
                'portSpecification': 'USE_SERVING_PORT'
            }
        }
        compute_to_use = gcp.alpha_compute
    else:
        config = {
            'name': name,
            'type': 'TCP',
            'tcpHealthCheck': {
                'portName': 'grpc'
            }
        }
        compute_to_use = gcp.compute
    logger.debug('Sending GCP request with body=%s', config)
    result = compute_to_use.healthChecks().insert(
        project=gcp.project, body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
    gcp.health_check = GcpResource(config['name'], result['targetLink'])
def create_health_check_firewall_rule(gcp, name):
    """Create an ingress firewall rule admitting health-check probes.

    The source ranges are Google Cloud's documented health-checking probe
    ranges; the rule targets instances tagged 'allow-health-checks' (set
    by create_instance_template). Stores the rule on
    gcp.health_check_firewall_rule.
    """
    config = {
        'name': name,
        'direction': 'INGRESS',
        'allowed': [{
            'IPProtocol': 'tcp'
        }],
        'sourceRanges': ['35.191.0.0/16', '130.211.0.0/22'],
        'targetTags': ['allow-health-checks'],
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = gcp.compute.firewalls().insert(
        project=gcp.project, body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
    gcp.health_check_firewall_rule = GcpResource(config['name'],
                                                 result['targetLink'])
def add_backend_service(gcp, name):
    """Create an INTERNAL_SELF_MANAGED backend service and return it.

    Uses GRPC protocol under the alpha compute API, HTTP2 otherwise. The
    new GcpResource is appended to gcp.backend_services.
    """
    if gcp.alpha_compute:
        protocol = 'GRPC'
        compute_to_use = gcp.alpha_compute
    else:
        protocol = 'HTTP2'
        compute_to_use = gcp.compute
    config = {
        'name': name,
        'loadBalancingScheme': 'INTERNAL_SELF_MANAGED',
        'healthChecks': [gcp.health_check.url],
        'portName': 'grpc',
        'protocol': protocol
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = compute_to_use.backendServices().insert(
        project=gcp.project, body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
    backend_service = GcpResource(config['name'], result['targetLink'])
    gcp.backend_services.append(backend_service)
    return backend_service
def create_url_map(gcp, name, backend_service, host_name):
    """Create a URL map routing host_name to backend_service and return it.

    A single path matcher (_PATH_MATCHER_NAME) defaults to
    backend_service; the new GcpResource is appended to gcp.url_maps.
    """
    config = {
        'name': name,
        'defaultService': backend_service.url,
        'pathMatchers': [{
            'name': _PATH_MATCHER_NAME,
            'defaultService': backend_service.url,
        }],
        'hostRules': [{
            'hosts': [host_name],
            'pathMatcher': _PATH_MATCHER_NAME
        }]
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = gcp.compute.urlMaps().insert(
        project=gcp.project, body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
    url_map = GcpResource(config['name'], result['targetLink'])
    gcp.url_maps.append(url_map)
    return url_map
def patch_url_map_host_rule_with_port(gcp, name, backend_service, host_name):
    """Patch url map `name` so the host rule includes gcp.service_port.

    Note: backend_service is accepted but not used in the patch body.
    """
    config = {
        'hostRules': [{
            'hosts': ['%s:%d' % (host_name, gcp.service_port)],
            'pathMatcher': _PATH_MATCHER_NAME
        }]
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = gcp.compute.urlMaps().patch(
        project=gcp.project, urlMap=name,
        body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
def create_target_proxy(gcp, name, validate_for_proxyless=True, url_map=None):
    """Create a target proxy for the given (or first tracked) URL map.

    Creates a targetGrpcProxy (with validate_for_proxyless) under the
    alpha compute API, otherwise a targetHttpProxy. Appends the new
    GcpResource to gcp.target_proxies and returns it.
    """
    if url_map:
        arg_url_map_url = url_map.url
    else:
        arg_url_map_url = gcp.url_maps[0].url
    if gcp.alpha_compute:
        config = {
            'name': name,
            'url_map': arg_url_map_url,
            'validate_for_proxyless': validate_for_proxyless
        }
        logger.debug('Sending GCP request with body=%s', config)
        result = gcp.alpha_compute.targetGrpcProxies().insert(
            project=gcp.project,
            body=config).execute(num_retries=_GCP_API_RETRIES)
    else:
        config = {
            'name': name,
            'url_map': arg_url_map_url,
        }
        logger.debug('Sending GCP request with body=%s', config)
        result = gcp.compute.targetHttpProxies().insert(
            project=gcp.project,
            body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp, result['name'])
    target_proxy = GcpResource(config['name'], result['targetLink'])
    gcp.target_proxies.append(target_proxy)
    return target_proxy
  2469. def create_global_forwarding_rule(gcp,
  2470. name,
  2471. potential_ports,
  2472. potential_ip_addresses=['0.0.0.0'],
  2473. target_proxy=None):
  2474. if target_proxy:
  2475. arg_target_proxy_url = target_proxy.url
  2476. else:
  2477. arg_target_proxy_url = gcp.target_proxies[0].url
  2478. if gcp.alpha_compute:
  2479. compute_to_use = gcp.alpha_compute
  2480. else:
  2481. compute_to_use = gcp.compute
  2482. for port in potential_ports:
  2483. for ip_address in potential_ip_addresses:
  2484. try:
  2485. config = {
  2486. 'name': name,
  2487. 'loadBalancingScheme': 'INTERNAL_SELF_MANAGED',
  2488. 'portRange': str(port),
  2489. 'IPAddress': ip_address,
  2490. 'network': args.network,
  2491. 'target': arg_target_proxy_url,
  2492. }
  2493. logger.debug('Sending GCP request with body=%s', config)
  2494. result = compute_to_use.globalForwardingRules().insert(
  2495. project=gcp.project,
  2496. body=config).execute(num_retries=_GCP_API_RETRIES)
  2497. wait_for_global_operation(gcp, result['name'])
  2498. global_forwarding_rule = GcpResource(config['name'],
  2499. result['targetLink'])
  2500. gcp.global_forwarding_rules.append(global_forwarding_rule)
  2501. gcp.service_port = port
  2502. return
  2503. except googleapiclient.errors.HttpError as http_error:
  2504. logger.warning(
  2505. 'Got error %s when attempting to create forwarding rule to '
  2506. '%s:%d. Retrying with another port.' %
  2507. (http_error, ip_address, port))
  2508. def get_health_check(gcp, health_check_name):
  2509. try:
  2510. result = gcp.compute.healthChecks().get(
  2511. project=gcp.project, healthCheck=health_check_name).execute()
  2512. gcp.health_check = GcpResource(health_check_name, result['selfLink'])
  2513. except Exception as e:
  2514. gcp.errors.append(e)
  2515. gcp.health_check = GcpResource(health_check_name, None)
  2516. def get_health_check_firewall_rule(gcp, firewall_name):
  2517. try:
  2518. result = gcp.compute.firewalls().get(project=gcp.project,
  2519. firewall=firewall_name).execute()
  2520. gcp.health_check_firewall_rule = GcpResource(firewall_name,
  2521. result['selfLink'])
  2522. except Exception as e:
  2523. gcp.errors.append(e)
  2524. gcp.health_check_firewall_rule = GcpResource(firewall_name, None)
  2525. def get_backend_service(gcp, backend_service_name, record_error=True):
  2526. try:
  2527. result = gcp.compute.backendServices().get(
  2528. project=gcp.project, backendService=backend_service_name).execute()
  2529. backend_service = GcpResource(backend_service_name, result['selfLink'])
  2530. except Exception as e:
  2531. if record_error:
  2532. gcp.errors.append(e)
  2533. backend_service = GcpResource(backend_service_name, None)
  2534. gcp.backend_services.append(backend_service)
  2535. return backend_service
  2536. def get_url_map(gcp, url_map_name, record_error=True):
  2537. try:
  2538. result = gcp.compute.urlMaps().get(project=gcp.project,
  2539. urlMap=url_map_name).execute()
  2540. url_map = GcpResource(url_map_name, result['selfLink'])
  2541. gcp.url_maps.append(url_map)
  2542. except Exception as e:
  2543. if record_error:
  2544. gcp.errors.append(e)
def get_target_proxy(gcp, target_proxy_name, record_error=True):
    """Look up a target proxy and append it to gcp.target_proxies.

    Queries targetGrpcProxies under the alpha compute API, otherwise
    targetHttpProxies. Nothing is tracked on failure; the exception is
    appended to gcp.errors only when record_error is True.
    """
    try:
        if gcp.alpha_compute:
            result = gcp.alpha_compute.targetGrpcProxies().get(
                project=gcp.project,
                targetGrpcProxy=target_proxy_name).execute()
        else:
            result = gcp.compute.targetHttpProxies().get(
                project=gcp.project,
                targetHttpProxy=target_proxy_name).execute()
        target_proxy = GcpResource(target_proxy_name, result['selfLink'])
        gcp.target_proxies.append(target_proxy)
    except Exception as e:
        if record_error:
            gcp.errors.append(e)
  2560. def get_global_forwarding_rule(gcp, forwarding_rule_name, record_error=True):
  2561. try:
  2562. result = gcp.compute.globalForwardingRules().get(
  2563. project=gcp.project, forwardingRule=forwarding_rule_name).execute()
  2564. global_forwarding_rule = GcpResource(forwarding_rule_name,
  2565. result['selfLink'])
  2566. gcp.global_forwarding_rules.append(global_forwarding_rule)
  2567. except Exception as e:
  2568. if record_error:
  2569. gcp.errors.append(e)
  2570. def get_instance_template(gcp, template_name):
  2571. try:
  2572. result = gcp.compute.instanceTemplates().get(
  2573. project=gcp.project, instanceTemplate=template_name).execute()
  2574. gcp.instance_template = GcpResource(template_name, result['selfLink'])
  2575. except Exception as e:
  2576. gcp.errors.append(e)
  2577. gcp.instance_template = GcpResource(template_name, None)
  2578. def get_instance_group(gcp, zone, instance_group_name):
  2579. try:
  2580. result = gcp.compute.instanceGroups().get(
  2581. project=gcp.project, zone=zone,
  2582. instanceGroup=instance_group_name).execute()
  2583. gcp.service_port = result['namedPorts'][0]['port']
  2584. instance_group = InstanceGroup(instance_group_name, result['selfLink'],
  2585. zone)
  2586. except Exception as e:
  2587. gcp.errors.append(e)
  2588. instance_group = InstanceGroup(instance_group_name, None, zone)
  2589. gcp.instance_groups.append(instance_group)
  2590. return instance_group
def delete_global_forwarding_rule(gcp, forwarding_rule_to_delete=None):
    """Best-effort delete of one forwarding rule; HttpErrors are logged.

    On success the rule is also removed from gcp.global_forwarding_rules
    if present there.
    """
    if not forwarding_rule_to_delete:
        return
    try:
        logger.debug('Deleting forwarding rule %s',
                     forwarding_rule_to_delete.name)
        result = gcp.compute.globalForwardingRules().delete(
            project=gcp.project,
            forwardingRule=forwarding_rule_to_delete.name).execute(
                num_retries=_GCP_API_RETRIES)
        wait_for_global_operation(gcp, result['name'])
        if forwarding_rule_to_delete in gcp.global_forwarding_rules:
            gcp.global_forwarding_rules.remove(forwarding_rule_to_delete)
        else:
            logger.debug(
                'Forwarding rule %s does not exist in gcp.global_forwarding_rules',
                forwarding_rule_to_delete.name)
    except googleapiclient.errors.HttpError as http_error:
        logger.info('Delete failed: %s', http_error)
  2610. def delete_global_forwarding_rules(gcp):
  2611. forwarding_rules_to_delete = gcp.global_forwarding_rules.copy()
  2612. for forwarding_rule in forwarding_rules_to_delete:
  2613. delete_global_forwarding_rule(gcp, forwarding_rule)
def delete_target_proxy(gcp, proxy_to_delete=None):
    """Best-effort delete of one target proxy; HttpErrors are logged.

    Deletes a targetGrpcProxy under the alpha compute API, otherwise a
    targetHttpProxy. On success the proxy is also removed from
    gcp.target_proxies if present there.
    """
    if not proxy_to_delete:
        return
    try:
        if gcp.alpha_compute:
            logger.debug('Deleting grpc proxy %s', proxy_to_delete.name)
            result = gcp.alpha_compute.targetGrpcProxies().delete(
                project=gcp.project,
                targetGrpcProxy=proxy_to_delete.name).execute(
                    num_retries=_GCP_API_RETRIES)
        else:
            logger.debug('Deleting http proxy %s', proxy_to_delete.name)
            result = gcp.compute.targetHttpProxies().delete(
                project=gcp.project,
                targetHttpProxy=proxy_to_delete.name).execute(
                    num_retries=_GCP_API_RETRIES)
        wait_for_global_operation(gcp, result['name'])
        if proxy_to_delete in gcp.target_proxies:
            gcp.target_proxies.remove(proxy_to_delete)
        else:
            logger.debug('Gcp proxy %s does not exist in gcp.target_proxies',
                         proxy_to_delete.name)
    except googleapiclient.errors.HttpError as http_error:
        logger.info('Delete failed: %s', http_error)
  2638. def delete_target_proxies(gcp):
  2639. target_proxies_to_delete = gcp.target_proxies.copy()
  2640. for target_proxy in target_proxies_to_delete:
  2641. delete_target_proxy(gcp, target_proxy)
def delete_url_map(gcp, url_map_to_delete=None):
    """Best-effort delete of one URL map; HttpErrors are logged.

    On success the map is also removed from gcp.url_maps if present there.
    """
    if not url_map_to_delete:
        return
    try:
        logger.debug('Deleting url map %s', url_map_to_delete.name)
        result = gcp.compute.urlMaps().delete(
            project=gcp.project,
            urlMap=url_map_to_delete.name).execute(num_retries=_GCP_API_RETRIES)
        wait_for_global_operation(gcp, result['name'])
        if url_map_to_delete in gcp.url_maps:
            gcp.url_maps.remove(url_map_to_delete)
        else:
            logger.debug('Url map %s does not exist in gcp.url_maps',
                         url_map_to_delete.name)
    except googleapiclient.errors.HttpError as http_error:
        logger.info('Delete failed: %s', http_error)
  2658. def delete_url_maps(gcp):
  2659. url_maps_to_delete = gcp.url_maps.copy()
  2660. for url_map in url_maps_to_delete:
  2661. delete_url_map(gcp, url_map)
  2662. def delete_backend_service(gcp, backend_service):
  2663. try:
  2664. logger.debug('Deleting backend service %s', backend_service.name)
  2665. result = gcp.compute.backendServices().delete(
  2666. project=gcp.project, backendService=backend_service.name).execute(
  2667. num_retries=_GCP_API_RETRIES)
  2668. wait_for_global_operation(gcp, result['name'])
  2669. except googleapiclient.errors.HttpError as http_error:
  2670. logger.info('Delete failed: %s', http_error)
  2671. def delete_backend_services(gcp):
  2672. for backend_service in gcp.backend_services:
  2673. delete_backend_service(gcp, backend_service)
def delete_firewall(gcp):
    """Best-effort delete of the health-check firewall rule; errors logged."""
    try:
        logger.debug('Deleting firewall %s',
                     gcp.health_check_firewall_rule.name)
        result = gcp.compute.firewalls().delete(
            project=gcp.project,
            firewall=gcp.health_check_firewall_rule.name).execute(
                num_retries=_GCP_API_RETRIES)
        wait_for_global_operation(gcp, result['name'])
    except googleapiclient.errors.HttpError as http_error:
        logger.info('Delete failed: %s', http_error)
def delete_health_check(gcp):
    """Best-effort delete of gcp.health_check; HttpErrors are logged."""
    try:
        logger.debug('Deleting health check %s', gcp.health_check.name)
        result = gcp.compute.healthChecks().delete(
            project=gcp.project, healthCheck=gcp.health_check.name).execute(
                num_retries=_GCP_API_RETRIES)
        wait_for_global_operation(gcp, result['name'])
    except googleapiclient.errors.HttpError as http_error:
        logger.info('Delete failed: %s', http_error)
def delete_instance_groups(gcp):
    """Best-effort delete of every tracked instance group manager.

    Each deletion waits for its zone operation (with the longer backend
    timeout); HttpErrors are logged and the loop continues.
    """
    for instance_group in gcp.instance_groups:
        try:
            logger.debug('Deleting instance group %s %s', instance_group.name,
                         instance_group.zone)
            result = gcp.compute.instanceGroupManagers().delete(
                project=gcp.project,
                zone=instance_group.zone,
                instanceGroupManager=instance_group.name).execute(
                    num_retries=_GCP_API_RETRIES)
            wait_for_zone_operation(gcp,
                                    instance_group.zone,
                                    result['name'],
                                    timeout_sec=_WAIT_FOR_BACKEND_SEC)
        except googleapiclient.errors.HttpError as http_error:
            logger.info('Delete failed: %s', http_error)
def delete_instance_template(gcp):
    """Best-effort delete of gcp.instance_template; HttpErrors are logged."""
    try:
        logger.debug('Deleting instance template %s',
                     gcp.instance_template.name)
        result = gcp.compute.instanceTemplates().delete(
            project=gcp.project,
            instanceTemplate=gcp.instance_template.name).execute(
                num_retries=_GCP_API_RETRIES)
        wait_for_global_operation(gcp, result['name'])
    except googleapiclient.errors.HttpError as http_error:
        logger.info('Delete failed: %s', http_error)
def patch_backend_service(gcp,
                          backend_service,
                          instance_groups,
                          balancing_mode='UTILIZATION',
                          max_rate=1,
                          circuit_breakers=None):
    """Point backend_service at the given instance groups.

    Each group becomes one backend with the given balancing mode;
    maxRate is only meaningful for RATE mode (None is sent otherwise).
    Waits for the patch operation with the longer backend timeout.
    """
    if gcp.alpha_compute:
        compute_to_use = gcp.alpha_compute
    else:
        compute_to_use = gcp.compute
    config = {
        'backends': [{
            'group': instance_group.url,
            'balancingMode': balancing_mode,
            'maxRate': max_rate if balancing_mode == 'RATE' else None
        } for instance_group in instance_groups],
        'circuitBreakers': circuit_breakers,
    }
    logger.debug('Sending GCP request with body=%s', config)
    result = compute_to_use.backendServices().patch(
        project=gcp.project, backendService=backend_service.name,
        body=config).execute(num_retries=_GCP_API_RETRIES)
    wait_for_global_operation(gcp,
                              result['name'],
                              timeout_sec=_WAIT_FOR_BACKEND_SEC)
def resize_instance_group(gcp,
                          instance_group,
                          new_size,
                          timeout_sec=_WAIT_FOR_OPERATION_SEC):
    """Resize a managed instance group and wait until it reaches new_size.

    timeout_sec bounds the wait for the group to reach the new size.
    """
    result = gcp.compute.instanceGroupManagers().resize(
        project=gcp.project,
        zone=instance_group.zone,
        instanceGroupManager=instance_group.name,
        size=new_size).execute(num_retries=_GCP_API_RETRIES)
    # NOTE(review): the zone operation wait uses a hard-coded 360s rather
    # than timeout_sec — confirm whether this is intentional.
    wait_for_zone_operation(gcp,
                            instance_group.zone,
                            result['name'],
                            timeout_sec=360)
    wait_for_instance_group_to_reach_expected_size(gcp, instance_group,
                                                   new_size, timeout_sec)
  2761. def patch_url_map_backend_service(gcp,
  2762. backend_service=None,
  2763. services_with_weights=None,
  2764. route_rules=None,
  2765. url_map=None):
  2766. if url_map:
  2767. url_map_name = url_map.name
  2768. else:
  2769. url_map_name = gcp.url_maps[0].name
  2770. '''change url_map's backend service
  2771. Only one of backend_service and service_with_weights can be not None.
  2772. '''
  2773. if gcp.alpha_compute:
  2774. compute_to_use = gcp.alpha_compute
  2775. else:
  2776. compute_to_use = gcp.compute
  2777. if backend_service and services_with_weights:
  2778. raise ValueError(
  2779. 'both backend_service and service_with_weights are not None.')
  2780. default_service = backend_service.url if backend_service else None
  2781. default_route_action = {
  2782. 'weightedBackendServices': [{
  2783. 'backendService': service.url,
  2784. 'weight': w,
  2785. } for service, w in list(services_with_weights.items())]
  2786. } if services_with_weights else None
  2787. config = {
  2788. 'pathMatchers': [{
  2789. 'name': _PATH_MATCHER_NAME,
  2790. 'defaultService': default_service,
  2791. 'defaultRouteAction': default_route_action,
  2792. 'routeRules': route_rules,
  2793. }]
  2794. }
  2795. logger.debug('Sending GCP request with body=%s', config)
  2796. result = compute_to_use.urlMaps().patch(
  2797. project=gcp.project, urlMap=url_map_name,
  2798. body=config).execute(num_retries=_GCP_API_RETRIES)
  2799. wait_for_global_operation(gcp, result['name'])
  2800. def wait_for_instance_group_to_reach_expected_size(gcp, instance_group,
  2801. expected_size, timeout_sec):
  2802. start_time = time.time()
  2803. while True:
  2804. current_size = len(get_instance_names(gcp, instance_group))
  2805. if current_size == expected_size:
  2806. break
  2807. if time.time() - start_time > timeout_sec:
  2808. raise Exception(
  2809. 'Instance group had expected size %d but actual size %d' %
  2810. (expected_size, current_size))
  2811. time.sleep(2)
  2812. def wait_for_global_operation(gcp,
  2813. operation,
  2814. timeout_sec=_WAIT_FOR_OPERATION_SEC):
  2815. start_time = time.time()
  2816. while time.time() - start_time <= timeout_sec:
  2817. result = gcp.compute.globalOperations().get(
  2818. project=gcp.project,
  2819. operation=operation).execute(num_retries=_GCP_API_RETRIES)
  2820. if result['status'] == 'DONE':
  2821. if 'error' in result:
  2822. raise Exception(result['error'])
  2823. return
  2824. time.sleep(2)
  2825. raise Exception('Operation %s did not complete within %d' %
  2826. (operation, timeout_sec))
  2827. def wait_for_zone_operation(gcp,
  2828. zone,
  2829. operation,
  2830. timeout_sec=_WAIT_FOR_OPERATION_SEC):
  2831. start_time = time.time()
  2832. while time.time() - start_time <= timeout_sec:
  2833. result = gcp.compute.zoneOperations().get(
  2834. project=gcp.project, zone=zone,
  2835. operation=operation).execute(num_retries=_GCP_API_RETRIES)
  2836. if result['status'] == 'DONE':
  2837. if 'error' in result:
  2838. raise Exception(result['error'])
  2839. return
  2840. time.sleep(2)
  2841. raise Exception('Operation %s did not complete within %d' %
  2842. (operation, timeout_sec))
  2843. def wait_for_healthy_backends(gcp,
  2844. backend_service,
  2845. instance_group,
  2846. timeout_sec=_WAIT_FOR_BACKEND_SEC):
  2847. start_time = time.time()
  2848. config = {'group': instance_group.url}
  2849. instance_names = get_instance_names(gcp, instance_group)
  2850. expected_size = len(instance_names)
  2851. while time.time() - start_time <= timeout_sec:
  2852. for instance_name in instance_names:
  2853. try:
  2854. status = get_serving_status(instance_name, gcp.service_port)
  2855. logger.info('serving status response from %s: %s',
  2856. instance_name, status)
  2857. except grpc.RpcError as rpc_error:
  2858. logger.info('checking serving status of %s failed: %s',
  2859. instance_name, rpc_error)
  2860. result = gcp.compute.backendServices().getHealth(
  2861. project=gcp.project,
  2862. backendService=backend_service.name,
  2863. body=config).execute(num_retries=_GCP_API_RETRIES)
  2864. if 'healthStatus' in result:
  2865. logger.info('received GCP healthStatus: %s', result['healthStatus'])
  2866. healthy = True
  2867. for instance in result['healthStatus']:
  2868. if instance['healthState'] != 'HEALTHY':
  2869. healthy = False
  2870. break
  2871. if healthy and expected_size == len(result['healthStatus']):
  2872. return
  2873. else:
  2874. logger.info('no healthStatus received from GCP')
  2875. time.sleep(5)
  2876. raise Exception('Not all backends became healthy within %d seconds: %s' %
  2877. (timeout_sec, result))
  2878. def get_instance_names(gcp, instance_group):
  2879. instance_names = []
  2880. result = gcp.compute.instanceGroups().listInstances(
  2881. project=gcp.project,
  2882. zone=instance_group.zone,
  2883. instanceGroup=instance_group.name,
  2884. body={
  2885. 'instanceState': 'ALL'
  2886. }).execute(num_retries=_GCP_API_RETRIES)
  2887. if 'items' not in result:
  2888. return []
  2889. for item in result['items']:
  2890. # listInstances() returns the full URL of the instance, which ends with
  2891. # the instance name. compute.instances().get() requires using the
  2892. # instance name (not the full URL) to look up instance details, so we
  2893. # just extract the name manually.
  2894. instance_name = item['instance'].split('/')[-1]
  2895. instance_names.append(instance_name)
  2896. logger.info('retrieved instance names: %s', instance_names)
  2897. return instance_names
def clean_up(gcp):
    """Delete all GCP resources tracked on `gcp`.

    Deletion proceeds in reverse order of creation so that resources which
    reference others (forwarding rules -> target proxies -> URL maps ->
    backend services) are removed before the resources they point to.
    Singleton resources are only deleted when they were actually created.
    """
    delete_global_forwarding_rules(gcp)
    delete_target_proxies(gcp)
    delete_url_maps(gcp)
    delete_backend_services(gcp)
    if gcp.health_check_firewall_rule:
        delete_firewall(gcp)
    if gcp.health_check:
        delete_health_check(gcp)
    delete_instance_groups(gcp)
    if gcp.instance_template:
        delete_instance_template(gcp)
  2910. class InstanceGroup(object):
  2911. def __init__(self, name, url, zone):
  2912. self.name = name
  2913. self.url = url
  2914. self.zone = zone
  2915. class GcpResource(object):
  2916. def __init__(self, name, url):
  2917. self.name = name
  2918. self.url = url
  2919. class GcpState(object):
  2920. def __init__(self, compute, alpha_compute, project, project_num):
  2921. self.compute = compute
  2922. self.alpha_compute = alpha_compute
  2923. self.project = project
  2924. self.project_num = project_num
  2925. self.health_check = None
  2926. self.health_check_firewall_rule = None
  2927. self.backend_services = []
  2928. self.url_maps = []
  2929. self.target_proxies = []
  2930. self.global_forwarding_rules = []
  2931. self.service_port = None
  2932. self.instance_template = None
  2933. self.instance_groups = []
  2934. self.errors = []
# Record the (timezone-aware) script start time and local timezone so test
# logs can be correlated with GCP-side events.
logging.debug(
    "script start time: %s",
    datetime.datetime.now(
        datetime.timezone.utc).astimezone().strftime("%Y-%m-%dT%H:%M:%S %Z"))
logging.debug("logging local timezone: %s",
              datetime.datetime.now(datetime.timezone.utc).astimezone().tzinfo)
# Build the compute API clients, either from caller-supplied discovery
# documents or from the live discovery service. The alpha client stays None
# when --only_stable_gcp_apis is set (alpha-only tests are skipped later).
alpha_compute = None
if args.compute_discovery_document:
    with open(args.compute_discovery_document, 'r') as discovery_doc:
        compute = googleapiclient.discovery.build_from_document(
            discovery_doc.read())
    if not args.only_stable_gcp_apis and args.alpha_compute_discovery_document:
        with open(args.alpha_compute_discovery_document, 'r') as discovery_doc:
            alpha_compute = googleapiclient.discovery.build_from_document(
                discovery_doc.read())
else:
    compute = googleapiclient.discovery.build('compute', 'v1')
    if not args.only_stable_gcp_apis:
        alpha_compute = googleapiclient.discovery.build('compute', 'alpha')

# test case name -> [jobset.JobResult], consumed by the junit XML report.
test_results = {}
failed_tests = []
# Main driver: provision (or look up) the GCP resources under test, run each
# requested xDS interop test case against a freshly started test client,
# render a junit XML report, and clean up unless flags say otherwise.
try:
    gcp = GcpState(compute, alpha_compute, args.project_id, args.project_num)
    gcp_suffix = args.gcp_suffix
    health_check_name = _BASE_HEALTH_CHECK_NAME + gcp_suffix
    if not args.use_existing_gcp_resources:
        if args.keep_gcp_resources:
            # Auto-generating a unique suffix in case of conflict should not be
            # combined with --keep_gcp_resources, as the suffix actually used
            # for GCP resources will not match the provided --gcp_suffix value.
            num_attempts = 1
        else:
            num_attempts = 5
        # Creating the health check doubles as a probe for name collisions:
        # on HttpError, retry with a random 4-digit suffix appended.
        for i in range(num_attempts):
            try:
                logger.info('Using GCP suffix %s', gcp_suffix)
                create_health_check(gcp, health_check_name)
                break
            except googleapiclient.errors.HttpError as http_error:
                gcp_suffix = '%s-%04d' % (gcp_suffix, random.randint(0, 9999))
                health_check_name = _BASE_HEALTH_CHECK_NAME + gcp_suffix
                logger.exception('HttpError when creating health check')
        if gcp.health_check is None:
            raise Exception('Failed to create health check name after %d '
                            'attempts' % num_attempts)
    # Derive all remaining resource names from the (possibly regenerated)
    # suffix so one suffix identifies one complete set of resources.
    firewall_name = _BASE_FIREWALL_RULE_NAME + gcp_suffix
    backend_service_name = _BASE_BACKEND_SERVICE_NAME + gcp_suffix
    alternate_backend_service_name = _BASE_BACKEND_SERVICE_NAME + '-alternate' + gcp_suffix
    extra_backend_service_name = _BASE_BACKEND_SERVICE_NAME + '-extra' + gcp_suffix
    more_extra_backend_service_name = _BASE_BACKEND_SERVICE_NAME + '-more-extra' + gcp_suffix
    url_map_name = _BASE_URL_MAP_NAME + gcp_suffix
    url_map_name_2 = url_map_name + '2'
    service_host_name = _BASE_SERVICE_HOST + gcp_suffix
    target_proxy_name = _BASE_TARGET_PROXY_NAME + gcp_suffix
    target_proxy_name_2 = target_proxy_name + '2'
    forwarding_rule_name = _BASE_FORWARDING_RULE_NAME + gcp_suffix
    forwarding_rule_name_2 = forwarding_rule_name + '2'
    template_name = _BASE_TEMPLATE_NAME + gcp_suffix
    instance_group_name = _BASE_INSTANCE_GROUP_NAME + gcp_suffix
    same_zone_instance_group_name = _BASE_INSTANCE_GROUP_NAME + '-same-zone' + gcp_suffix
    secondary_zone_instance_group_name = _BASE_INSTANCE_GROUP_NAME + '-secondary-zone' + gcp_suffix
    # Candidate ports for the forwarding rule, tried in randomized order.
    potential_service_ports = list(args.service_port_range)
    random.shuffle(potential_service_ports)
    if args.use_existing_gcp_resources:
        # Look up a pre-created resource set. Lookups accumulate failures on
        # gcp.errors (except those marked record_error=False, which are
        # optional resources some tests create on demand).
        logger.info('Reusing existing GCP resources')
        get_health_check(gcp, health_check_name)
        get_health_check_firewall_rule(gcp, firewall_name)
        backend_service = get_backend_service(gcp, backend_service_name)
        alternate_backend_service = get_backend_service(
            gcp, alternate_backend_service_name)
        extra_backend_service = get_backend_service(gcp,
                                                    extra_backend_service_name,
                                                    record_error=False)
        more_extra_backend_service = get_backend_service(
            gcp, more_extra_backend_service_name, record_error=False)
        get_url_map(gcp, url_map_name)
        get_target_proxy(gcp, target_proxy_name)
        get_global_forwarding_rule(gcp, forwarding_rule_name)
        get_url_map(gcp, url_map_name_2, record_error=False)
        get_target_proxy(gcp, target_proxy_name_2, record_error=False)
        get_global_forwarding_rule(gcp,
                                   forwarding_rule_name_2,
                                   record_error=False)
        get_instance_template(gcp, template_name)
        instance_group = get_instance_group(gcp, args.zone, instance_group_name)
        same_zone_instance_group = get_instance_group(
            gcp, args.zone, same_zone_instance_group_name)
        secondary_zone_instance_group = get_instance_group(
            gcp, args.secondary_zone, secondary_zone_instance_group_name)
        if gcp.errors:
            raise Exception(gcp.errors)
    else:
        # Create everything from scratch: firewall, backend services, URL
        # map, target proxy, forwarding rule, instance template, and three
        # instance groups (primary, same-zone, secondary-zone).
        create_health_check_firewall_rule(gcp, firewall_name)
        backend_service = add_backend_service(gcp, backend_service_name)
        alternate_backend_service = add_backend_service(
            gcp, alternate_backend_service_name)
        create_url_map(gcp, url_map_name, backend_service, service_host_name)
        create_target_proxy(gcp, target_proxy_name)
        create_global_forwarding_rule(gcp, forwarding_rule_name,
                                      potential_service_ports)
        if not gcp.service_port:
            raise Exception(
                'Failed to find a valid ip:port for the forwarding rule')
        if gcp.service_port != _DEFAULT_SERVICE_PORT:
            # Non-default port must be reflected in the URL map's host rule.
            patch_url_map_host_rule_with_port(gcp, url_map_name,
                                              backend_service,
                                              service_host_name)
        startup_script = get_startup_script(args.path_to_server_binary,
                                            gcp.service_port)
        create_instance_template(gcp, template_name, args.network,
                                 args.source_image, args.machine_type,
                                 startup_script)
        instance_group = add_instance_group(gcp, args.zone, instance_group_name,
                                            _INSTANCE_GROUP_SIZE)
        patch_backend_service(gcp, backend_service, [instance_group])
        same_zone_instance_group = add_instance_group(
            gcp, args.zone, same_zone_instance_group_name, _INSTANCE_GROUP_SIZE)
        secondary_zone_instance_group = add_instance_group(
            gcp, args.secondary_zone, secondary_zone_instance_group_name,
            _INSTANCE_GROUP_SIZE)
        wait_for_healthy_backends(gcp, backend_service, instance_group)

    if args.test_case:
        # Build the client environment: restore any caller-provided gRPC
        # trace/verbosity settings and enable the experimental xDS features
        # the tests rely on.
        client_env = dict(os.environ)
        if original_grpc_trace:
            client_env['GRPC_TRACE'] = original_grpc_trace
        if original_grpc_verbosity:
            client_env['GRPC_VERBOSITY'] = original_grpc_verbosity
        bootstrap_server_features = []
        if gcp.service_port == _DEFAULT_SERVICE_PORT:
            server_uri = service_host_name
        else:
            server_uri = service_host_name + ':' + str(gcp.service_port)
        if args.xds_v3_support:
            client_env['GRPC_XDS_EXPERIMENTAL_V3_SUPPORT'] = 'true'
            bootstrap_server_features.append('xds_v3')
        if args.bootstrap_file:
            bootstrap_path = os.path.abspath(args.bootstrap_file)
        else:
            # Write a one-off xDS bootstrap file (delete=False: the client
            # process reads it after this context manager closes the handle).
            with tempfile.NamedTemporaryFile(delete=False) as bootstrap_file:
                bootstrap_file.write(
                    _BOOTSTRAP_TEMPLATE.format(
                        node_id='projects/%s/networks/%s/nodes/%s' %
                        (gcp.project_num, args.network.split('/')[-1],
                         uuid.uuid1()),
                        server_features=json.dumps(
                            bootstrap_server_features)).encode('utf-8'))
                bootstrap_path = bootstrap_file.name
        client_env['GRPC_XDS_BOOTSTRAP'] = bootstrap_path
        client_env['GRPC_XDS_EXPERIMENTAL_CIRCUIT_BREAKING'] = 'true'
        client_env['GRPC_XDS_EXPERIMENTAL_ENABLE_TIMEOUT'] = 'true'
        client_env['GRPC_XDS_EXPERIMENTAL_FAULT_INJECTION'] = 'true'
        for test_case in args.test_case:
            # Skip cases the current configuration cannot run.
            if test_case in _V3_TEST_CASES and not args.xds_v3_support:
                logger.info('skipping test %s due to missing v3 support',
                            test_case)
                continue
            if test_case in _ALPHA_TEST_CASES and not gcp.alpha_compute:
                logger.info('skipping test %s due to missing alpha support',
                            test_case)
                continue
            if test_case in [
                    'api_listener', 'forwarding_rule_port_match',
                    'forwarding_rule_default_port'
            ] and CLIENT_HOSTS:
                # NOTE(review): these adjacent string literals concatenate
                # without separating spaces, so the logged message reads
                # "...configuration isnot compatible..." — message text is
                # left as-is here.
                logger.info(
                    'skipping test %s because test configuration is'
                    'not compatible with client processes on existing'
                    'client hosts', test_case)
                continue
            if test_case == 'forwarding_rule_default_port':
                server_uri = service_host_name
            result = jobset.JobResult()
            log_dir = os.path.join(_TEST_LOG_BASE_DIR, test_case)
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            test_log_filename = os.path.join(log_dir, _SPONGE_LOG_NAME)
            test_log_file = open(test_log_filename, 'w+')
            client_process = None

            if test_case in _TESTS_TO_RUN_MULTIPLE_RPCS:
                rpcs_to_send = '--rpc="UnaryCall,EmptyCall"'
            else:
                rpcs_to_send = '--rpc="UnaryCall"'

            if test_case in _TESTS_TO_SEND_METADATA:
                metadata_to_send = '--metadata="EmptyCall:{keyE}:{valueE},UnaryCall:{keyU}:{valueU},UnaryCall:{keyNU}:{valueNU}"'.format(
                    keyE=_TEST_METADATA_KEY,
                    valueE=_TEST_METADATA_VALUE_EMPTY,
                    keyU=_TEST_METADATA_KEY,
                    valueU=_TEST_METADATA_VALUE_UNARY,
                    keyNU=_TEST_METADATA_NUMERIC_KEY,
                    valueNU=_TEST_METADATA_NUMERIC_VALUE)
            else:
                # Setting the arg explicitly to empty with '--metadata=""'
                # makes C# client fail
                # (see https://github.com/commandlineparser/commandline/issues/412),
                # so instead we just rely on clients using the default when
                # metadata arg is not specified.
                metadata_to_send = ''

            # TODO(ericgribkoff) Temporarily disable fail_on_failed_rpc checks
            # in the client. This means we will ignore intermittent RPC
            # failures (but this framework still checks that the final result
            # is as expected).
            #
            # Reason for disabling this is, the resources are shared by
            # multiple tests, and a change in previous test could be delayed
            # until the second test starts. The second test may see
            # intermittent failures because of that.
            #
            # A fix is to not share resources between tests (though that does
            # mean the tests will be significantly slower due to creating new
            # resources).
            fail_on_failed_rpc = ''

            try:
                # Start the test client locally unless external client hosts
                # are configured; its output is captured in the sponge log.
                if not CLIENT_HOSTS:
                    client_cmd_formatted = args.client_cmd.format(
                        server_uri=server_uri,
                        stats_port=args.stats_port,
                        qps=args.qps,
                        fail_on_failed_rpc=fail_on_failed_rpc,
                        rpcs_to_send=rpcs_to_send,
                        metadata_to_send=metadata_to_send)
                    logger.debug('running client: %s', client_cmd_formatted)
                    client_cmd = shlex.split(client_cmd_formatted)
                    client_process = subprocess.Popen(client_cmd,
                                                      env=client_env,
                                                      stderr=subprocess.STDOUT,
                                                      stdout=test_log_file)
                # Dispatch to the requested test case implementation. Some
                # cases return a new server_uri for subsequent cases.
                if test_case == 'backends_restart':
                    test_backends_restart(gcp, backend_service, instance_group)
                elif test_case == 'change_backend_service':
                    test_change_backend_service(gcp, backend_service,
                                                instance_group,
                                                alternate_backend_service,
                                                same_zone_instance_group)
                elif test_case == 'gentle_failover':
                    test_gentle_failover(gcp, backend_service, instance_group,
                                         secondary_zone_instance_group)
                elif test_case == 'load_report_based_failover':
                    test_load_report_based_failover(
                        gcp, backend_service, instance_group,
                        secondary_zone_instance_group)
                elif test_case == 'ping_pong':
                    test_ping_pong(gcp, backend_service, instance_group)
                elif test_case == 'remove_instance_group':
                    test_remove_instance_group(gcp, backend_service,
                                               instance_group,
                                               same_zone_instance_group)
                elif test_case == 'round_robin':
                    test_round_robin(gcp, backend_service, instance_group)
                elif test_case == 'secondary_locality_gets_no_requests_on_partial_primary_failure':
                    test_secondary_locality_gets_no_requests_on_partial_primary_failure(
                        gcp, backend_service, instance_group,
                        secondary_zone_instance_group)
                elif test_case == 'secondary_locality_gets_requests_on_primary_failure':
                    test_secondary_locality_gets_requests_on_primary_failure(
                        gcp, backend_service, instance_group,
                        secondary_zone_instance_group)
                elif test_case == 'traffic_splitting':
                    test_traffic_splitting(gcp, backend_service, instance_group,
                                           alternate_backend_service,
                                           same_zone_instance_group)
                elif test_case == 'path_matching':
                    test_path_matching(gcp, backend_service, instance_group,
                                       alternate_backend_service,
                                       same_zone_instance_group)
                elif test_case == 'header_matching':
                    test_header_matching(gcp, backend_service, instance_group,
                                         alternate_backend_service,
                                         same_zone_instance_group)
                elif test_case == 'circuit_breaking':
                    test_circuit_breaking(gcp, backend_service, instance_group,
                                          same_zone_instance_group)
                elif test_case == 'timeout':
                    test_timeout(gcp, backend_service, instance_group)
                elif test_case == 'fault_injection':
                    test_fault_injection(gcp, backend_service, instance_group)
                elif test_case == 'api_listener':
                    server_uri = test_api_listener(gcp, backend_service,
                                                   instance_group,
                                                   alternate_backend_service)
                elif test_case == 'forwarding_rule_port_match':
                    server_uri = test_forwarding_rule_port_match(
                        gcp, backend_service, instance_group)
                elif test_case == 'forwarding_rule_default_port':
                    server_uri = test_forwarding_rule_default_port(
                        gcp, backend_service, instance_group)
                elif test_case == 'metadata_filter':
                    test_metadata_filter(gcp, backend_service, instance_group,
                                         alternate_backend_service,
                                         same_zone_instance_group)
                elif test_case == 'csds':
                    test_csds(gcp, backend_service, instance_group, server_uri)
                else:
                    logger.error('Unknown test case: %s', test_case)
                    sys.exit(1)
                # A client that already exited means it crashed mid-test.
                if client_process and client_process.poll() is not None:
                    raise Exception(
                        'Client process exited prematurely with exit code %d' %
                        client_process.returncode)
                result.state = 'PASSED'
                result.returncode = 0
            except Exception as e:
                logger.exception('Test case %s failed', test_case)
                failed_tests.append(test_case)
                result.state = 'FAILED'
                result.message = str(e)
                if args.halt_after_fail:
                    # Stop the test suite if one case failed.
                    raise
            finally:
                if client_process:
                    if client_process.returncode:
                        logger.info('Client exited with code %d' %
                                    client_process.returncode)
                    else:
                        client_process.terminate()
                test_log_file.close()
                # Workaround for Python 3, as report_utils will invoke decode() on
                # result.message, which has a default value of ''.
                result.message = result.message.encode('UTF-8')
                test_results[test_case] = [result]
                if args.log_client_output:
                    logger.info('Client output:')
                    with open(test_log_filename, 'r') as client_output:
                        logger.info(client_output.read())
        # Render the aggregate junit XML report and fail the run if any test
        # case failed.
        if not os.path.exists(_TEST_LOG_BASE_DIR):
            os.makedirs(_TEST_LOG_BASE_DIR)
        report_utils.render_junit_xml_report(test_results,
                                             os.path.join(
                                                 _TEST_LOG_BASE_DIR,
                                                 _SPONGE_XML_NAME),
                                             suite_name='xds_tests',
                                             multi_target=True)
        if failed_tests:
            logger.error('Test case(s) %s failed', failed_tests)
            sys.exit(1)
finally:
    # Clean up GCP resources unless the caller asked to keep them, or the
    # run halted after a failure (resources are then left in place).
    keep_resources = args.keep_gcp_resources
    if args.halt_after_fail and failed_tests:
        logger.info(
            'Halt after fail triggered, exiting without cleaning up resources')
        keep_resources = True
    if not keep_resources:
        logger.info('Cleaning up GCP resources. This may take some time.')
        clean_up(gcp)