flaky_network_test.cc 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552
  1. /*
  2. *
  3. * Copyright 2019 gRPC authors.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. */
  18. #include <grpc/support/port_platform.h>
  19. #include <algorithm>
  20. #include <condition_variable>
  21. #include <memory>
  22. #include <mutex>
  23. #include <random>
  24. #include <thread>
  25. #include <gtest/gtest.h>
  26. #include "absl/memory/memory.h"
  27. #include <grpc/grpc.h>
  28. #include <grpc/support/alloc.h>
  29. #include <grpc/support/atm.h>
  30. #include <grpc/support/log.h>
  31. #include <grpc/support/string_util.h>
  32. #include <grpc/support/time.h>
  33. #include <grpcpp/channel.h>
  34. #include <grpcpp/client_context.h>
  35. #include <grpcpp/create_channel.h>
  36. #include <grpcpp/health_check_service_interface.h>
  37. #include <grpcpp/server.h>
  38. #include <grpcpp/server_builder.h>
  39. #include "src/core/lib/backoff/backoff.h"
  40. #include "src/core/lib/gpr/env.h"
  41. #include "src/proto/grpc/testing/echo.grpc.pb.h"
  42. #include "test/core/util/port.h"
  43. #include "test/core/util/test_config.h"
  44. #include "test/cpp/end2end/test_service_impl.h"
  45. #include "test/cpp/util/test_credentials_provider.h"
  46. #ifdef GPR_LINUX
  47. using grpc::testing::EchoRequest;
  48. using grpc::testing::EchoResponse;
  49. namespace grpc {
  50. namespace testing {
  51. namespace {
  // One parameterization of the test suite: the credentials type used when
  // creating the channel and server, paired with the text placed in each Echo
  // request.
  struct TestScenario {
    const std::string credentials_type;
    const std::string message_content;

    TestScenario(const std::string& creds_type, const std::string& content)
        : credentials_type{creds_type}, message_content{content} {}
  };
  58. class FlakyNetworkTest : public ::testing::TestWithParam<TestScenario> {
  59. protected:
  60. FlakyNetworkTest()
  61. : server_host_("grpctest"),
  62. interface_("lo:1"),
  63. ipv4_address_("10.0.0.1"),
  64. netmask_("/32") {}
  65. void InterfaceUp() {
  66. std::ostringstream cmd;
  67. // create interface_ with address ipv4_address_
  68. cmd << "ip addr add " << ipv4_address_ << netmask_ << " dev " << interface_;
  69. std::system(cmd.str().c_str());
  70. }
  71. void InterfaceDown() {
  72. std::ostringstream cmd;
  73. // remove interface_
  74. cmd << "ip addr del " << ipv4_address_ << netmask_ << " dev " << interface_;
  75. std::system(cmd.str().c_str());
  76. }
  77. void DNSUp() {
  78. std::ostringstream cmd;
  79. // Add DNS entry for server_host_ in /etc/hosts
  80. cmd << "echo '" << ipv4_address_ << " " << server_host_
  81. << "' >> /etc/hosts";
  82. std::system(cmd.str().c_str());
  83. }
  84. void DNSDown() {
  85. std::ostringstream cmd;
  86. // Remove DNS entry for server_host_ from /etc/hosts
  87. // NOTE: we can't do this in one step with sed -i because when we are
  88. // running under docker, the file is mounted by docker so we can't change
  89. // its inode from within the container (sed -i creates a new file and
  90. // replaces the old file, which changes the inode)
  91. cmd << "sed '/" << server_host_ << "/d' /etc/hosts > /etc/hosts.orig";
  92. std::system(cmd.str().c_str());
  93. // clear the stream
  94. cmd.str("");
  95. cmd << "cat /etc/hosts.orig > /etc/hosts";
  96. std::system(cmd.str().c_str());
  97. }
  98. void DropPackets() {
  99. std::ostringstream cmd;
  100. // drop packets with src IP = ipv4_address_
  101. cmd << "iptables -A INPUT -s " << ipv4_address_ << " -j DROP";
  102. std::system(cmd.str().c_str());
  103. // clear the stream
  104. cmd.str("");
  105. // drop packets with dst IP = ipv4_address_
  106. cmd << "iptables -A INPUT -d " << ipv4_address_ << " -j DROP";
  107. }
  108. void RestoreNetwork() {
  109. std::ostringstream cmd;
  110. // remove iptables rule to drop packets with src IP = ipv4_address_
  111. cmd << "iptables -D INPUT -s " << ipv4_address_ << " -j DROP";
  112. std::system(cmd.str().c_str());
  113. // clear the stream
  114. cmd.str("");
  115. // remove iptables rule to drop packets with dest IP = ipv4_address_
  116. cmd << "iptables -D INPUT -d " << ipv4_address_ << " -j DROP";
  117. }
  118. void FlakeNetwork() {
  119. std::ostringstream cmd;
  120. // Emulate a flaky network connection over interface_. Add a delay of 100ms
  121. // +/- 20ms, 0.1% packet loss, 1% duplicates and 0.01% corrupt packets.
  122. cmd << "tc qdisc replace dev " << interface_
  123. << " root netem delay 100ms 20ms distribution normal loss 0.1% "
  124. "duplicate "
  125. "0.1% corrupt 0.01% ";
  126. std::system(cmd.str().c_str());
  127. }
  128. void UnflakeNetwork() {
  129. // Remove simulated network flake on interface_
  130. std::ostringstream cmd;
  131. cmd << "tc qdisc del dev " << interface_ << " root netem";
  132. std::system(cmd.str().c_str());
  133. }
  134. void NetworkUp() {
  135. InterfaceUp();
  136. DNSUp();
  137. }
  138. void NetworkDown() {
  139. InterfaceDown();
  140. DNSDown();
  141. }
  142. void SetUp() override {
  143. NetworkUp();
  144. grpc_init();
  145. StartServer();
  146. }
  147. void TearDown() override {
  148. NetworkDown();
  149. StopServer();
  150. grpc_shutdown();
  151. }
  152. void StartServer() {
  153. // TODO (pjaikumar): Ideally, we should allocate the port dynamically using
  154. // grpc_pick_unused_port_or_die(). That doesn't work inside some docker
  155. // containers because port_server listens on localhost which maps to
  156. // ip6-looopback, but ipv6 support is not enabled by default in docker.
  157. port_ = SERVER_PORT;
  158. server_ = absl::make_unique<ServerData>(port_, GetParam().credentials_type);
  159. server_->Start(server_host_);
  160. }
  161. void StopServer() { server_->Shutdown(); }
  162. std::unique_ptr<grpc::testing::EchoTestService::Stub> BuildStub(
  163. const std::shared_ptr<Channel>& channel) {
  164. return grpc::testing::EchoTestService::NewStub(channel);
  165. }
  166. std::shared_ptr<Channel> BuildChannel(
  167. const std::string& lb_policy_name,
  168. ChannelArguments args = ChannelArguments()) {
  169. if (!lb_policy_name.empty()) {
  170. args.SetLoadBalancingPolicyName(lb_policy_name);
  171. } // else, default to pick first
  172. auto channel_creds = GetCredentialsProvider()->GetChannelCredentials(
  173. GetParam().credentials_type, &args);
  174. std::ostringstream server_address;
  175. server_address << server_host_ << ":" << port_;
  176. return CreateCustomChannel(server_address.str(), channel_creds, args);
  177. }
  178. bool SendRpc(
  179. const std::unique_ptr<grpc::testing::EchoTestService::Stub>& stub,
  180. int timeout_ms = 0, bool wait_for_ready = false) {
  181. auto response = absl::make_unique<EchoResponse>();
  182. EchoRequest request;
  183. auto& msg = GetParam().message_content;
  184. request.set_message(msg);
  185. ClientContext context;
  186. if (timeout_ms > 0) {
  187. context.set_deadline(grpc_timeout_milliseconds_to_deadline(timeout_ms));
  188. // Allow an RPC to be canceled (for deadline exceeded) after it has
  189. // reached the server.
  190. request.mutable_param()->set_skip_cancelled_check(true);
  191. }
  192. // See https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md for
  193. // details of wait-for-ready semantics
  194. if (wait_for_ready) {
  195. context.set_wait_for_ready(true);
  196. }
  197. Status status = stub->Echo(&context, request, response.get());
  198. auto ok = status.ok();
  199. if (ok) {
  200. gpr_log(GPR_DEBUG, "RPC succeeded");
  201. } else {
  202. gpr_log(GPR_DEBUG, "RPC failed: %s", status.error_message().c_str());
  203. }
  204. return ok;
  205. }
  206. struct ServerData {
  207. int port_;
  208. const std::string creds_;
  209. std::unique_ptr<Server> server_;
  210. TestServiceImpl service_;
  211. std::unique_ptr<std::thread> thread_;
  212. bool server_ready_ = false;
  213. ServerData(int port, const std::string& creds)
  214. : port_(port), creds_(creds) {}
  215. void Start(const std::string& server_host) {
  216. gpr_log(GPR_INFO, "starting server on port %d", port_);
  217. std::mutex mu;
  218. std::unique_lock<std::mutex> lock(mu);
  219. std::condition_variable cond;
  220. thread_ = absl::make_unique<std::thread>(
  221. std::bind(&ServerData::Serve, this, server_host, &mu, &cond));
  222. cond.wait(lock, [this] { return server_ready_; });
  223. server_ready_ = false;
  224. gpr_log(GPR_INFO, "server startup complete");
  225. }
  226. void Serve(const std::string& server_host, std::mutex* mu,
  227. std::condition_variable* cond) {
  228. std::ostringstream server_address;
  229. server_address << server_host << ":" << port_;
  230. ServerBuilder builder;
  231. auto server_creds =
  232. GetCredentialsProvider()->GetServerCredentials(creds_);
  233. builder.AddListeningPort(server_address.str(), server_creds);
  234. builder.RegisterService(&service_);
  235. server_ = builder.BuildAndStart();
  236. std::lock_guard<std::mutex> lock(*mu);
  237. server_ready_ = true;
  238. cond->notify_one();
  239. }
  240. void Shutdown() {
  241. server_->Shutdown(grpc_timeout_milliseconds_to_deadline(0));
  242. thread_->join();
  243. }
  244. };
  245. bool WaitForChannelNotReady(Channel* channel, int timeout_seconds = 5) {
  246. const gpr_timespec deadline =
  247. grpc_timeout_seconds_to_deadline(timeout_seconds);
  248. grpc_connectivity_state state;
  249. while ((state = channel->GetState(false /* try_to_connect */)) ==
  250. GRPC_CHANNEL_READY) {
  251. if (!channel->WaitForStateChange(state, deadline)) return false;
  252. }
  253. return true;
  254. }
  255. bool WaitForChannelReady(Channel* channel, int timeout_seconds = 5) {
  256. const gpr_timespec deadline =
  257. grpc_timeout_seconds_to_deadline(timeout_seconds);
  258. grpc_connectivity_state state;
  259. while ((state = channel->GetState(true /* try_to_connect */)) !=
  260. GRPC_CHANNEL_READY) {
  261. if (!channel->WaitForStateChange(state, deadline)) return false;
  262. }
  263. return true;
  264. }
  265. private:
  266. const std::string server_host_;
  267. const std::string interface_;
  268. const std::string ipv4_address_;
  269. const std::string netmask_;
  270. std::unique_ptr<grpc::testing::EchoTestService::Stub> stub_;
  271. std::unique_ptr<ServerData> server_;
  272. const int SERVER_PORT = 32750;
  273. int port_;
  274. };
  275. std::vector<TestScenario> CreateTestScenarios() {
  276. std::vector<TestScenario> scenarios;
  277. std::vector<std::string> credentials_types;
  278. std::vector<std::string> messages;
  279. credentials_types.push_back(kInsecureCredentialsType);
  280. auto sec_list = GetCredentialsProvider()->GetSecureCredentialsTypeList();
  281. for (auto sec = sec_list.begin(); sec != sec_list.end(); sec++) {
  282. credentials_types.push_back(*sec);
  283. }
  284. messages.push_back("🖖");
  285. for (size_t k = 1; k < GRPC_DEFAULT_MAX_RECV_MESSAGE_LENGTH / 1024; k *= 32) {
  286. std::string big_msg;
  287. for (size_t i = 0; i < k * 1024; ++i) {
  288. char c = 'a' + (i % 26);
  289. big_msg += c;
  290. }
  291. messages.push_back(big_msg);
  292. }
  293. for (auto cred = credentials_types.begin(); cred != credentials_types.end();
  294. ++cred) {
  295. for (auto msg = messages.begin(); msg != messages.end(); msg++) {
  296. scenarios.emplace_back(*cred, *msg);
  297. }
  298. }
  299. return scenarios;
  300. }
  301. INSTANTIATE_TEST_SUITE_P(FlakyNetworkTest, FlakyNetworkTest,
  302. ::testing::ValuesIn(CreateTestScenarios()));
  303. // Network interface connected to server flaps
  304. TEST_P(FlakyNetworkTest, NetworkTransition) {
  305. const int kKeepAliveTimeMs = 1000;
  306. const int kKeepAliveTimeoutMs = 1000;
  307. ChannelArguments args;
  308. args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
  309. args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
  310. args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
  311. args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
  312. auto channel = BuildChannel("pick_first", args);
  313. auto stub = BuildStub(channel);
  314. // Channel should be in READY state after we send an RPC
  315. EXPECT_TRUE(SendRpc(stub));
  316. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  317. std::atomic_bool shutdown{false};
  318. std::thread sender = std::thread([this, &stub, &shutdown]() {
  319. while (true) {
  320. if (shutdown.load()) {
  321. return;
  322. }
  323. SendRpc(stub);
  324. std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  325. }
  326. });
  327. // bring down network
  328. NetworkDown();
  329. EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
  330. // bring network interface back up
  331. InterfaceUp();
  332. std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  333. // Restore DNS entry for server
  334. DNSUp();
  335. EXPECT_TRUE(WaitForChannelReady(channel.get()));
  336. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  337. shutdown.store(true);
  338. sender.join();
  339. }
  340. // Traffic to server server is blackholed temporarily with keepalives enabled
  341. TEST_P(FlakyNetworkTest, ServerUnreachableWithKeepalive) {
  342. const int kKeepAliveTimeMs = 1000;
  343. const int kKeepAliveTimeoutMs = 1000;
  344. const int kReconnectBackoffMs = 1000;
  345. ChannelArguments args;
  346. args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
  347. args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
  348. args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
  349. args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
  350. // max time for a connection attempt
  351. args.SetInt(GRPC_ARG_MIN_RECONNECT_BACKOFF_MS, kReconnectBackoffMs);
  352. // max time between reconnect attempts
  353. args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, kReconnectBackoffMs);
  354. gpr_log(GPR_DEBUG, "FlakyNetworkTest.ServerUnreachableWithKeepalive start");
  355. auto channel = BuildChannel("pick_first", args);
  356. auto stub = BuildStub(channel);
  357. // Channel should be in READY state after we send an RPC
  358. EXPECT_TRUE(SendRpc(stub));
  359. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  360. std::atomic_bool shutdown{false};
  361. std::thread sender = std::thread([this, &stub, &shutdown]() {
  362. while (true) {
  363. if (shutdown.load()) {
  364. return;
  365. }
  366. SendRpc(stub);
  367. std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  368. }
  369. });
  370. // break network connectivity
  371. gpr_log(GPR_DEBUG, "Adding iptables rule to drop packets");
  372. DropPackets();
  373. std::this_thread::sleep_for(std::chrono::milliseconds(10000));
  374. EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
  375. // bring network interface back up
  376. RestoreNetwork();
  377. gpr_log(GPR_DEBUG, "Removed iptables rule to drop packets");
  378. EXPECT_TRUE(WaitForChannelReady(channel.get()));
  379. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  380. shutdown.store(true);
  381. sender.join();
  382. gpr_log(GPR_DEBUG, "FlakyNetworkTest.ServerUnreachableWithKeepalive end");
  383. }
  384. //
  385. // Traffic to server server is blackholed temporarily with keepalives disabled
  386. TEST_P(FlakyNetworkTest, ServerUnreachableNoKeepalive) {
  387. auto channel = BuildChannel("pick_first", ChannelArguments());
  388. auto stub = BuildStub(channel);
  389. // Channel should be in READY state after we send an RPC
  390. EXPECT_TRUE(SendRpc(stub));
  391. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  392. // break network connectivity
  393. DropPackets();
  394. std::thread sender = std::thread([this, &stub]() {
  395. // RPC with deadline should timeout
  396. EXPECT_FALSE(SendRpc(stub, /*timeout_ms=*/500, /*wait_for_ready=*/true));
  397. // RPC without deadline forever until call finishes
  398. EXPECT_TRUE(SendRpc(stub, /*timeout_ms=*/0, /*wait_for_ready=*/true));
  399. });
  400. std::this_thread::sleep_for(std::chrono::milliseconds(2000));
  401. // bring network interface back up
  402. RestoreNetwork();
  403. // wait for RPC to finish
  404. sender.join();
  405. }
  406. // Send RPCs over a flaky network connection
  407. TEST_P(FlakyNetworkTest, FlakyNetwork) {
  408. const int kKeepAliveTimeMs = 1000;
  409. const int kKeepAliveTimeoutMs = 1000;
  410. const int kMessageCount = 100;
  411. ChannelArguments args;
  412. args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
  413. args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
  414. args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
  415. args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
  416. auto channel = BuildChannel("pick_first", args);
  417. auto stub = BuildStub(channel);
  418. // Channel should be in READY state after we send an RPC
  419. EXPECT_TRUE(SendRpc(stub));
  420. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  421. // simulate flaky network (packet loss, corruption and delays)
  422. FlakeNetwork();
  423. for (int i = 0; i < kMessageCount; ++i) {
  424. SendRpc(stub);
  425. }
  426. // remove network flakiness
  427. UnflakeNetwork();
  428. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  429. }
  430. // Server is shutdown gracefully and restarted. Client keepalives are enabled
  431. TEST_P(FlakyNetworkTest, ServerRestartKeepaliveEnabled) {
  432. const int kKeepAliveTimeMs = 1000;
  433. const int kKeepAliveTimeoutMs = 1000;
  434. ChannelArguments args;
  435. args.SetInt(GRPC_ARG_KEEPALIVE_TIME_MS, kKeepAliveTimeMs);
  436. args.SetInt(GRPC_ARG_KEEPALIVE_TIMEOUT_MS, kKeepAliveTimeoutMs);
  437. args.SetInt(GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1);
  438. args.SetInt(GRPC_ARG_HTTP2_MAX_PINGS_WITHOUT_DATA, 0);
  439. auto channel = BuildChannel("pick_first", args);
  440. auto stub = BuildStub(channel);
  441. // Channel should be in READY state after we send an RPC
  442. EXPECT_TRUE(SendRpc(stub));
  443. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  444. // server goes down, client should detect server going down and calls should
  445. // fail
  446. StopServer();
  447. EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
  448. EXPECT_FALSE(SendRpc(stub));
  449. std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  450. // server restarts, calls succeed
  451. StartServer();
  452. EXPECT_TRUE(WaitForChannelReady(channel.get()));
  453. // EXPECT_TRUE(SendRpc(stub));
  454. }
  455. // Server is shutdown gracefully and restarted. Client keepalives are enabled
  456. TEST_P(FlakyNetworkTest, ServerRestartKeepaliveDisabled) {
  457. auto channel = BuildChannel("pick_first", ChannelArguments());
  458. auto stub = BuildStub(channel);
  459. // Channel should be in READY state after we send an RPC
  460. EXPECT_TRUE(SendRpc(stub));
  461. EXPECT_EQ(channel->GetState(false), GRPC_CHANNEL_READY);
  462. // server sends GOAWAY when it's shutdown, so client attempts to reconnect
  463. StopServer();
  464. std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  465. EXPECT_TRUE(WaitForChannelNotReady(channel.get()));
  466. std::this_thread::sleep_for(std::chrono::milliseconds(1000));
  467. // server restarts, calls succeed
  468. StartServer();
  469. EXPECT_TRUE(WaitForChannelReady(channel.get()));
  470. }
  471. } // namespace
  472. } // namespace testing
  473. } // namespace grpc
  474. #endif // GPR_LINUX
  475. int main(int argc, char** argv) {
  476. ::testing::InitGoogleTest(&argc, argv);
  477. grpc::testing::TestEnvironment env(argc, argv);
  478. auto result = RUN_ALL_TESTS();
  479. return result;
  480. }