grpc_aws_run_remote_test.sh 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. #!/usr/bin/env bash
  2. # Copyright 2021 The gRPC Authors
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. # WARNING: this script has been reviewed by the security team, so
  16. # any changes need to be made with great care.
  17. # Contact @jtattermusch or @amidlash if in doubt.
  18. # This script is responsible for remotely running tests on an ARM instance.
  19. # At the start, it provisions a new AWS ARM64 instance and then uses
  20. # it to execute a test script (and cleans up afterwards).
  21. # It should return a status code useful to the kokoro infrastructure.
  22. # TODO(jtattermusch): make the script safe to run under "set -ex"
  # Abort as soon as any unguarded command fails.
  set -e

  # The AWS credentials are read from the kokoro keystore directory, so there
  # is nothing useful this script can do when KOKORO_KEYSTORE_DIR is missing
  # or empty. The ":-" default keeps this check working under "set -u" too.
  if [ -z "${KOKORO_KEYSTORE_DIR:-}" ]; then
    echo "KOKORO_KEYSTORE_DIR is unset. This must be run from kokoro"
    exit 1
  fi
  # Credentials file inside the kokoro keystore directory.
  AWS_CREDENTIALS="${KOKORO_KEYSTORE_DIR}/73836_grpc_aws_ec2_credentials"

  # Setup aws cli
  curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
  unzip -q awscliv2.zip
  sudo ./aws/install
  aws --version

  # authenticate with aws cli
  # -p: do not let "set -e" abort the script when ~/.aws already exists.
  mkdir -p ~/.aws/
  echo "[default]" >> ~/.aws/config
  # Quoted so that a keystore path containing whitespace stays one argument.
  ln -s "${AWS_CREDENTIALS}" ~/.aws/credentials

  # jq is used later to extract fields from the JSON printed by the aws cli
  sudo apt update && sudo apt install -y jq
  # Settings of the AWS instance that gets provisioned for the test run.

  # ubuntu 18.04 lts(arm64)
  # https://aws.amazon.com/amazon-linux-ami/
  AWS_MACHINE_IMAGE=ami-026141f3d5c6d2d0c
  # use 4-core instance by default; callers may override it through the
  # AWS_INSTANCE_TYPE environment variable
  AWS_INSTANCE_TYPE=${AWS_INSTANCE_TYPE:-t4g.xlarge}
  AWS_SECURITY_GROUP=sg-021240e886feba750
  # Max allowed lifespan of the AWS instance. After this period of time, the
  # instance will self-terminate (delete itself). This is very important to
  # ensure that there will be no orphaned AWS instances if the initiating
  # kokoro job fails / gets cancelled etc.
  AWS_INSTANCE_MAX_LIFESPAN_MINS=120
  # increase the size of the root volume so that builds don't run out of disk space
  AWS_STORAGE_SIZE_GB=75
  AWS_DEVICE_MAPPING="DeviceName='/dev/sda1',Ebs={VolumeSize=${AWS_STORAGE_SIZE_GB}}"
  # Tags that tie the instance to the kokoro job and build that created it.
  AWS_INSTANCE_TAGS="ResourceType='instance',Tags=[{Key='kokoro_job_name',Value='${KOKORO_JOB_NAME}'},{Key='kokoro_build_number',Value='${KOKORO_BUILD_NUMBER}'},{Key='kokoro_aws_integration',Value='true'}]"
  # Create two throwaway key pairs for this run: a client key for logging in
  # to the instance, and a server (host) key that is handed to the instance
  # through the cloud-init user data written below.
  ssh-keygen -N '' -t rsa -b 4096 -f ~/.ssh/temp_client_key
  ssh-keygen -N '' -t ecdsa -b 256 -f ~/.ssh/temp_server_key

  # The private key is embedded as a YAML literal block scalar ("|"). Its
  # lines must be indented deeper than the "ecdsa_private" key they belong
  # to, otherwise the user data is not valid YAML; hence the 4-space prefix.
  SERVER_PRIVATE_KEY=$(sed 's/^/    /' ~/.ssh/temp_server_key)
  # Keep key type and key material only; the comment field is replaced/dropped.
  SERVER_PUBLIC_KEY=$(awk '{print $1 " " $2 " root@localhost"}' ~/.ssh/temp_server_key.pub)
  SERVER_HOST_KEY_ENTRY=$(awk '{print $1 " " $2}' ~/.ssh/temp_server_key.pub)
  CLIENT_PUBLIC_KEY=$(cat ~/.ssh/temp_client_key.pub)

  # Write the cloud-init user data file (overwriting any previous one):
  #  * authorize the temporary client key,
  #  * install the pre-generated ecdsa host key pair,
  #  * schedule a shutdown after the max lifespan has elapsed.
  {
    echo '#cloud-config'
    echo 'ssh_authorized_keys:'
    echo " - $CLIENT_PUBLIC_KEY"
    echo 'ssh_keys:'
    echo '  ecdsa_private: |'
    echo "$SERVER_PRIVATE_KEY"
    echo "  ecdsa_public: $SERVER_PUBLIC_KEY"
    echo ''
    echo 'runcmd:'
    echo " - sleep ${AWS_INSTANCE_MAX_LIFESPAN_MINS}m"
    echo ' - shutdown'
  } > userdata
  # Launch the instance. It is configured to terminate when it shuts itself
  # down, and the user data file is passed to it at creation time.
  # jq -r prints the raw string (no surrounding quotes); the filter is quoted
  # so that the shell never treats "[0]" as a glob pattern.
  ID=$(aws ec2 run-instances --image-id "$AWS_MACHINE_IMAGE" --instance-initiated-shutdown-behavior=terminate \
    --instance-type "$AWS_INSTANCE_TYPE" \
    --security-group-ids "$AWS_SECURITY_GROUP" \
    --user-data file://userdata \
    --block-device-mapping "$AWS_DEVICE_MAPPING" \
    --tag-specifications "$AWS_INSTANCE_TAGS" \
    --region us-east-2 | jq -r '.Instances[0].InstanceId')
  # A failure of "aws" is hidden by the pipeline (only jq's status counts),
  # so verify explicitly that an instance id came back.
  if [ -z "$ID" ] || [ "$ID" = "null" ]; then
    echo "Failed to obtain the id of the newly created AWS instance."
    exit 1
  fi
  echo "instance-id=$ID"

  echo "Waiting 1m for instance ip..."
  sleep 1m
  IP=$(aws ec2 describe-instances \
    --instance-id="$ID" \
    --region us-east-2 | jq -r '.Reservations[0].Instances[0].NetworkInterfaces[0].Association.PublicIp')
  # Same reasoning as for the instance id: stop early instead of waiting and
  # then failing on an unusable address.
  if [ -z "$IP" ] || [ "$IP" = "null" ]; then
    echo "Failed to obtain the public ip of AWS instance $ID."
    exit 1
  fi

  # Register the pre-generated server host key for the instance's address.
  SERVER_HOST_KEY_ENTRY="$IP $SERVER_HOST_KEY_ENTRY"
  printf '%s\n' "$SERVER_HOST_KEY_ENTRY" >> ~/.ssh/known_hosts

  echo "Waiting 2m for instance to initialize..."
  sleep 2m
  echo "Copying workspace to remote instance..."
  # use rsync over ssh since it's much faster than scp
  # The destination is quoted so the expanded address is always one argument;
  # "~" is left for the remote side to resolve.
  time rsync -e "ssh -i ~/.ssh/temp_client_key" -a github/grpc "ubuntu@${IP}:~/workspace"
  echo "Beginning CI workload..."

  # filename of the test script to execute remotely, relative to gRPC repository root
  # use a default value if the env variable is not set
  REMOTE_WORKLOAD_SCRIPT=${REMOTE_WORKLOAD_SCRIPT:-tools/internal_ci/linux/aws/grpc_aws_experiment_remote.sh}

  # run remote workload script in the background, with redirected stdout and stderr
  # to avoid problems with ssh session not closing after the remote script finishes
  # but stdout and stderr are still open because the remote has spawned subprocesses
  # that keep stdout and stderr open.
  # * PID of the process that executes the remote script will be stored in aws_build.pid
  # * stderr and stdout will be streamed to aws_build.log
  # * once done, the exitcode of the remote script will be in aws_build.exitcode
  # The escaped \$? and \$! stay literal here so that they are expanded by the
  # shell that runs this command string, not by this script.
  REMOTE_WORKLOAD_COMMAND="nohup bash -c '(bash grpc/${REMOTE_WORKLOAD_SCRIPT}; echo \$? >/tmp/aws_build.exitcode) >>/tmp/aws_build.log 2>&1' >/dev/null 2>&1 & echo \$! >/tmp/aws_build.pid"

  # Full command line for the ssh session: clear the files of any previous
  # run, start the workload, then follow the log.
  # the tail command simply streams the contents of aws_build.log as they become available
  # and stops when the remote workload exits (determined based on the PID)
  # The session finally exits with the exitcode recorded by the workload.
  SSH_COMMAND='uname -a; rm -f /tmp/aws_build.log /tmp/aws_build.exitcode /tmp/aws_build.pid; touch /tmp/aws_build.log; cd ~/workspace; '"${REMOTE_WORKLOAD_COMMAND};"' tail -f /tmp/aws_build.log --pid $(cat /tmp/aws_build.pid); exit $(cat /tmp/aws_build.exitcode)'
  # Run the workload over ssh. The "|| ..." both records a non-zero exit
  # status and keeps "set -e" from aborting, so the steps after this one
  # still run when the remote script fails.
  REMOTE_SCRIPT_EXITCODE=0
  time ssh -i ~/.ssh/temp_client_key "ubuntu@${IP}" "${SSH_COMMAND}" || REMOTE_SCRIPT_EXITCODE=$?
  echo "Copying artifacts from the remote instance..."
  ARTIFACT_RSYNC_PATTERN="**/*sponge_log.*"
  # NOTE: the include "*/" rule and --prune-empty-dirs are important for not
  # excluding parent directories that contain artifacts before they get a
  # chance to be examined (see man rsync)
  # A failed copy is recorded rather than aborting the script.
  COPY_ARTIFACTS_EXITCODE=0
  time rsync -av -e "ssh -i ~/.ssh/temp_client_key" --include="${ARTIFACT_RSYNC_PATTERN}" --include="*/" --exclude="*" --prune-empty-dirs "ubuntu@${IP}:~/workspace/grpc" github || COPY_ARTIFACTS_EXITCODE=$?
  # Regardless of the remote script's result (success or failure), initiate shutdown of AWS instance a minute from now.
  # The small delay is useful to make sure the ssh session doesn't hang up on us if shutdown happens too quickly.
  # A failure to schedule the shutdown only produces a warning.
  echo "Shutting down instance $ID."
  ssh -i ~/.ssh/temp_client_key "ubuntu@${IP}" "sudo shutdown +1" || echo "WARNING: Failed to initiate AWS instance shutdown."

  # A failed artifact copy decides the exit code only when the remote script
  # itself has passed; in every other case the remote script's code is used.
  if [ "$REMOTE_SCRIPT_EXITCODE" == "0" ] && [ "$COPY_ARTIFACTS_EXITCODE" != "0" ]; then
    echo "Exiting with exitcode $COPY_ARTIFACTS_EXITCODE since remote script has passed, but copying artifacts has failed."
    exit "$COPY_ARTIFACTS_EXITCODE"
  fi

  # Match exitcode
  echo "Exiting with exitcode $REMOTE_SCRIPT_EXITCODE based on remote script output."
  exit "$REMOTE_SCRIPT_EXITCODE"