big_query_utils.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. #!/usr/bin/env python2.7
  2. from __future__ import print_function
  3. import argparse
  4. import json
  5. import uuid
  6. import httplib2
  7. from apiclient import discovery
  8. from apiclient.errors import HttpError
  9. from oauth2client.client import GoogleCredentials
# Default partition expiration used by create_partitioned_table:
# 30 days in milliseconds (days * hours * minutes * seconds * ms).
_EXPIRATION_MS = 30 * 24 * 60 * 60 * 1000
# Value passed as num_retries to the execute() calls made by the helpers below.
NUM_RETRIES = 3
  13. def create_big_query():
  14. """Authenticates with cloud platform and gets a BiqQuery service object
  15. """
  16. creds = GoogleCredentials.get_application_default()
  17. return discovery.build(
  18. 'bigquery', 'v2', credentials=creds, cache_discovery=False)
  19. def create_dataset(biq_query, project_id, dataset_id):
  20. is_success = True
  21. body = {
  22. 'datasetReference': {
  23. 'projectId': project_id,
  24. 'datasetId': dataset_id
  25. }
  26. }
  27. try:
  28. dataset_req = biq_query.datasets().insert(
  29. projectId=project_id, body=body)
  30. dataset_req.execute(num_retries=NUM_RETRIES)
  31. except HttpError as http_error:
  32. if http_error.resp.status == 409:
  33. print('Warning: The dataset %s already exists' % dataset_id)
  34. else:
  35. # Note: For more debugging info, print "http_error.content"
  36. print('Error in creating dataset: %s. Err: %s' % (dataset_id,
  37. http_error))
  38. is_success = False
  39. return is_success
  40. def create_table(big_query, project_id, dataset_id, table_id, table_schema,
  41. description):
  42. fields = [{
  43. 'name': field_name,
  44. 'type': field_type,
  45. 'description': field_description
  46. } for (field_name, field_type, field_description) in table_schema]
  47. return create_table2(big_query, project_id, dataset_id, table_id, fields,
  48. description)
  49. def create_partitioned_table(big_query,
  50. project_id,
  51. dataset_id,
  52. table_id,
  53. table_schema,
  54. description,
  55. partition_type='DAY',
  56. expiration_ms=_EXPIRATION_MS):
  57. """Creates a partitioned table. By default, a date-paritioned table is created with
  58. each partition lasting 30 days after it was last modified.
  59. """
  60. fields = [{
  61. 'name': field_name,
  62. 'type': field_type,
  63. 'description': field_description
  64. } for (field_name, field_type, field_description) in table_schema]
  65. return create_table2(big_query, project_id, dataset_id, table_id, fields,
  66. description, partition_type, expiration_ms)
  67. def create_table2(big_query,
  68. project_id,
  69. dataset_id,
  70. table_id,
  71. fields_schema,
  72. description,
  73. partition_type=None,
  74. expiration_ms=None):
  75. is_success = True
  76. body = {
  77. 'description': description,
  78. 'schema': {
  79. 'fields': fields_schema
  80. },
  81. 'tableReference': {
  82. 'datasetId': dataset_id,
  83. 'projectId': project_id,
  84. 'tableId': table_id
  85. }
  86. }
  87. if partition_type and expiration_ms:
  88. body["timePartitioning"] = {
  89. "type": partition_type,
  90. "expirationMs": expiration_ms
  91. }
  92. try:
  93. table_req = big_query.tables().insert(
  94. projectId=project_id, datasetId=dataset_id, body=body)
  95. res = table_req.execute(num_retries=NUM_RETRIES)
  96. print('Successfully created %s "%s"' % (res['kind'], res['id']))
  97. except HttpError as http_error:
  98. if http_error.resp.status == 409:
  99. print('Warning: Table %s already exists' % table_id)
  100. else:
  101. print('Error in creating table: %s. Err: %s' % (table_id,
  102. http_error))
  103. is_success = False
  104. return is_success
  105. def patch_table(big_query, project_id, dataset_id, table_id, fields_schema):
  106. is_success = True
  107. body = {
  108. 'schema': {
  109. 'fields': fields_schema
  110. },
  111. 'tableReference': {
  112. 'datasetId': dataset_id,
  113. 'projectId': project_id,
  114. 'tableId': table_id
  115. }
  116. }
  117. try:
  118. table_req = big_query.tables().patch(
  119. projectId=project_id,
  120. datasetId=dataset_id,
  121. tableId=table_id,
  122. body=body)
  123. res = table_req.execute(num_retries=NUM_RETRIES)
  124. print('Successfully patched %s "%s"' % (res['kind'], res['id']))
  125. except HttpError as http_error:
  126. print('Error in creating table: %s. Err: %s' % (table_id, http_error))
  127. is_success = False
  128. return is_success
  129. def insert_rows(big_query, project_id, dataset_id, table_id, rows_list):
  130. is_success = True
  131. body = {'rows': rows_list}
  132. try:
  133. insert_req = big_query.tabledata().insertAll(
  134. projectId=project_id,
  135. datasetId=dataset_id,
  136. tableId=table_id,
  137. body=body)
  138. res = insert_req.execute(num_retries=NUM_RETRIES)
  139. if res.get('insertErrors', None):
  140. print('Error inserting rows! Response: %s' % res)
  141. is_success = False
  142. except HttpError as http_error:
  143. print('Error inserting rows to the table %s' % table_id)
  144. is_success = False
  145. return is_success
  146. def sync_query_job(big_query, project_id, query, timeout=5000):
  147. query_data = {'query': query, 'timeoutMs': timeout}
  148. query_job = None
  149. try:
  150. query_job = big_query.jobs().query(
  151. projectId=project_id,
  152. body=query_data).execute(num_retries=NUM_RETRIES)
  153. except HttpError as http_error:
  154. print('Query execute job failed with error: %s' % http_error)
  155. print(http_error.content)
  156. return query_job
# Note: the table_schema arguments accepted above are lists of
# (column name, column type, description) tuples.
  158. def make_row(unique_row_id, row_values_dict):
  159. """row_values_dict is a dictionary of column name and column value.
  160. """
  161. return {'insertId': unique_row_id, 'json': row_values_dict}