tables.proto 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. // Copyright 2020 Google LLC
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // See the License for the specific language governing permissions and
  13. // limitations under the License.
  14. syntax = "proto3";
  15. package google.cloud.automl.v1beta1;
  16. import "google/cloud/automl/v1beta1/classification.proto";
  17. import "google/cloud/automl/v1beta1/column_spec.proto";
  18. import "google/cloud/automl/v1beta1/data_items.proto";
  19. import "google/cloud/automl/v1beta1/data_stats.proto";
  20. import "google/cloud/automl/v1beta1/ranges.proto";
  21. import "google/cloud/automl/v1beta1/regression.proto";
  22. import "google/cloud/automl/v1beta1/temporal.proto";
  23. import "google/protobuf/struct.proto";
  24. import "google/protobuf/timestamp.proto";
  25. import "google/api/annotations.proto";
  26. option go_package = "google.golang.org/genproto/googleapis/cloud/automl/v1beta1;automl";
  27. option java_multiple_files = true;
  28. option java_package = "com.google.cloud.automl.v1beta1";
  29. option php_namespace = "Google\\Cloud\\AutoMl\\V1beta1";
  30. option ruby_package = "Google::Cloud::AutoML::V1beta1";
  31. // Metadata for a dataset used for AutoML Tables.
  32. message TablesDatasetMetadata {
  33. // Output only. The table_spec_id of the primary table of this dataset.
  34. string primary_table_spec_id = 1;
  35. // column_spec_id of the primary table's column that should be used as the
  36. // training & prediction target.
  37. // This column must be non-nullable and have one of the following data types
  38. // (otherwise model creation will error):
  39. //
  40. // * CATEGORY
  41. //
  42. // * FLOAT64
  43. //
  44. // If the type is CATEGORY, only up to
  45. // 100 unique values may exist in that column across all rows.
  46. //
  47. // NOTE: Updates of this field will instantly affect any other users
  48. // concurrently working with the dataset.
  49. string target_column_spec_id = 2;
  50. // column_spec_id of the primary table's column that should be used as the
  51. // weight column, i.e. the higher the value the more important the row will be
  52. // during model training.
  53. // Required type: FLOAT64.
  54. // Allowed values: 0 to 10000, inclusive on both ends; 0 means the row is
  55. // ignored for training.
  56. // If not set, all rows are assumed to have an equal weight of 1.
  57. // NOTE: Updates of this field will instantly affect any other users
  58. // concurrently working with the dataset.
  59. string weight_column_spec_id = 3;
  60. // column_spec_id of the primary table column which specifies a possible ML
  61. // use of the row, i.e. the column will be used to split the rows into TRAIN,
  62. // VALIDATE and TEST sets.
  63. // Required type: STRING.
  64. // This column, if set, must either have all of `TRAIN`, `VALIDATE`, `TEST`
  65. // among its values, or only have `TEST`, `UNASSIGNED` values. In the latter
  66. // case the rows with `UNASSIGNED` value will be assigned by AutoML. Note
  67. // that if a given ML use distribution makes it impossible to create a "good"
  68. // model, that call will error describing the issue.
  69. // If both this column_spec_id and primary table's time_column_spec_id are not
  70. // set, then all rows are treated as `UNASSIGNED`.
  71. // NOTE: Updates of this field will instantly affect any other users
  72. // concurrently working with the dataset.
  73. string ml_use_column_spec_id = 4;
  74. // Output only. Correlations between
  75. //
  76. // [TablesDatasetMetadata.target_column_spec_id][google.cloud.automl.v1beta1.TablesDatasetMetadata.target_column_spec_id],
  77. // and other columns of the
  78. //
  79. // [TablesDatasetMetadata.primary_table][google.cloud.automl.v1beta1.TablesDatasetMetadata.primary_table_spec_id].
  80. // Only set if the target column is set. Mapping from other column spec id to
  81. // its CorrelationStats with the target column.
  82. // This field may be stale; see the stats_update_time field for
  83. // the timestamp at which these stats were last updated.
  84. map<string, CorrelationStats> target_column_correlations = 6;
  85. // Output only. The most recent timestamp when target_column_correlations
  86. // field and all descendant ColumnSpec.data_stats and
  87. // ColumnSpec.top_correlated_columns fields were last (re-)generated. Any
  88. // changes that happened to the dataset afterwards are not reflected in these
  89. // fields' values. The regeneration happens in the background on a best-effort
  90. // basis.
  91. google.protobuf.Timestamp stats_update_time = 7;
  92. }
  93. // Model metadata specific to AutoML Tables.
  94. message TablesModelMetadata {
  95. // Additional optimization objective configuration. Required for
  96. // `MAXIMIZE_PRECISION_AT_RECALL` and `MAXIMIZE_RECALL_AT_PRECISION`,
  97. // otherwise unused.
  98. oneof additional_optimization_objective_config {
  99. // Required when optimization_objective is "MAXIMIZE_PRECISION_AT_RECALL".
  100. // Must be between 0 and 1, inclusive.
  101. float optimization_objective_recall_value = 17;
  102. // Required when optimization_objective is "MAXIMIZE_RECALL_AT_PRECISION".
  103. // Must be between 0 and 1, inclusive.
  104. float optimization_objective_precision_value = 18;
  105. }
  106. // Column spec of the dataset's primary table's column the model is
  107. // predicting. Snapshotted when model creation started.
  108. // Only 3 fields are used:
  109. // * name - May be set on CreateModel; if it is not, the ColumnSpec
  110. //   corresponding to the current target_column_spec_id of the dataset
  111. //   the model is trained from is used.
  112. //   If neither is set, CreateModel will error.
  113. // * display_name - Output only.
  114. // * data_type - Output only.
  115. ColumnSpec target_column_spec = 2;
  116. // Column specs of the dataset's primary table's columns, on which
  117. // the model is trained and which are used as the input for predictions.
  118. // The
  119. //
  120. // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  121. // as well as, according to dataset's state upon model creation,
  122. //
  123. // [weight_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.weight_column_spec_id],
  124. // and
  125. //
  126. // [ml_use_column][google.cloud.automl.v1beta1.TablesDatasetMetadata.ml_use_column_spec_id]
  127. // must never be included here.
  128. //
  129. // Only 3 fields are used:
  130. //
  131. // * name - May be set on CreateModel; if set, only the columns specified are
  132. //   used, otherwise all primary table's columns (except the ones listed
  133. //   above) are used for the training and prediction input.
  134. //
  135. // * display_name - Output only.
  136. //
  137. // * data_type - Output only.
  138. repeated ColumnSpec input_feature_column_specs = 3;
  139. // Objective function the model is optimizing towards. The training process
  140. // creates a model that maximizes/minimizes the value of the objective
  141. // function over the validation set.
  142. //
  143. // The supported optimization objectives depend on the prediction type.
  144. // If the field is not set, a default objective function is used.
  145. //
  146. // CLASSIFICATION_BINARY:
  147. // "MAXIMIZE_AU_ROC" (default) - Maximize the area under the receiver
  148. // operating characteristic (ROC) curve.
  149. // "MINIMIZE_LOG_LOSS" - Minimize log loss.
  150. // "MAXIMIZE_AU_PRC" - Maximize the area under the precision-recall curve.
  151. // "MAXIMIZE_PRECISION_AT_RECALL" - Maximize precision for a specified
  152. // recall value.
  153. // "MAXIMIZE_RECALL_AT_PRECISION" - Maximize recall for a specified
  154. // precision value.
  155. //
  156. // CLASSIFICATION_MULTI_CLASS:
  157. // "MINIMIZE_LOG_LOSS" (default) - Minimize log loss.
  158. //
  159. //
  160. // REGRESSION:
  161. // "MINIMIZE_RMSE" (default) - Minimize root-mean-squared error (RMSE).
  162. // "MINIMIZE_MAE" - Minimize mean-absolute error (MAE).
  163. // "MINIMIZE_RMSLE" - Minimize root-mean-squared log error (RMSLE).
  164. string optimization_objective = 4;
  165. // Output only. Auxiliary information for each of the
  166. // input_feature_column_specs with respect to this particular model.
  167. repeated TablesModelColumnInfo tables_model_column_info = 5;
  168. // Required. The train budget of creating this model, expressed in milli node
  169. // hours, i.e. a value of 1,000 in this field means 1 node hour.
  170. //
  171. // The training cost of the model will not exceed this budget. The final cost
  172. // will be attempted to be close to the budget, though may end up being (even)
  173. // noticeably smaller - at the backend's discretion. This especially may
  174. // happen when further model training ceases to provide any improvements.
  175. //
  176. // If the budget is set to a value known to be insufficient to train a
  177. // model for the given dataset, the training won't be attempted and
  178. // will error.
  179. //
  180. // The train budget must be between 1,000 and 72,000 milli node hours,
  181. // inclusive.
  182. int64 train_budget_milli_node_hours = 6;
  183. // Output only. The actual training cost of the model, expressed in milli
  184. // node hours, i.e. a value of 1,000 in this field means 1 node hour. Guaranteed
  185. // to not exceed the train budget.
  186. int64 train_cost_milli_node_hours = 7;
  187. // Use the entire training budget. This disables the early stopping feature.
  188. // By default, the early stopping feature is enabled, which means that AutoML
  189. // Tables might stop training before the entire training budget has been used.
  190. bool disable_early_stopping = 12;
  191. }
  192. // Contains annotation details specific to Tables.
  193. message TablesAnnotation {
  194. // Output only. A confidence estimate between 0.0 and 1.0, inclusive. A higher
  195. // value means greater confidence in the returned value.
  196. // For
  197. //
  198. // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  199. // of FLOAT64 data type the score is not populated.
  200. float score = 1;
  201. // Output only. Only populated when
  202. //
  203. // [target_column_spec][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec]
  204. // has FLOAT64 data type. An interval that has a 95% chance of containing
  205. // the exactly correct target value.
  206. DoubleRange prediction_interval = 4;
  207. // The predicted value of the row's
  208. //
  209. // [target_column][google.cloud.automl.v1beta1.TablesModelMetadata.target_column_spec].
  210. // The value depends on the column's DataType:
  211. //
  212. // * CATEGORY - the predicted (with the above confidence `score`) CATEGORY
  213. // value.
  214. //
  215. // * FLOAT64 - the predicted (with the above `prediction_interval`) FLOAT64 value.
  216. google.protobuf.Value value = 2;
  217. // Output only. Auxiliary information for each of the model's
  218. //
  219. // [input_feature_column_specs][google.cloud.automl.v1beta1.TablesModelMetadata.input_feature_column_specs]
  220. // with respect to this particular prediction.
  221. // If no other fields than
  222. //
  223. // [column_spec_name][google.cloud.automl.v1beta1.TablesModelColumnInfo.column_spec_name]
  224. // and
  225. //
  226. // [column_display_name][google.cloud.automl.v1beta1.TablesModelColumnInfo.column_display_name]
  227. // would be populated, then this whole field is not.
  228. repeated TablesModelColumnInfo tables_model_column_info = 3;
  229. // Output only. Stores the prediction score for the baseline example, which
  230. // is defined as the example with all values set to their baseline values.
  231. // This is used as part of the Sampled Shapley explanation of the model's
  232. // prediction. This field is populated only when feature importance is
  233. // requested. For regression models, this holds the baseline prediction for
  234. // the baseline example. For classification models, this holds the baseline
  235. // prediction for the baseline example for the argmax class.
  236. float baseline_score = 5;
  237. }
  238. // Information specific to a given column and Tables Model, in the context
  239. // of the Model and the predictions created by it.
  240. message TablesModelColumnInfo {
  241. // Output only. The name of the ColumnSpec describing the column. Not
  242. // populated when this proto is output to BigQuery.
  243. string column_spec_name = 1;
  244. // Output only. The display name of the column (same as the display_name of
  245. // its ColumnSpec).
  246. string column_display_name = 2;
  247. // Output only. When given as part of a Model (always populated):
  248. // Measurement of how much the correctness of model predictions on the TEST
  249. // data depends on values in this column. A value between 0 and 1; higher means
  250. // higher influence. These values are normalized - for all input feature
  251. // columns of a given model they add up to 1.
  252. //
  253. // When given back by Predict (populated iff
  254. // [feature_importance
  255. // param][google.cloud.automl.v1beta1.PredictRequest.params] is set) or Batch
  256. // Predict (populated iff
  257. // [feature_importance][google.cloud.automl.v1beta1.PredictRequest.params]
  258. // param is set):
  259. // Measurement of how impactful for the prediction returned for the given row
  260. // the value in this column was. Specifically, the feature importance
  261. // specifies the marginal contribution that the feature made to the prediction
  262. // score compared to the baseline score. These values are computed using the
  263. // Sampled Shapley method.
  264. float feature_importance = 3;
  265. }