// job_service.proto
// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.ml.v1;

import "google/api/annotations.proto";
import "google/api/auth.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";

option go_package = "google.golang.org/genproto/googleapis/cloud/ml/v1;ml";
option java_multiple_files = true;
option java_outer_classname = "JobServiceProto";
option java_package = "com.google.cloud.ml.api.v1";

// Copyright 2017 Google Inc. All Rights Reserved.
//
// Proto file for the Google Cloud Machine Learning Engine.
// Describes the 'job service' to manage training and prediction jobs.
  28. // Service to create and manage training and batch prediction jobs.
  29. service JobService {
  30. // Creates a training or a batch prediction job.
  31. rpc CreateJob(CreateJobRequest) returns (Job) {
  32. option (google.api.http) = {
  33. post: "/v1/{parent=projects/*}/jobs"
  34. body: "job"
  35. };
  36. }
  37. // Lists the jobs in the project.
  38. rpc ListJobs(ListJobsRequest) returns (ListJobsResponse) {
  39. option (google.api.http) = {
  40. get: "/v1/{parent=projects/*}/jobs"
  41. };
  42. }
  43. // Describes a job.
  44. rpc GetJob(GetJobRequest) returns (Job) {
  45. option (google.api.http) = {
  46. get: "/v1/{name=projects/*/jobs/*}"
  47. };
  48. }
  49. // Cancels a running job.
  50. rpc CancelJob(CancelJobRequest) returns (google.protobuf.Empty) {
  51. option (google.api.http) = {
  52. post: "/v1/{name=projects/*/jobs/*}:cancel"
  53. body: "*"
  54. };
  55. }
  56. }
  57. // Represents input parameters for a training job.
  58. message TrainingInput {
  59. // A scale tier is an abstract representation of the resources Cloud ML
  60. // will allocate to a training job. When selecting a scale tier for your
  61. // training job, you should consider the size of your training dataset and
  62. // the complexity of your model. As the tiers increase, virtual machines are
  63. // added to handle your job, and the individual machines in the cluster
  64. // generally have more memory and greater processing power than they do at
  65. // lower tiers. The number of training units charged per hour of processing
  66. // increases as tiers get more advanced. Refer to the
  67. // [pricing guide](/ml/pricing) for more details. Note that in addition to
  68. // incurring costs, your use of training resources is constrained by the
  69. // [quota policy](/ml/quota).
  70. enum ScaleTier {
  71. // A single worker instance. This tier is suitable for learning how to use
  72. // Cloud ML, and for experimenting with new models using small datasets.
  73. BASIC = 0;
  74. // Many workers and a few parameter servers.
  75. STANDARD_1 = 1;
  76. // A large number of workers with many parameter servers.
  77. PREMIUM_1 = 3;
  78. // A single worker instance [with a GPU](ml/docs/how-tos/using-gpus).
  79. BASIC_GPU = 6;
  80. // The CUSTOM tier is not a set tier, but rather enables you to use your
  81. // own cluster specification. When you use this tier, set values to
  82. // configure your processing cluster according to these guidelines:
  83. //
  84. // * You _must_ set `TrainingInput.masterType` to specify the type
  85. // of machine to use for your master node. This is the only required
  86. // setting.
  87. //
  88. // * You _may_ set `TrainingInput.workerCount` to specify the number of
  89. // workers to use. If you specify one or more workers, you _must_ also
  90. // set `TrainingInput.workerType` to specify the type of machine to use
  91. // for your worker nodes.
  92. //
  93. // * You _may_ set `TrainingInput.parameterServerCount` to specify the
  94. // number of parameter servers to use. If you specify one or more
  95. // parameter servers, you _must_ also set
  96. // `TrainingInput.parameterServerType` to specify the type of machine to
  97. // use for your parameter servers.
  98. //
  99. // Note that all of your workers must use the same machine type, which can
  100. // be different from your parameter server type and master type. Your
  101. // parameter servers must likewise use the same machine type, which can be
  102. // different from your worker type and master type.
  103. CUSTOM = 5;
  104. }
  105. // Required. Specifies the machine types, the number of replicas for workers
  106. // and parameter servers.
  107. ScaleTier scale_tier = 1;
  108. // Optional. Specifies the type of virtual machine to use for your training
  109. // job's master worker.
  110. //
  111. // The following types are supported:
  112. //
  113. // <dl>
  114. // <dt>standard</dt>
  115. // <dd>
  116. // A basic machine configuration suitable for training simple models with
  117. // small to moderate datasets.
  118. // </dd>
  119. // <dt>large_model</dt>
  120. // <dd>
  121. // A machine with a lot of memory, specially suited for parameter servers
  122. // when your model is large (having many hidden layers or layers with very
  123. // large numbers of nodes).
  124. // </dd>
  125. // <dt>complex_model_s</dt>
  126. // <dd>
  127. // A machine suitable for the master and workers of the cluster when your
  128. // model requires more computation than the standard machine can handle
  129. // satisfactorily.
  130. // </dd>
  131. // <dt>complex_model_m</dt>
  132. // <dd>
  133. // A machine with roughly twice the number of cores and roughly double the
  134. // memory of <code suppresswarning="true">complex_model_s</code>.
  135. // </dd>
  136. // <dt>complex_model_l</dt>
  137. // <dd>
  138. // A machine with roughly twice the number of cores and roughly double the
  139. // memory of <code suppresswarning="true">complex_model_m</code>.
  140. // </dd>
  141. // <dt>standard_gpu</dt>
  142. // <dd>
  143. // A machine equivalent to <code suppresswarning="true">standard</code> that
  144. // also includes a
  145. // <a href="ml/docs/how-tos/using-gpus">
  146. // GPU that you can use in your trainer</a>.
  147. // </dd>
  148. // <dt>complex_model_m_gpu</dt>
  149. // <dd>
  150. // A machine equivalent to
  151. // <code suppresswarning="true">coplex_model_m</code> that also includes
  152. // four GPUs.
  153. // </dd>
  154. // </dl>
  155. //
  156. // You must set this value when `scaleTier` is set to `CUSTOM`.
  157. string master_type = 2;
  158. // Optional. Specifies the type of virtual machine to use for your training
  159. // job's worker nodes.
  160. //
  161. // The supported values are the same as those described in the entry for
  162. // `masterType`.
  163. //
  164. // This value must be present when `scaleTier` is set to `CUSTOM` and
  165. // `workerCount` is greater than zero.
  166. string worker_type = 3;
  167. // Optional. Specifies the type of virtual machine to use for your training
  168. // job's parameter server.
  169. //
  170. // The supported values are the same as those described in the entry for
  171. // `master_type`.
  172. //
  173. // This value must be present when `scaleTier` is set to `CUSTOM` and
  174. // `parameter_server_count` is greater than zero.
  175. string parameter_server_type = 4;
  176. // Optional. The number of worker replicas to use for the training job. Each
  177. // replica in the cluster will be of the type specified in `worker_type`.
  178. //
  179. // This value can only be used when `scale_tier` is set to `CUSTOM`. If you
  180. // set this value, you must also set `worker_type`.
  181. int64 worker_count = 5;
  182. // Optional. The number of parameter server replicas to use for the training
  183. // job. Each replica in the cluster will be of the type specified in
  184. // `parameter_server_type`.
  185. //
  186. // This value can only be used when `scale_tier` is set to `CUSTOM`.If you
  187. // set this value, you must also set `parameter_server_type`.
  188. int64 parameter_server_count = 6;
  189. // Required. The Google Cloud Storage location of the packages with
  190. // the training program and any additional dependencies.
  191. repeated string package_uris = 7;
  192. // Required. The Python module name to run after installing the packages.
  193. string python_module = 8;
  194. // Optional. Command line arguments to pass to the program.
  195. repeated string args = 10;
  196. // Optional. The set of Hyperparameters to tune.
  197. HyperparameterSpec hyperparameters = 12;
  198. // Required. The Google Compute Engine region to run the training job in.
  199. string region = 14;
  200. // Optional. A Google Cloud Storage path in which to store training outputs
  201. // and other data needed for training. This path is passed to your TensorFlow
  202. // program as the 'job_dir' command-line argument. The benefit of specifying
  203. // this field is that Cloud ML validates the path for use in training.
  204. string job_dir = 16;
  205. // Optional. The Google Cloud ML runtime version to use for training. If not
  206. // set, Google Cloud ML will choose the latest stable version.
  207. string runtime_version = 15;
  208. }
  209. // Represents a set of hyperparameters to optimize.
  210. message HyperparameterSpec {
  211. // The available types of optimization goals.
  212. enum GoalType {
  213. // Goal Type will default to maximize.
  214. GOAL_TYPE_UNSPECIFIED = 0;
  215. // Maximize the goal metric.
  216. MAXIMIZE = 1;
  217. // Minimize the goal metric.
  218. MINIMIZE = 2;
  219. }
  220. // Required. The type of goal to use for tuning. Available types are
  221. // `MAXIMIZE` and `MINIMIZE`.
  222. //
  223. // Defaults to `MAXIMIZE`.
  224. GoalType goal = 1;
  225. // Required. The set of parameters to tune.
  226. repeated ParameterSpec params = 2;
  227. // Optional. How many training trials should be attempted to optimize
  228. // the specified hyperparameters.
  229. //
  230. // Defaults to one.
  231. int32 max_trials = 3;
  232. // Optional. The number of training trials to run concurrently.
  233. // You can reduce the time it takes to perform hyperparameter tuning by adding
  234. // trials in parallel. However, each trail only benefits from the information
  235. // gained in completed trials. That means that a trial does not get access to
  236. // the results of trials running at the same time, which could reduce the
  237. // quality of the overall optimization.
  238. //
  239. // Each trial will use the same scale tier and machine types.
  240. //
  241. // Defaults to one.
  242. int32 max_parallel_trials = 4;
  243. // Optional. The Tensorflow summary tag name to use for optimizing trials. For
  244. // current versions of Tensorflow, this tag name should exactly match what is
  245. // shown in Tensorboard, including all scopes. For versions of Tensorflow
  246. // prior to 0.12, this should be only the tag passed to tf.Summary.
  247. // By default, "training/hptuning/metric" will be used.
  248. string hyperparameter_metric_tag = 5;
  249. }
  250. // Represents a single hyperparameter to optimize.
  251. message ParameterSpec {
  252. // The type of the parameter.
  253. enum ParameterType {
  254. // You must specify a valid type. Using this unspecified type will result in
  255. // an error.
  256. PARAMETER_TYPE_UNSPECIFIED = 0;
  257. // Type for real-valued parameters.
  258. DOUBLE = 1;
  259. // Type for integral parameters.
  260. INTEGER = 2;
  261. // The parameter is categorical, with a value chosen from the categories
  262. // field.
  263. CATEGORICAL = 3;
  264. // The parameter is real valued, with a fixed set of feasible points. If
  265. // `type==DISCRETE`, feasible_points must be provided, and
  266. // {`min_value`, `max_value`} will be ignored.
  267. DISCRETE = 4;
  268. }
  269. // The type of scaling that should be applied to this parameter.
  270. enum ScaleType {
  271. // By default, no scaling is applied.
  272. NONE = 0;
  273. // Scales the feasible space to (0, 1) linearly.
  274. UNIT_LINEAR_SCALE = 1;
  275. // Scales the feasible space logarithmically to (0, 1). The entire feasible
  276. // space must be strictly positive.
  277. UNIT_LOG_SCALE = 2;
  278. // Scales the feasible space "reverse" logarithmically to (0, 1). The result
  279. // is that values close to the top of the feasible space are spread out more
  280. // than points near the bottom. The entire feasible space must be strictly
  281. // positive.
  282. UNIT_REVERSE_LOG_SCALE = 3;
  283. }
  284. // Required. The parameter name must be unique amongst all ParameterConfigs in
  285. // a HyperparameterSpec message. E.g., "learning_rate".
  286. string parameter_name = 1;
  287. // Required. The type of the parameter.
  288. ParameterType type = 4;
  289. // Required if type is `DOUBLE` or `INTEGER`. This field
  290. // should be unset if type is `CATEGORICAL`. This value should be integers if
  291. // type is INTEGER.
  292. double min_value = 2;
  293. // Required if typeis `DOUBLE` or `INTEGER`. This field
  294. // should be unset if type is `CATEGORICAL`. This value should be integers if
  295. // type is `INTEGER`.
  296. double max_value = 3;
  297. // Required if type is `CATEGORICAL`. The list of possible categories.
  298. repeated string categorical_values = 5;
  299. // Required if type is `DISCRETE`.
  300. // A list of feasible points.
  301. // The list should be in strictly increasing order. For instance, this
  302. // parameter might have possible settings of 1.5, 2.5, and 4.0. This list
  303. // should not contain more than 1,000 values.
  304. repeated double discrete_values = 6;
  305. // Optional. How the parameter should be scaled to the hypercube.
  306. // Leave unset for categorical parameters.
  307. // Some kind of scaling is strongly recommended for real or integral
  308. // parameters (e.g., `UNIT_LINEAR_SCALE`).
  309. ScaleType scale_type = 7;
  310. }
  311. // Represents the result of a single hyperparameter tuning trial from a
  312. // training job. The TrainingOutput object that is returned on successful
  313. // completion of a training job with hyperparameter tuning includes a list
  314. // of HyperparameterOutput objects, one for each successful trial.
  315. message HyperparameterOutput {
  316. // An observed value of a metric.
  317. message HyperparameterMetric {
  318. // The global training step for this metric.
  319. int64 training_step = 1;
  320. // The objective value at this training step.
  321. double objective_value = 2;
  322. }
  323. // The trial id for these results.
  324. string trial_id = 1;
  325. // The hyperparameters given to this trial.
  326. map<string, string> hyperparameters = 2;
  327. // The final objective metric seen for this trial.
  328. HyperparameterMetric final_metric = 3;
  329. // All recorded object metrics for this trial.
  330. repeated HyperparameterMetric all_metrics = 4;
  331. }
  332. // Represents results of a training job. Output only.
  333. message TrainingOutput {
  334. // The number of hyperparameter tuning trials that completed successfully.
  335. // Only set for hyperparameter tuning jobs.
  336. int64 completed_trial_count = 1;
  337. // Results for individual Hyperparameter trials.
  338. // Only set for hyperparameter tuning jobs.
  339. repeated HyperparameterOutput trials = 2;
  340. // The amount of ML units consumed by the job.
  341. double consumed_ml_units = 3;
  342. // Whether this job is a hyperparameter tuning job.
  343. bool is_hyperparameter_tuning_job = 4;
  344. }
  345. // Represents input parameters for a prediction job.
  346. message PredictionInput {
  347. // The format used to separate data instances in the source files.
  348. enum DataFormat {
  349. // Unspecified format.
  350. DATA_FORMAT_UNSPECIFIED = 0;
  351. // The source file is a text file with instances separated by the
  352. // new-line character.
  353. TEXT = 1;
  354. // The source file is a TFRecord file.
  355. TF_RECORD = 2;
  356. // The source file is a GZIP-compressed TFRecord file.
  357. TF_RECORD_GZIP = 3;
  358. }
  359. // Required. The model or the version to use for prediction.
  360. oneof model_version {
  361. // Use this field if you want to use the default version for the specified
  362. // model. The string must use the following format:
  363. //
  364. // `"projects/<var>[YOUR_PROJECT]</var>/models/<var>[YOUR_MODEL]</var>"`
  365. string model_name = 1;
  366. // Use this field if you want to specify a version of the model to use. The
  367. // string is formatted the same way as `model_version`, with the addition
  368. // of the version information:
  369. //
  370. // `"projects/<var>[YOUR_PROJECT]</var>/models/<var>YOUR_MODEL/versions/<var>[YOUR_VERSION]</var>"`
  371. string version_name = 2;
  372. // Use this field if you want to specify a Google Cloud Storage path for
  373. // the model to use.
  374. string uri = 9;
  375. }
  376. // Required. The format of the input data files.
  377. DataFormat data_format = 3;
  378. // Required. The Google Cloud Storage location of the input data files.
  379. // May contain wildcards.
  380. repeated string input_paths = 4;
  381. // Required. The output Google Cloud Storage location.
  382. string output_path = 5;
  383. // Optional. The maximum number of workers to be used for parallel processing.
  384. // Defaults to 10 if not specified.
  385. int64 max_worker_count = 6;
  386. // Required. The Google Compute Engine region to run the prediction job in.
  387. string region = 7;
  388. // Optional. The Google Cloud ML runtime version to use for this batch
  389. // prediction. If not set, Google Cloud ML will pick the runtime version used
  390. // during the CreateVersion request for this model version, or choose the
  391. // latest stable version when model version information is not available
  392. // such as when the model is specified by uri.
  393. string runtime_version = 8;
  394. }
  395. // Represents results of a prediction job.
  396. message PredictionOutput {
  397. // The output Google Cloud Storage location provided at the job creation time.
  398. string output_path = 1;
  399. // The number of generated predictions.
  400. int64 prediction_count = 2;
  401. // The number of data instances which resulted in errors.
  402. int64 error_count = 3;
  403. // Node hours used by the batch prediction job.
  404. double node_hours = 4;
  405. }
  406. // Represents a training or prediction job.
  407. message Job {
  408. // Describes the job state.
  409. enum State {
  410. // The job state is unspecified.
  411. STATE_UNSPECIFIED = 0;
  412. // The job has been just created and processing has not yet begun.
  413. QUEUED = 1;
  414. // The service is preparing to run the job.
  415. PREPARING = 2;
  416. // The job is in progress.
  417. RUNNING = 3;
  418. // The job completed successfully.
  419. SUCCEEDED = 4;
  420. // The job failed.
  421. // `error_message` should contain the details of the failure.
  422. FAILED = 5;
  423. // The job is being cancelled.
  424. // `error_message` should describe the reason for the cancellation.
  425. CANCELLING = 6;
  426. // The job has been cancelled.
  427. // `error_message` should describe the reason for the cancellation.
  428. CANCELLED = 7;
  429. }
  430. // Required. The user-specified id of the job.
  431. string job_id = 1;
  432. // Required. Parameters to create a job.
  433. oneof input {
  434. // Input parameters to create a training job.
  435. TrainingInput training_input = 2;
  436. // Input parameters to create a prediction job.
  437. PredictionInput prediction_input = 3;
  438. }
  439. // Output only. When the job was created.
  440. google.protobuf.Timestamp create_time = 4;
  441. // Output only. When the job processing was started.
  442. google.protobuf.Timestamp start_time = 5;
  443. // Output only. When the job processing was completed.
  444. google.protobuf.Timestamp end_time = 6;
  445. // Output only. The detailed state of a job.
  446. State state = 7;
  447. // Output only. The details of a failure or a cancellation.
  448. string error_message = 8;
  449. // Output only. The current result of the job.
  450. oneof output {
  451. // The current training job result.
  452. TrainingOutput training_output = 9;
  453. // The current prediction job result.
  454. PredictionOutput prediction_output = 10;
  455. }
  456. }
  457. // Request message for the CreateJob method.
  458. message CreateJobRequest {
  459. // Required. The project name.
  460. //
  461. // Authorization: requires `Editor` role on the specified project.
  462. string parent = 1;
  463. // Required. The job to create.
  464. Job job = 2;
  465. }
  466. // Request message for the ListJobs method.
  467. message ListJobsRequest {
  468. // Required. The name of the project for which to list jobs.
  469. //
  470. // Authorization: requires `Viewer` role on the specified project.
  471. string parent = 1;
  472. // Optional. Specifies the subset of jobs to retrieve.
  473. string filter = 2;
  474. // Optional. A page token to request the next page of results.
  475. //
  476. // You get the token from the `next_page_token` field of the response from
  477. // the previous call.
  478. string page_token = 4;
  479. // Optional. The number of jobs to retrieve per "page" of results. If there
  480. // are more remaining results than this number, the response message will
  481. // contain a valid value in the `next_page_token` field.
  482. //
  483. // The default value is 20, and the maximum page size is 100.
  484. int32 page_size = 5;
  485. }
  486. // Response message for the ListJobs method.
  487. message ListJobsResponse {
  488. // The list of jobs.
  489. repeated Job jobs = 1;
  490. // Optional. Pass this token as the `page_token` field of the request for a
  491. // subsequent call.
  492. string next_page_token = 2;
  493. }
  494. // Request message for the GetJob method.
  495. message GetJobRequest {
  496. // Required. The name of the job to get the description of.
  497. //
  498. // Authorization: requires `Viewer` role on the parent project.
  499. string name = 1;
  500. }
  501. // Request message for the CancelJob method.
  502. message CancelJobRequest {
  503. // Required. The name of the job to cancel.
  504. //
  505. // Authorization: requires `Editor` role on the parent project.
  506. string name = 1;
  507. }