video_intelligence.proto

// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.videointelligence.v1p3beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";

option csharp_namespace = "Google.Cloud.VideoIntelligence.V1P3Beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/videointelligence/v1p3beta1;videointelligence";
option java_multiple_files = true;
option java_outer_classname = "VideoIntelligenceServiceProto";
option java_package = "com.google.cloud.videointelligence.v1p3beta1";
option php_namespace = "Google\\Cloud\\VideoIntelligence\\V1p3beta1";

// Service that implements the Video Intelligence API.
service VideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs asynchronous video annotation. Progress and results can be
  // retrieved through the `google.longrunning.Operations` interface.
  // `Operation.metadata` contains `AnnotateVideoProgress` (progress).
  // `Operation.response` contains `AnnotateVideoResponse` (results).
  rpc AnnotateVideo(AnnotateVideoRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1p3beta1/videos:annotate"
      body: "*"
    };
    option (google.api.method_signature) = "input_uri,features";
    option (google.longrunning.operation_info) = {
      response_type: "AnnotateVideoResponse"
      metadata_type: "AnnotateVideoProgress"
    };
  }
}

// Service that implements the streaming Video Intelligence API.
service StreamingVideoIntelligenceService {
  option (google.api.default_host) = "videointelligence.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Performs video annotation with bidirectional streaming: emitting results
  // while sending video/audio bytes.
  // This method is only available via the gRPC API (not REST).
  rpc StreamingAnnotateVideo(stream StreamingAnnotateVideoRequest)
      returns (stream StreamingAnnotateVideoResponse) {}
}

// Video annotation request.
message AnnotateVideoRequest {
  // Input video location. Currently, only
  // [Cloud Storage](https://cloud.google.com/storage/) URIs are
  // supported. URIs must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints). To identify
  // multiple videos, a video URI may include wildcards in the `object-id`.
  // Supported wildcards: '*' to match 0 or more characters;
  // '?' to match 1 character. If unset, the input video should be embedded
  // in the request as `input_content`. If set, `input_content` must be unset.
  string input_uri = 1;
  // The video data bytes.
  // If unset, the input video(s) should be specified via the `input_uri`.
  // If set, `input_uri` must be unset.
  bytes input_content = 6;
  // Required. Requested video annotation features.
  repeated Feature features = 2 [(google.api.field_behavior) = REQUIRED];
  // Additional video context and/or feature-specific parameters.
  VideoContext video_context = 3;
  // Optional. Location where the output (in JSON format) should be stored.
  // Currently, only [Cloud Storage](https://cloud.google.com/storage/)
  // URIs are supported. These must be specified in the following format:
  // `gs://bucket-id/object-id` (other URI formats return
  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For
  // more information, see [Request
  // URIs](https://cloud.google.com/storage/docs/request-endpoints).
  string output_uri = 4 [(google.api.field_behavior) = OPTIONAL];
  // Optional. Cloud region where annotation should take place. Supported cloud
  // regions are: `us-east1`, `us-west1`, `europe-west1`, `asia-east1`. If no
  // region is specified, the region will be determined based on video file
  // location.
  string location_id = 5 [(google.api.field_behavior) = OPTIONAL];
}
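
// Illustrative only: a minimal `AnnotateVideoRequest`, sketched as textproto.
// The bucket, object, and region values below are placeholders, not values
// defined in this file.
//
//   input_uri: "gs://example-bucket/example-video.mp4"  # placeholder URI
//   features: LABEL_DETECTION
//   features: SHOT_CHANGE_DETECTION
//   output_uri: "gs://example-bucket/output.json"       # placeholder URI
//   location_id: "us-east1"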

// Video context and/or feature-specific parameters.
message VideoContext {
  // Video segments to annotate. The segments may overlap and are not required
  // to be contiguous or span the whole video. If unspecified, each video is
  // treated as a single segment.
  repeated VideoSegment segments = 1;
  // Config for LABEL_DETECTION.
  LabelDetectionConfig label_detection_config = 2;
  // Config for SHOT_CHANGE_DETECTION.
  ShotChangeDetectionConfig shot_change_detection_config = 3;
  // Config for EXPLICIT_CONTENT_DETECTION.
  ExplicitContentDetectionConfig explicit_content_detection_config = 4;
  // Config for FACE_DETECTION.
  FaceDetectionConfig face_detection_config = 5;
  // Config for SPEECH_TRANSCRIPTION.
  SpeechTranscriptionConfig speech_transcription_config = 6;
  // Config for TEXT_DETECTION.
  TextDetectionConfig text_detection_config = 8;
  // Config for PERSON_DETECTION.
  PersonDetectionConfig person_detection_config = 11;
  // Config for OBJECT_TRACKING.
  ObjectTrackingConfig object_tracking_config = 13;
}

// Label detection mode.
enum LabelDetectionMode {
  // Unspecified.
  LABEL_DETECTION_MODE_UNSPECIFIED = 0;
  // Detect shot-level labels.
  SHOT_MODE = 1;
  // Detect frame-level labels.
  FRAME_MODE = 2;
  // Detect both shot-level and frame-level labels.
  SHOT_AND_FRAME_MODE = 3;
}

// Bucketized representation of likelihood.
enum Likelihood {
  // Unspecified likelihood.
  LIKELIHOOD_UNSPECIFIED = 0;
  // Very unlikely.
  VERY_UNLIKELY = 1;
  // Unlikely.
  UNLIKELY = 2;
  // Possible.
  POSSIBLE = 3;
  // Likely.
  LIKELY = 4;
  // Very likely.
  VERY_LIKELY = 5;
}

// Config for LABEL_DETECTION.
message LabelDetectionConfig {
  // What labels should be detected with LABEL_DETECTION, in addition to
  // video-level labels or segment-level labels.
  // If unspecified, defaults to `SHOT_MODE`.
  LabelDetectionMode label_detection_mode = 1;
  // Whether the video has been shot from a stationary (i.e., non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Should be used with `SHOT_AND_FRAME_MODE` enabled.
  bool stationary_camera = 2;
  // Model to use for label detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 3;
  // The confidence threshold used to filter labels from frame-level
  // detection. If not set, it defaults to 0.4. The valid range for this
  // threshold is [0.1, 0.9]. Any value outside this range will be clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float frame_confidence_threshold = 4;
  // The confidence threshold used to filter labels from video-level and
  // shot-level detections. If not set, it defaults to 0.3. The valid range
  // for this threshold is [0.1, 0.9]. Any value outside this range will be
  // clipped.
  // Note: For best results, use the default threshold. We will update the
  // default threshold every time we release a new model.
  float video_confidence_threshold = 5;
}
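
// Illustrative only: a `LabelDetectionConfig` sketch in textproto. The
// threshold values are arbitrary placeholders chosen within the documented
// [0.1, 0.9] range.
//
//   label_detection_mode: SHOT_AND_FRAME_MODE
//   stationary_camera: true
//   model: "builtin/latest"
//   frame_confidence_threshold: 0.5
//   video_confidence_threshold: 0.4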

// Streaming video annotation feature.
enum StreamingFeature {
  // Unspecified.
  STREAMING_FEATURE_UNSPECIFIED = 0;
  // Label detection. Detect objects, such as dog or flower.
  STREAMING_LABEL_DETECTION = 1;
  // Shot change detection.
  STREAMING_SHOT_CHANGE_DETECTION = 2;
  // Explicit content detection.
  STREAMING_EXPLICIT_CONTENT_DETECTION = 3;
  // Object detection and tracking.
  STREAMING_OBJECT_TRACKING = 4;
  // Action recognition based on AutoML model.
  STREAMING_AUTOML_ACTION_RECOGNITION = 23;
  // Video classification based on AutoML model.
  STREAMING_AUTOML_CLASSIFICATION = 21;
  // Object detection and tracking based on AutoML model.
  STREAMING_AUTOML_OBJECT_TRACKING = 22;
}

// Video annotation feature.
enum Feature {
  // Unspecified.
  FEATURE_UNSPECIFIED = 0;
  // Label detection. Detect objects, such as dog or flower.
  LABEL_DETECTION = 1;
  // Shot change detection.
  SHOT_CHANGE_DETECTION = 2;
  // Explicit content detection.
  EXPLICIT_CONTENT_DETECTION = 3;
  // Human face detection.
  FACE_DETECTION = 4;
  // Speech transcription.
  SPEECH_TRANSCRIPTION = 6;
  // OCR text detection and tracking.
  TEXT_DETECTION = 7;
  // Object detection and tracking.
  OBJECT_TRACKING = 9;
  // Logo detection, tracking, and recognition.
  LOGO_RECOGNITION = 12;
  // Celebrity recognition.
  CELEBRITY_RECOGNITION = 13;
  // Person detection.
  PERSON_DETECTION = 14;
}

// Config for SHOT_CHANGE_DETECTION.
message ShotChangeDetectionConfig {
  // Model to use for shot change detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for OBJECT_TRACKING.
message ObjectTrackingConfig {
  // Model to use for object tracking.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for EXPLICIT_CONTENT_DETECTION.
message ExplicitContentDetectionConfig {
  // Model to use for explicit content detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
}

// Config for FACE_DETECTION.
message FaceDetectionConfig {
  // Model to use for face detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 1;
  // Whether bounding boxes are included in the face annotation output.
  bool include_bounding_boxes = 2;
  // Whether to enable face attribute detection, such as glasses, dark_glasses,
  // mouth_open, etc. Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 5;
}
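
// Illustrative only: a `FaceDetectionConfig` sketch in textproto. Note that
// 'include_attributes' only takes effect when 'include_bounding_boxes' is
// true; the values below are placeholders.
//
//   model: "builtin/stable"
//   include_bounding_boxes: true
//   include_attributes: true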

// Config for PERSON_DETECTION.
message PersonDetectionConfig {
  // Whether bounding boxes are included in the person detection annotation
  // output.
  bool include_bounding_boxes = 1;
  // Whether to enable pose landmarks detection. Ignored if
  // 'include_bounding_boxes' is set to false.
  bool include_pose_landmarks = 2;
  // Whether to enable person attributes detection, such as cloth color (black,
  // blue, etc.), type (coat, dress, etc.), pattern (plain, floral, etc.), hair,
  // etc.
  // Ignored if 'include_bounding_boxes' is set to false.
  bool include_attributes = 3;
}

// Config for TEXT_DETECTION.
message TextDetectionConfig {
  // A language hint can be specified if the language to be detected is known a
  // priori. It can increase the accuracy of the detection. The language hint
  // must be a language code in BCP-47 format.
  //
  // Automatic language detection is performed if no hint is provided.
  repeated string language_hints = 1;
  // Model to use for text detection.
  // Supported values: "builtin/stable" (the default if unset) and
  // "builtin/latest".
  string model = 2;
}

// Video segment.
message VideoSegment {
  // Time-offset, relative to the beginning of the video,
  // corresponding to the start of the segment (inclusive).
  google.protobuf.Duration start_time_offset = 1;
  // Time-offset, relative to the beginning of the video,
  // corresponding to the end of the segment (inclusive).
  google.protobuf.Duration end_time_offset = 2;
}
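
// Illustrative only: a `VideoSegment` sketch in textproto. The offsets are
// `google.protobuf.Duration` values (seconds plus nanos); the 0s-90.5s range
// below is a placeholder.
//
//   start_time_offset { seconds: 0 }
//   end_time_offset { seconds: 90 nanos: 500000000 }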

// Video segment level annotation results for label detection.
message LabelSegment {
  // Video segment where a label was detected.
  VideoSegment segment = 1;
  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Video frame level annotation results for label detection.
message LabelFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;
  // Confidence that the label is accurate. Range: [0, 1].
  float confidence = 2;
}

// Detected entity from video analysis.
message Entity {
  // Opaque entity ID. Some IDs may be available in
  // [Google Knowledge Graph Search
  // API](https://developers.google.com/knowledge-graph/).
  string entity_id = 1;
  // Textual description, e.g., `Fixed-gear bicycle`.
  string description = 2;
  // Language code for `description` in BCP-47 format.
  string language_code = 3;
}

// Label annotation.
message LabelAnnotation {
  // Detected entity.
  Entity entity = 1;
  // Common categories for the detected entity.
  // For example, when the label is `Terrier`, the category is likely `dog`.
  // In some cases there might be more than one category; e.g., `Terrier` could
  // also be a `pet`.
  repeated Entity category_entities = 2;
  // All video segments where a label was detected.
  repeated LabelSegment segments = 3;
  // All video frames where a label was detected.
  repeated LabelFrame frames = 4;
}

// Video frame level annotation results for explicit content.
message ExplicitContentFrame {
  // Time-offset, relative to the beginning of the video, corresponding to the
  // video frame for this location.
  google.protobuf.Duration time_offset = 1;
  // Likelihood of pornographic content.
  Likelihood pornography_likelihood = 2;
}

// Explicit content annotation (based on per-frame visual signals only).
// If no explicit content has been detected in a frame, no annotations are
// present for that frame.
message ExplicitContentAnnotation {
  // All video frames where explicit content was detected.
  repeated ExplicitContentFrame frames = 1;
}

// Normalized bounding box.
// The normalized vertex coordinates are relative to the original image.
// Range: [0, 1].
message NormalizedBoundingBox {
  // Left X coordinate.
  float left = 1;
  // Top Y coordinate.
  float top = 2;
  // Right X coordinate.
  float right = 3;
  // Bottom Y coordinate.
  float bottom = 4;
}

// For tracking related features.
// An object at time_offset with attributes, and located with
// normalized_bounding_box.
message TimestampedObject {
  // Normalized bounding box in a frame, where the object is located.
  NormalizedBoundingBox normalized_bounding_box = 1;
  // Time-offset, relative to the beginning of the video,
  // corresponding to the video frame for this object.
  google.protobuf.Duration time_offset = 2;
  // Optional. The attributes of the object in the bounding box.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];
  // Optional. The detected landmarks.
  repeated DetectedLandmark landmarks = 4
      [(google.api.field_behavior) = OPTIONAL];
}

// A track of an object instance.
message Track {
  // Video segment of a track.
  VideoSegment segment = 1;
  // The object with timestamp and attributes per frame in the track.
  repeated TimestampedObject timestamped_objects = 2;
  // Optional. Attributes in the track level.
  repeated DetectedAttribute attributes = 3
      [(google.api.field_behavior) = OPTIONAL];
  // Optional. The confidence score of the tracked object.
  float confidence = 4 [(google.api.field_behavior) = OPTIONAL];
}

// A generic detected attribute represented by name in string format.
message DetectedAttribute {
  // The name of the attribute, for example, glasses, dark_glasses, mouth_open.
  // A full list of supported type names will be provided in the document.
  string name = 1;
  // Detected attribute confidence. Range [0, 1].
  float confidence = 2;
  // Text value of the detection result. For example, the value for "HairColor"
  // can be "black", "blonde", etc.
  string value = 3;
}

// Celebrity definition.
message Celebrity {
  // The resource name of the celebrity. Has the format
  // `video-intelligence/kg-mid`, indicating a celebrity from the preloaded
  // gallery. `kg-mid` is the ID in the Google Knowledge Graph, which is unique
  // for the celebrity.
  string name = 1;
  // The celebrity name.
  string display_name = 2;
  // Textual description of additional information about the celebrity, if
  // applicable.
  string description = 3;
}

// The annotation result of a celebrity face track. The RecognizedCelebrity
// field could be empty if the face track does not have any matched
// celebrities.
message CelebrityTrack {
  // The recognized celebrity with confidence score.
  message RecognizedCelebrity {
    // The recognized celebrity.
    Celebrity celebrity = 1;
    // Recognition confidence. Range [0, 1].
    float confidence = 2;
  }

  // Top N match of the celebrities for the face in this track.
  repeated RecognizedCelebrity celebrities = 1;
  // A track of a person's face.
  Track face_track = 3;
}

// Celebrity recognition annotation per video.
message CelebrityRecognitionAnnotation {
  // The tracks detected from the input video, including recognized celebrities
  // and other detected faces in the video.
  repeated CelebrityTrack celebrity_tracks = 1;
}

// A generic detected landmark represented by name in string format and a 2D
// location.
message DetectedLandmark {
  // The name of this landmark, for example, left_hand, right_shoulder.
  string name = 1;
  // The 2D point of the detected landmark using the normalized image
  // coordinate system. The normalized coordinates have the range from 0 to 1.
  NormalizedVertex point = 2;
  // The confidence score of the detected landmark. Range [0, 1].
  float confidence = 3;
}

// Face detection annotation.
message FaceDetectionAnnotation {
  // The face tracks with attributes.
  repeated Track tracks = 3;
  // The thumbnail of a person's face.
  bytes thumbnail = 4;
}

// Person detection annotation per video.
message PersonDetectionAnnotation {
  // The detected tracks of a person.
  repeated Track tracks = 1;
}

// Annotation results for a single video.
message VideoAnnotationResults {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;
  // Video segment on which the annotation is run.
  VideoSegment segment = 10;
  // Topical label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation segment_label_annotations = 2;
  // Presence label annotations on video level or user-specified segment level.
  // There is exactly one element for each unique label. Compared to the
  // existing topical `segment_label_annotations`, this field presents more
  // fine-grained, segment-level labels detected in video content and is made
  // available only when the client sets `LabelDetectionConfig.model` to
  // "builtin/latest" in the request.
  repeated LabelAnnotation segment_presence_label_annotations = 23;
  // Topical label annotations on shot level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation shot_label_annotations = 3;
  // Presence label annotations on shot level. There is exactly one element for
  // each unique label. Compared to the existing topical
  // `shot_label_annotations`, this field presents more fine-grained, shot-level
  // labels detected in video content and is made available only when the client
  // sets `LabelDetectionConfig.model` to "builtin/latest" in the request.
  repeated LabelAnnotation shot_presence_label_annotations = 24;
  // Label annotations on frame level.
  // There is exactly one element for each unique label.
  repeated LabelAnnotation frame_label_annotations = 4;
  // Face detection annotations.
  repeated FaceDetectionAnnotation face_detection_annotations = 13;
  // Shot annotations. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 6;
  // Explicit content annotation.
  ExplicitContentAnnotation explicit_annotation = 7;
  // Speech transcription.
  repeated SpeechTranscription speech_transcriptions = 11;
  // OCR text detection and tracking.
  // Annotations for the list of detected text snippets. Each snippet has a
  // list of frame information associated with it.
  repeated TextAnnotation text_annotations = 12;
  // Annotations for the list of objects detected and tracked in the video.
  repeated ObjectTrackingAnnotation object_annotations = 14;
  // Annotations for the list of logos detected, tracked and recognized in the
  // video.
  repeated LogoRecognitionAnnotation logo_recognition_annotations = 19;
  // Person detection annotations.
  repeated PersonDetectionAnnotation person_detection_annotations = 20;
  // Celebrity recognition annotations.
  CelebrityRecognitionAnnotation celebrity_recognition_annotations = 21;
  // If set, indicates an error. Note that for a single `AnnotateVideoRequest`
  // some videos may succeed and some may fail.
  google.rpc.Status error = 9;
}

// Video annotation response. Included in the `response`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoResponse {
  // Annotation results for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationResults annotation_results = 1;
}

// Annotation progress for a single video.
message VideoAnnotationProgress {
  // Video file location in
  // [Cloud Storage](https://cloud.google.com/storage/).
  string input_uri = 1;
  // Approximate percentage processed thus far. Guaranteed to be
  // 100 when fully processed.
  int32 progress_percent = 2;
  // Time when the request was received.
  google.protobuf.Timestamp start_time = 3;
  // Time of the most recent update.
  google.protobuf.Timestamp update_time = 4;
  // Specifies which feature is being tracked if the request contains more than
  // one feature.
  Feature feature = 5;
  // Specifies which segment is being tracked if the request contains more than
  // one segment.
  VideoSegment segment = 6;
}

// Video annotation progress. Included in the `metadata`
// field of the `Operation` returned by the `GetOperation`
// call of the `google::longrunning::Operations` service.
message AnnotateVideoProgress {
  // Progress metadata for all videos specified in `AnnotateVideoRequest`.
  repeated VideoAnnotationProgress annotation_progress = 1;
}

// Config for SPEECH_TRANSCRIPTION.
message SpeechTranscriptionConfig {
  // Required. The language of the supplied audio as a
  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
  // Example: "en-US".
  // See [Language Support](https://cloud.google.com/speech/docs/languages)
  // for a list of the currently supported language codes.
  string language_code = 1 [(google.api.field_behavior) = REQUIRED];
  // Optional. Maximum number of recognition hypotheses to be returned.
  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
  // within each `SpeechTranscription`. The server may return fewer than
  // `max_alternatives`. Valid values are `0`-`30`. A value of `0` or `1` will
  // return a maximum of one. If omitted, will return a maximum of one.
  int32 max_alternatives = 2 [(google.api.field_behavior) = OPTIONAL];
  // Optional. If set to `true`, the server will attempt to filter out
  // profanities, replacing all but the initial character in each filtered word
  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
  // won't be filtered out.
  bool filter_profanity = 3 [(google.api.field_behavior) = OPTIONAL];
  // Optional. A means to provide context to assist the speech recognition.
  repeated SpeechContext speech_contexts = 4
      [(google.api.field_behavior) = OPTIONAL];
  // Optional. If 'true', adds punctuation to recognition result hypotheses.
  // This feature is only available in select languages. Setting this for
  // requests in other languages has no effect at all. The default 'false' value
  // does not add punctuation to result hypotheses. NOTE: "This is currently
  // offered as an experimental service, complimentary to all users. In the
  // future this may be exclusively available as a premium feature."
  bool enable_automatic_punctuation = 5
      [(google.api.field_behavior) = OPTIONAL];
  // Optional. For file formats, such as MXF or MKV, supporting multiple audio
  // tracks, specify up to two tracks. Default: track 0.
  repeated int32 audio_tracks = 6 [(google.api.field_behavior) = OPTIONAL];
  // Optional. If 'true', enables speaker detection for each recognized word in
  // the top alternative of the recognition result using a speaker_tag provided
  // in the WordInfo.
  // Note: When this is true, we send all the words from the beginning of the
  // audio for the top alternative in every consecutive response.
  // This is done in order to improve our speaker tags as our models learn to
  // identify the speakers in the conversation over time.
  bool enable_speaker_diarization = 7 [(google.api.field_behavior) = OPTIONAL];
  // Optional. If set, specifies the estimated number of speakers in the
  // conversation. If not set, defaults to '2'. Ignored unless
  // enable_speaker_diarization is set to true.
  int32 diarization_speaker_count = 8 [(google.api.field_behavior) = OPTIONAL];
  // Optional. If `true`, the top result includes a list of words and the
  // confidence for those words. If `false`, no word-level confidence
  // information is returned. The default is `false`.
  bool enable_word_confidence = 9 [(google.api.field_behavior) = OPTIONAL];
}
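
// Illustrative only: a `SpeechTranscriptionConfig` sketch in textproto. Only
// `language_code` is required; the other values are placeholders.
//
//   language_code: "en-US"
//   max_alternatives: 2
//   enable_automatic_punctuation: true
//   enable_speaker_diarization: true
//   diarization_speaker_count: 2
//   speech_contexts { phrases: "Video Intelligence" }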

// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
  // Optional. A list of strings containing word and phrase "hints" so that
  // the speech recognition is more likely to recognize them. This can be used
  // to improve the accuracy for specific words and phrases, for example, if
  // specific commands are typically spoken by the user. This can also be used
  // to add additional words to the vocabulary of the recognizer. See
  // [usage limits](https://cloud.google.com/speech/limits#content).
  repeated string phrases = 1 [(google.api.field_behavior) = OPTIONAL];
}

// A speech recognition result corresponding to a portion of the audio.
message SpeechTranscription {
  // May contain one or more recognition hypotheses (up to the maximum specified
  // in `max_alternatives`). These alternatives are ordered in terms of
  // accuracy, with the top (first) alternative being the most probable, as
  // ranked by the recognizer.
  repeated SpeechRecognitionAlternative alternatives = 1;
  // Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
  // language tag of the language in this result. This language code was
  // detected to have the most likelihood of being spoken in the audio.
  string language_code = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
  // Transcript text representing the words that the user spoke.
  string transcript = 1;
  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
  // Output only. A list of word-specific information for each recognized word.
  // Note: When `enable_speaker_diarization` is set to true, you will see all
  // the words from the beginning of the audio.
  repeated WordInfo words = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// Word-specific information for recognized words. Word information is only
// included in the response when certain request parameters are set, such
// as `enable_word_time_offsets`.
message WordInfo {
  // Time offset relative to the beginning of the audio, and
  // corresponding to the start of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration start_time = 1;
  // Time offset relative to the beginning of the audio, and
  // corresponding to the end of the spoken word. This field is only set if
  // `enable_word_time_offsets=true` and only in the top hypothesis. This is an
  // experimental feature and the accuracy of the time offset can vary.
  google.protobuf.Duration end_time = 2;
  // The word corresponding to this set of information.
  string word = 3;
  // Output only. The confidence estimate between 0.0 and 1.0. A higher number
  // indicates an estimated greater likelihood that the recognized words are
  // correct. This field is set only for the top alternative.
  // This field is not guaranteed to be accurate and users should not rely on it
  // to be always provided.
  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
  float confidence = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
  // Output only. A distinct integer value is assigned for every speaker within
  // the audio. This field specifies which one of those speakers was detected to
  // have spoken this word. Value ranges from 1 up to diarization_speaker_count,
  // and is only set if speaker diarization is enabled.
  int32 speaker_tag = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// A vertex represents a 2D point in the image.
// NOTE: the normalized vertex coordinates are relative to the original image
// and range from 0 to 1.
message NormalizedVertex {
  // X coordinate.
  float x = 1;
  // Y coordinate.
  float y = 2;
}

// Normalized bounding polygon for text (that might not be aligned with axis).
// Contains list of the corner points in clockwise order starting from
// top-left corner. For example, for a rectangular bounding box:
// When the text is horizontal it might look like:
//         0----1
//         |    |
//         3----2
//
// When it's clockwise rotated 180 degrees around the top-left corner it
// becomes:
//         2----3
//         |    |
//         1----0
//
// and the vertex order will still be (0, 1, 2, 3). Note that values can be less
// than 0, or greater than 1 due to trigonometric calculations for location of
// the box.
message NormalizedBoundingPoly {
  // Normalized vertices of the bounding polygon.
  repeated NormalizedVertex vertices = 1;
}

// Video segment level annotation results for text detection.
message TextSegment {
  // Video segment where a text snippet was detected.
  VideoSegment segment = 1;
  // Confidence for the track of detected text. It is calculated as the highest
  // over all frames where OCR detected text appears.
  float confidence = 2;
  // Information related to the frames where OCR detected text appears.
  repeated TextFrame frames = 3;
}

// Video frame level annotation results for text annotation (OCR).
// Contains information regarding timestamp and bounding box locations for the
// frames containing detected OCR text snippets.
message TextFrame {
  // Bounding polygon of the detected text for this frame.
  NormalizedBoundingPoly rotated_bounding_box = 1;
  // Timestamp of this frame.
  google.protobuf.Duration time_offset = 2;
}

// Annotations related to one detected OCR text snippet. This will contain the
// corresponding text, confidence value, and frame level information for each
// detection.
message TextAnnotation {
  // The detected text.
  string text = 1;
  // All video segments where OCR detected text appears.
  repeated TextSegment segments = 2;
}

// Video frame level annotations for object detection and tracking. This field
// stores per frame location, time offset, and confidence.
message ObjectTrackingFrame {
  // The normalized bounding box location of this object track for the frame.
  NormalizedBoundingBox normalized_bounding_box = 1;
  // The timestamp of the frame in microseconds.
  google.protobuf.Duration time_offset = 2;
}

// Annotations corresponding to one tracked object.
message ObjectTrackingAnnotation {
  // Different representation of tracking info in non-streaming batch
  // and streaming modes.
  oneof track_info {
    // Non-streaming batch mode ONLY.
    // Each object track corresponds to one video segment where it appears.
    VideoSegment segment = 3;
    // Streaming mode ONLY.
    // In streaming mode, we do not know the end time of a tracked object
    // before it is completed. Hence, there is no VideoSegment info returned.
    // Instead, we provide a unique identifiable integer track_id so that
    // the customers can correlate the results of the ongoing
    // ObjectTrackAnnotation of the same track_id over time.
    int64 track_id = 5;
  }

  // Entity to specify the object category that this track is labeled as.
  Entity entity = 1;
  // Object category's labeling confidence of this track.
  float confidence = 4;
  // Information corresponding to all frames where this object track appears.
  // Non-streaming batch mode: it may be one or multiple ObjectTrackingFrame
  // messages in frames.
  // Streaming mode: it can only be one ObjectTrackingFrame message in frames.
  repeated ObjectTrackingFrame frames = 2;
}

// Annotation corresponding to one detected, tracked and recognized logo class.
message LogoRecognitionAnnotation {
  // Entity category information to specify the logo class that all the logo
  // tracks within this LogoRecognitionAnnotation are recognized as.
  Entity entity = 1;
  // All logo tracks where the recognized logo appears. Each track corresponds
  // to one logo instance appearing in consecutive frames.
  repeated Track tracks = 2;
  // All video segments where the recognized logo appears. There might be
  // multiple instances of the same logo class appearing in one VideoSegment.
  repeated VideoSegment segments = 3;
}

// The top-level message sent by the client for the `StreamingAnnotateVideo`
// method. Multiple `StreamingAnnotateVideoRequest` messages are sent.
// The first message must only contain a `StreamingVideoConfig` message.
// All subsequent messages must only contain `input_content` data.
message StreamingAnnotateVideoRequest {
  // *Required* The streaming request, which is either a streaming config or
  // video content.
  oneof streaming_request {
    // Provides information to the annotator, specifying how to process the
    // request. The first `AnnotateStreamingVideoRequest` message must only
    // contain a `video_config` message.
    StreamingVideoConfig video_config = 1;
    // The video data to be annotated. Chunks of video data are sequentially
    // sent in `StreamingAnnotateVideoRequest` messages. Except for the initial
    // `StreamingAnnotateVideoRequest` message containing only
    // `video_config`, all subsequent `AnnotateStreamingVideoRequest`
    // messages must only contain the `input_content` field.
    // Note: as with all bytes fields, protocol buffers use a pure binary
    // representation (not base64).
    bytes input_content = 2;
  }
}
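
// Illustrative only: a sketch of the message ordering on a
// `StreamingAnnotateVideo` call, in textproto. The first request carries only
// `video_config`; every later request carries only `input_content` (the chunk
// shown below is a placeholder).
//
//   # First request on the stream:
//   video_config {
//     feature: STREAMING_LABEL_DETECTION
//     label_detection_config { stationary_camera: true }
//   }
//
//   # Every subsequent request on the stream:
//   input_content: "<binary video chunk>"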

// Provides information to the annotator that specifies how to process the
// request.
message StreamingVideoConfig {
  // Config for requested annotation feature.
  oneof streaming_config {
    // Config for STREAMING_SHOT_CHANGE_DETECTION.
    StreamingShotChangeDetectionConfig shot_change_detection_config = 2;
    // Config for STREAMING_LABEL_DETECTION.
    StreamingLabelDetectionConfig label_detection_config = 3;
    // Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
    StreamingExplicitContentDetectionConfig explicit_content_detection_config =
        4;
    // Config for STREAMING_OBJECT_TRACKING.
    StreamingObjectTrackingConfig object_tracking_config = 5;
    // Config for STREAMING_AUTOML_ACTION_RECOGNITION.
    StreamingAutomlActionRecognitionConfig automl_action_recognition_config =
        23;
    // Config for STREAMING_AUTOML_CLASSIFICATION.
    StreamingAutomlClassificationConfig automl_classification_config = 21;
    // Config for STREAMING_AUTOML_OBJECT_TRACKING.
    StreamingAutomlObjectTrackingConfig automl_object_tracking_config = 22;
  }

  // Requested annotation feature.
  StreamingFeature feature = 1;
  // Streaming storage option. By default: storage is disabled.
  StreamingStorageConfig storage_config = 30;
}

// `StreamingAnnotateVideoResponse` is the only message returned to the client
// by `StreamingAnnotateVideo`. A series of zero or more
// `StreamingAnnotateVideoResponse` messages are streamed back to the client.
message StreamingAnnotateVideoResponse {
  // If set, returns a [google.rpc.Status][google.rpc.Status] message that
  // specifies the error for the operation.
  google.rpc.Status error = 1;
  // Streaming annotation results.
  StreamingVideoAnnotationResults annotation_results = 2;
  // Google Cloud Storage (GCS) URI that stores annotation results of one
  // streaming session in JSON format.
  // It is the annotation_result_storage_directory
  // from the request followed by '/cloud_project_number-session_id'.
  string annotation_results_uri = 3;
}

// Streaming annotation results corresponding to a portion of the video
// that is currently being processed.
message StreamingVideoAnnotationResults {
  // Shot annotation results. Each shot is represented as a video segment.
  repeated VideoSegment shot_annotations = 1;
  // Label annotation results.
  repeated LabelAnnotation label_annotations = 2;
  // Explicit content annotation results.
  ExplicitContentAnnotation explicit_annotation = 3;
  // Object tracking results.
  repeated ObjectTrackingAnnotation object_annotations = 4;
}

// Config for STREAMING_SHOT_CHANGE_DETECTION.
message StreamingShotChangeDetectionConfig {}

// Config for STREAMING_LABEL_DETECTION.
message StreamingLabelDetectionConfig {
  // Whether the video has been captured from a stationary (i.e. non-moving)
  // camera. When set to true, might improve detection accuracy for moving
  // objects. Default: false.
  bool stationary_camera = 1;
}

// Config for STREAMING_EXPLICIT_CONTENT_DETECTION.
message StreamingExplicitContentDetectionConfig {}

// Config for STREAMING_OBJECT_TRACKING.
message StreamingObjectTrackingConfig {}

// Config for STREAMING_AUTOML_ACTION_RECOGNITION.
message StreamingAutomlActionRecognitionConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_CLASSIFICATION.
message StreamingAutomlClassificationConfig {
  // Resource name of AutoML model.
  // Format:
  // `projects/{project_number}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for STREAMING_AUTOML_OBJECT_TRACKING.
message StreamingAutomlObjectTrackingConfig {
  // Resource name of AutoML model.
  // Format: `projects/{project_id}/locations/{location_id}/models/{model_id}`
  string model_name = 1;
}

// Config for streaming storage option.
message StreamingStorageConfig {
  // Enable streaming storage. Default: false.
  bool enable_storage_annotation_result = 1;
  // Cloud Storage URI to store all annotation results for one client. The
  // client should specify this field as the top-level storage directory.
  // Annotation results of different sessions will be put into different
  // sub-directories denoted by project_name and session_id. All
  // sub-directories will be auto-generated by the service and made accessible
  // to the client in the response proto. URIs must be specified in the
  // following format: `gs://bucket-id/object-id`. `bucket-id` should be a
  // valid Cloud Storage bucket created by the client, and the bucket
  // permissions shall also be configured properly. `object-id` can be an
  // arbitrary string that makes sense to the client.
  // Other URI formats will return an error and cause a Cloud Storage write
  // failure.
  string annotation_result_storage_directory = 3;
}
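
// Illustrative only: a `StreamingStorageConfig` sketch in textproto. The
// bucket and directory names are placeholders.
//
//   enable_storage_annotation_result: true
//   annotation_result_storage_directory: "gs://example-bucket/stream-results"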