// reads.proto
  // Copyright 2016 Google Inc.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.

  syntax = "proto3";

  package google.genomics.v1;

  import "google/api/annotations.proto";
  import "google/genomics/v1/range.proto";
  import "google/genomics/v1/readalignment.proto";
  import "google/genomics/v1/readgroupset.proto";
  import "google/longrunning/operations.proto";
  import "google/protobuf/empty.proto";
  import "google/protobuf/field_mask.proto";

  // Code-generation options for the C++, Go, and Java targets.
  option cc_enable_arenas = true;
  option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
  option java_multiple_files = true;
  option java_outer_classname = "ReadsProto";
  option java_package = "com.google.genomics.v1";

  // Server-streaming access to reads.
  service StreamingReadService {
    // Returns a stream of all the reads matching the search request, ordered
    // by reference name, position, and ID.
    //
    // Bound to HTTP `POST /v1/reads:stream`; the entire request message is
    // carried in the HTTP body.
    rpc StreamReads(StreamReadsRequest) returns (stream StreamReadsResponse) {
      option (google.api.http) = {
        post: "/v1/reads:stream"
        body: "*"
      };
    }
  }

  // The Readstore. A data store for DNA sequencing Reads.
  service ReadServiceV1 {
    // Creates read group sets by asynchronously importing the provided
    // information.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    //
    // The caller must have WRITE permissions to the dataset.
    //
    // ## Notes on [BAM](https://samtools.github.io/hts-specs/SAMv1.pdf) import
    //
    // - Tags will be converted to strings - tag types are not preserved
    // - Comments (`@CO`) in the input file header will not be preserved
    // - Original header order of references (`@SQ`) will not be preserved
    // - Any reverse stranded unmapped reads will be reverse complemented, and
    //   their qualities (also the "BQ" and "OQ" tags, if any) will be reversed
    // - Unmapped reads will be stripped of positional information (reference
    //   name and position)
    rpc ImportReadGroupSets(ImportReadGroupSetsRequest)
        returns (google.longrunning.Operation) {
      option (google.api.http) = {
        post: "/v1/readgroupsets:import"
        body: "*"
      };
    }

    // Exports a read group set to a BAM file in Google Cloud Storage.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    //
    // Note that currently there may be some differences between exported BAM
    // files and the original BAM file at the time of import. See
    // [ImportReadGroupSets][google.genomics.v1.ReadServiceV1.ImportReadGroupSets]
    // for caveats.
    rpc ExportReadGroupSet(ExportReadGroupSetRequest)
        returns (google.longrunning.Operation) {
      option (google.api.http) = {
        post: "/v1/readgroupsets/{read_group_set_id}:export"
        body: "*"
      };
    }

    // Searches for read group sets matching the criteria.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    //
    // Implements
    // [GlobalAllianceApi.searchReadGroupSets](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/readmethods.avdl#L135).
    rpc SearchReadGroupSets(SearchReadGroupSetsRequest)
        returns (SearchReadGroupSetsResponse) {
      option (google.api.http) = {
        post: "/v1/readgroupsets/search"
        body: "*"
      };
    }

    // Updates a read group set.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    //
    // This method supports patch semantics.
    rpc UpdateReadGroupSet(UpdateReadGroupSetRequest) returns (ReadGroupSet) {
      option (google.api.http) = {
        patch: "/v1/readgroupsets/{read_group_set_id}"
        body: "read_group_set"
      };
    }

    // Deletes a read group set.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    rpc DeleteReadGroupSet(DeleteReadGroupSetRequest)
        returns (google.protobuf.Empty) {
      option (google.api.http) = {
        delete: "/v1/readgroupsets/{read_group_set_id}"
      };
    }

    // Gets a read group set by ID.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    rpc GetReadGroupSet(GetReadGroupSetRequest) returns (ReadGroupSet) {
      option (google.api.http) = {
        get: "/v1/readgroupsets/{read_group_set_id}"
      };
    }

    // Lists fixed width coverage buckets for a read group set, each of which
    // corresponds to a range of a reference sequence. Each bucket summarizes
    // coverage information across its corresponding genomic range.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    //
    // Coverage is defined as the number of reads which are aligned to a given
    // base in the reference sequence. Coverage buckets are available at several
    // precomputed bucket widths, enabling retrieval of various coverage 'zoom
    // levels'. The caller must have READ permissions for the target read group
    // set.
    rpc ListCoverageBuckets(ListCoverageBucketsRequest)
        returns (ListCoverageBucketsResponse) {
      option (google.api.http) = {
        get: "/v1/readgroupsets/{read_group_set_id}/coveragebuckets"
      };
    }

    // Gets a list of reads for one or more read group sets.
    //
    // For the definitions of read group sets and other genomics resources, see
    // [Fundamentals of Google
    // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
    //
    // Reads search operates over a genomic coordinate space of reference
    // sequence & position defined over the reference sequences to which the
    // requested read group sets are aligned.
    //
    // If a target positional range is specified, search returns all reads whose
    // alignment to the reference genome overlap the range. A query which
    // specifies only read group set IDs yields all reads in those read group
    // sets, including unmapped reads.
    //
    // All reads returned (including reads on subsequent pages) are ordered by
    // genomic coordinate (by reference sequence, then position). Reads with
    // equivalent genomic coordinates are returned in an unspecified order. This
    // order is consistent, such that two queries for the same content
    // (regardless of page size) yield reads in the same order across their
    // respective streams of paginated responses.
    //
    // Implements
    // [GlobalAllianceApi.searchReads](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/readmethods.avdl#L85).
    rpc SearchReads(SearchReadsRequest) returns (SearchReadsResponse) {
      option (google.api.http) = {
        post: "/v1/reads/search"
        body: "*"
      };
    }
  }

  // The read group set search request.
  message SearchReadGroupSetsRequest {
    // Restricts this query to read group sets within the given datasets. At
    // least one ID must be provided.
    repeated string dataset_ids = 1;

    // Only return read group sets for which a substring of the name matches
    // this string.
    string name = 3;

    // The continuation token, which is used to page through large result sets.
    // To get the next page of results, set this parameter to the value of
    // `nextPageToken` from the previous response.
    string page_token = 2;

    // The maximum number of results to return in a single page. If unspecified,
    // defaults to 256. The maximum value is 1024.
    int32 page_size = 4;
  }

  // The read group set search response.
  message SearchReadGroupSetsResponse {
    // The list of matching read group sets.
    repeated ReadGroupSet read_group_sets = 1;

    // The continuation token, which is used to page through large result sets.
    // Provide this value in a subsequent request to return the next page of
    // results. This field will be empty if there aren't any additional results.
    string next_page_token = 2;
  }

  // The read group set import request.
  message ImportReadGroupSetsRequest {
    // How imported read groups are partitioned into read group sets.
    enum PartitionStrategy {
      // No strategy was specified.
      // NOTE(review): presumably handled as `PER_FILE_PER_SAMPLE`, which is
      // documented below as the default behavior - confirm with the service.
      PARTITION_STRATEGY_UNSPECIFIED = 0;

      // In most cases, this strategy yields one read group set per file. This
      // is the default behavior.
      //
      // Allocate one read group set per file per sample. For BAM files, read
      // groups are considered to share a sample if they have identical sample
      // names. Furthermore, all reads for each file which do not belong to a
      // read group, if any, will be grouped into a single read group set
      // per-file.
      PER_FILE_PER_SAMPLE = 1;

      // Includes all read groups in all imported files into a single read group
      // set. Requires that the headers for all imported files are equivalent.
      // All reads which do not belong to a read group, if any, will be grouped
      // into a separate read group set.
      MERGE_ALL = 2;
    }

    // NOTE(review): field number 3 is not used by this message. If it belonged
    // to a removed field, declare `reserved 3;` so it cannot be reused -
    // confirm against the schema history.

    // Required. The ID of the dataset these read group sets will belong to. The
    // caller must have WRITE permissions to this dataset.
    string dataset_id = 1;

    // The reference set to which the imported read group sets are aligned, if
    // any. The reference names of this reference set must be a superset of
    // those found in the imported file headers. If no reference set id is
    // provided, a best effort is made to associate with a matching reference
    // set.
    string reference_set_id = 4;

    // A list of URIs pointing at [BAM
    // files](https://samtools.github.io/hts-specs/SAMv1.pdf)
    // in Google Cloud Storage.
    // Those URIs can include wildcards (*), but do not add or remove
    // matching files before import has completed.
    //
    // Note that Google Cloud Storage object listing is only eventually
    // consistent: files added may not be immediately visible to
    // everyone. Thus, if using a wildcard it is preferable not to start
    // the import immediately after the files are created.
    repeated string source_uris = 2;

    // The partition strategy describes how read groups are partitioned into
    // read group sets.
    PartitionStrategy partition_strategy = 5;
  }

  // The read group set import response.
  message ImportReadGroupSetsResponse {
    // IDs of the read group sets that were created.
    repeated string read_group_set_ids = 1;
  }

  // The read group set export request.
  message ExportReadGroupSetRequest {
    // Required. The Google Cloud project ID that owns this
    // export. The caller must have WRITE access to this project.
    string project_id = 1;

    // Required. A Google Cloud Storage URI for the exported BAM file.
    // The currently authenticated user must have write access to the new file.
    // An error will be returned if the URI already contains data.
    string export_uri = 2;

    // Required. The ID of the read group set to export. The caller must have
    // READ access to this read group set.
    string read_group_set_id = 3;

    // The reference names to export. If this is not specified, all reference
    // sequences, including unmapped reads, are exported.
    // Use `*` to export only unmapped reads.
    repeated string reference_names = 4;
  }

  // The read group set update request.
  message UpdateReadGroupSetRequest {
    // The ID of the read group set to be updated. The caller must have WRITE
    // permissions to the dataset associated with this read group set.
    string read_group_set_id = 1;

    // The new read group set data. See `updateMask` for details on mutability
    // of fields.
    ReadGroupSet read_group_set = 2;

    // An optional mask specifying which fields to update. Supported fields:
    //
    // * [name][google.genomics.v1.ReadGroupSet.name].
    // * [referenceSetId][google.genomics.v1.ReadGroupSet.reference_set_id].
    //
    // Leaving `updateMask` unset is equivalent to specifying all mutable
    // fields.
    google.protobuf.FieldMask update_mask = 3;
  }

  // The read group set delete request.
  message DeleteReadGroupSetRequest {
    // The ID of the read group set to be deleted. The caller must have WRITE
    // permissions to the dataset associated with this read group set.
    string read_group_set_id = 1;
  }

  // The read group set get request.
  message GetReadGroupSetRequest {
    // The ID of the read group set.
    string read_group_set_id = 1;
  }

  // The coverage bucket list request.
  message ListCoverageBucketsRequest {
    // NOTE(review): field number 2 is not used by this message. If it belonged
    // to a removed field, declare `reserved 2;` so it cannot be reused -
    // confirm against the schema history.

    // Required. The ID of the read group set over which coverage is requested.
    string read_group_set_id = 1;

    // The name of the reference to query, within the reference set associated
    // with this query. Optional.
    string reference_name = 3;

    // The start position of the range on the reference, 0-based inclusive. If
    // specified, `referenceName` must also be specified. Defaults to 0.
    int64 start = 4;

    // The end position of the range on the reference, 0-based exclusive. If
    // specified, `referenceName` must also be specified. If unset or 0,
    // defaults to the length of the reference.
    int64 end = 5;

    // The desired width of each reported coverage bucket in base pairs. This
    // will be rounded down to the nearest precomputed bucket width; the value
    // of which is returned as `bucketWidth` in the response. Defaults
    // to infinity (each bucket spans an entire reference sequence) or the
    // length of the target range, if specified. The smallest precomputed
    // `bucketWidth` is currently 2048 base pairs; this is subject to
    // change.
    int64 target_bucket_width = 6;

    // The continuation token, which is used to page through large result sets.
    // To get the next page of results, set this parameter to the value of
    // `nextPageToken` from the previous response.
    string page_token = 7;

    // The maximum number of results to return in a single page. If unspecified,
    // defaults to 1024. The maximum value is 2048.
    int32 page_size = 8;
  }

  // A bucket over which read coverage has been precomputed. A bucket
  // corresponds to a specific range of the reference sequence.
  message CoverageBucket {
    // The genomic coordinate range spanned by this bucket.
    Range range = 1;

    // The average number of reads which are aligned to each individual
    // reference base in this bucket.
    float mean_coverage = 2;
  }

  // The coverage bucket list response.
  message ListCoverageBucketsResponse {
    // The length of each coverage bucket in base pairs. Note that buckets at
    // the end of a reference sequence may be shorter. This value is omitted if
    // the bucket width is infinity (the default behaviour, with no range or
    // `targetBucketWidth`).
    int64 bucket_width = 1;

    // The coverage buckets. The list of buckets is sparse; a bucket with 0
    // overlapping reads is not returned. A bucket never crosses more than one
    // reference sequence. Each bucket has width `bucketWidth`, unless
    // its end is the end of the reference sequence.
    repeated CoverageBucket coverage_buckets = 2;

    // The continuation token, which is used to page through large result sets.
    // Provide this value in a subsequent request to return the next page of
    // results. This field will be empty if there aren't any additional results.
    string next_page_token = 3;
  }

  // The read search request.
  message SearchReadsRequest {
    // NOTE(review): field numbers 2 and 6 are not used by this message. If
    // they belonged to removed fields, declare `reserved 2, 6;` so they cannot
    // be reused - confirm against the schema history.

    // The IDs of the read group sets within which to search for reads. All
    // specified read group sets must be aligned against a common set of
    // reference sequences; this defines the genomic coordinates for the query.
    // Must specify one of `readGroupSetIds` or `readGroupIds`.
    repeated string read_group_set_ids = 1;

    // The IDs of the read groups within which to search for reads. All
    // specified read groups must belong to the same read group sets. Must
    // specify one of `readGroupSetIds` or `readGroupIds`.
    repeated string read_group_ids = 5;

    // The reference sequence name, for example `chr1`, `1`, or `chrX`. If set
    // to `*`, only unmapped reads are returned. If unspecified, all reads
    // (mapped and unmapped) are returned.
    string reference_name = 7;

    // The start position of the range on the reference, 0-based inclusive. If
    // specified, `referenceName` must also be specified.
    int64 start = 8;

    // The end position of the range on the reference, 0-based exclusive. If
    // specified, `referenceName` must also be specified.
    int64 end = 9;

    // The continuation token, which is used to page through large result sets.
    // To get the next page of results, set this parameter to the value of
    // `nextPageToken` from the previous response.
    string page_token = 3;

    // The maximum number of results to return in a single page. If unspecified,
    // defaults to 256. The maximum value is 2048.
    int32 page_size = 4;
  }

  // The read search response.
  message SearchReadsResponse {
    // The list of matching alignments sorted by mapped genomic coordinate,
    // if any, ascending in position within the same reference. Unmapped reads,
    // which have no position, are returned contiguously and are sorted in
    // ascending lexicographic order by fragment name.
    repeated Read alignments = 1;

    // The continuation token, which is used to page through large result sets.
    // Provide this value in a subsequent request to return the next page of
    // results. This field will be empty if there aren't any additional results.
    string next_page_token = 2;
  }

  // The stream reads request.
  message StreamReadsRequest {
    // The Google Cloud project ID which will be billed
    // for this access. The caller must have WRITE access to this project.
    // Required.
    string project_id = 1;

    // The ID of the read group set from which to stream reads.
    string read_group_set_id = 2;

    // The reference sequence name, for example `chr1`,
    // `1`, or `chrX`. If set to `*`, only unmapped reads are
    // returned.
    string reference_name = 3;

    // The start position of the range on the reference, 0-based inclusive. If
    // specified, `referenceName` must also be specified.
    int64 start = 4;

    // The end position of the range on the reference, 0-based exclusive. If
    // specified, `referenceName` must also be specified.
    int64 end = 5;

    // Restricts results to a shard containing approximately `1/totalShards`
    // of the normal response payload for this query. Results from a sharded
    // request are disjoint from those returned by all queries which differ only
    // in their shard parameter. A shard may yield 0 results; this is especially
    // likely for large values of `totalShards`.
    //
    // Valid values are `[0, totalShards)`.
    int32 shard = 6;

    // Specifying `totalShards` causes a disjoint subset of the normal response
    // payload to be returned for each query with a unique `shard` parameter
    // specified. A best effort is made to yield equally sized shards. Sharding
    // can be used to distribute processing amongst workers, where each worker
    // is assigned a unique `shard` number and all workers specify the same
    // `totalShards` number. The union of reads returned for all sharded queries
    // `[0, totalShards)` is equal to those returned by a single unsharded
    // query.
    //
    // Queries for different values of `totalShards` with common divisors will
    // share shard boundaries. For example, streaming `shard` 2 of 5
    // `totalShards` yields the same results as streaming `shard`s 4 and 5 of 10
    // `totalShards`. This property can be leveraged for adaptive retries.
    int32 total_shards = 7;
  }

  // The stream reads response. One message of this type is one element of the
  // response stream.
  message StreamReadsResponse {
    // The reads (alignments) carried by this response message.
    repeated Read alignments = 1;
  }