Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/insdc/sra.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * INSDC Sequence Read Archive schema | |
29 */ | |
30 version 1; | |
31 | |
32 include 'insdc/seq.vschema'; | |
33 | |
34 | |
35 /*-------------------------------------------------------------------------- | |
36 * types | |
37 */ | |
38 | |
39 /* spotid_t | |
40 * unique id given to every spot ( stored as U32 ) | |
41 */ | |
42 typedef U32 INSDC:SRA:spotid_t; | |
43 | |
44 | |
45 /* spot_ids_found | |
46 * array of 4 U64 elements; type of the SPOT_IDS_FOUND column */ | |
47 typedef U64 INSDC:SRA:spot_ids_found [ 4 ]; | |
48 | |
49 | |
50 /*-------------------------------------------------------------------------- | |
51 * functions | |
52 */ | |
53 | |
54 | |
55 /* format_spot_name | |
56 * given a name format string, X, and Y | |
57 * produce a reconstructed spot name string | |
58 * | |
59 * "name_fmt" [ DATA ] - name format string ( see SYNOPSIS below ) | |
60 * | |
61 * "X" [ DATA ] - X coordinate for spot | |
62 * | |
63 * "Y" [ DATA ] - Y coordinate for spot | |
64 * | |
65 * "spot_name" [ DATA, OPTIONAL ] - potential source of unformatted names | |
66 * | |
67 * SYNOPSIS: | |
68 * "name_fmt" may have any ASCII characters | |
69 * the special character '$' is an escape symbol | |
70 * when followed by a recognized format character, | |
71 * both the '$' and its format character will be | |
72 * replaced with a numeral generated from X and/or Y. | |
73 * | |
74 * when "spot_name" is present and the "name_fmt" row is empty, | |
75 * output is taken verbatim from "spot_name" | |
76 */ | |
77 function | |
78 ascii INSDC:SRA:format_spot_name #1 ( ascii name_fmt , I32 X , I32 Y * ascii spot_name ); | |
79 /* format_spot_name_no_coord - variant of format_spot_name declared without the X and Y parameters */ | |
80 function | |
81 ascii INSDC:SRA:format_spot_name_no_coord #1 ( ascii name_fmt * ascii spot_name ); | |
82 | |
83 | |
84 /*-------------------------------------------------------------------------- | |
85 * spotcoord | |
86 * spot coordinate table | |
87 * gives X and Y and potentially other common coordinates | |
88 */ | |
89 table INSDC:SRA:tbl:spotcoord #1 | |
90 { | |
91 /* X, Y | |
92 * 32 ( or 16 ) bit coordinates within plate region | |
93 * the coordinate system ( zero or one-based ) is unspecified | |
94 */ | |
95 extern default column INSDC:coord:val X = out_x_coord; | |
96 extern default column INSDC:coord:val Y = out_y_coord; | |
97 | |
98 // backward compatibility: readonly U16 views of X and Y, cast from the clipped values below | |
99 extern readonly column U16 X = cast ( x_clip_U16 ); | |
100 extern readonly column U16 Y = cast ( y_clip_U16 ); | |
101 | |
102 // clip signed 32-bit coordinates into the unsigned 16-bit range [ 0, 0xFFFF ] | |
103 INSDC:coord:val x_clip_U16 | |
104 = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_x_coord ); | |
105 INSDC:coord:val y_clip_U16 | |
106 = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_y_coord ); | |
107 | |
108 | |
109 /* INSDC:SRA:tbl:spotcoord virtual productions | |
110 * out_x_coord - source of the X columns; not defined in this table | |
111 * out_y_coord - source of the Y columns; not defined in this table | |
112 */ | |
113 }; | |
114 | |
115 | |
116 /*-------------------------------------------------------------------------- | |
117 * spotname | |
118 * spot name table | |
119 * the name column is normally indexed | |
120 * | |
121 * history: | |
122 * 1.0.1 - split X and Y into spotcoord table | |
123 */ | |
124 table INSDC:SRA:tbl:spotname #1.0.1 = INSDC:SRA:tbl:spotcoord #1 | |
125 { | |
126 /* NAME | |
127 * external name for spot | |
128 */ | |
129 extern column ascii NAME = _out_name; | |
130 | |
131 | |
132 /* SPOT_IDS_FOUND | |
133 * lookup by NAME column | |
134 */ | |
135 readonly column INSDC:SRA:spot_ids_found SPOT_IDS_FOUND | |
136 = spot_ids_found; | |
137 | |
138 | |
139 /* default rules */ | |
140 | |
141 // assemble NAME column output in order of preference: name_fmt with X, Y and spot_name; name_fmt with X and Y; name_fmt alone; finally the unformatted out_spot_name | |
142 ascii _out_name | |
143 = INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord, out_spot_name ) | |
144 | INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord ) | |
145 | INSDC:SRA:format_spot_name_no_coord (out_name_fmt) | |
146 | out_spot_name; | |
147 | |
148 | |
149 /* INSDC:SRA:tbl:spotcoord inherited virtual productions | |
150 * out_x_coord | |
151 * out_y_coord | |
152 */ | |
153 | |
154 /* INSDC:SRA:tbl:spotname virtual productions | |
155 * out_name_fmt - name format string passed to the format_spot_name functions | |
156 * out_spot_name - unformatted spot name; last choice for NAME | |
157 * spot_ids_found - source of the SPOT_IDS_FOUND column | |
158 */ | |
159 }; | |
160 | |
161 | |
162 /*-------------------------------------------------------------------------- | |
163 * spotdesc | |
164 * spot descriptor table | |
165 * | |
166 * history: | |
167 * 1.0.1 - base explicitly upon sequence #1.0.1 | |
168 * 1.0.2 - added alternate taps for in_read_type and in_read_len | |
169 */ | |
170 table INSDC:SRA:tbl:spotdesc #1.0.2 = INSDC:tbl:sequence #1.0.1 | |
171 { | |
172 /* NREADS | |
173 * describes the number of reads within spot | |
174 */ | |
175 extern column U8 NREADS = out_nreads; | |
176 | |
177 | |
178 /* SPOT_LEN | |
179 * length of sequence | |
180 * FIXED_SPOT_LEN | |
181 * non-zero if sequence length is fixed throughout table | |
182 */ | |
183 readonly column INSDC:coord:len SPOT_LEN = spot_len; | |
184 readonly column INSDC:coord:len FIXED_SPOT_LEN = fixed_spot_len; | |
185 | |
186 | |
187 /* TRIM_START | |
188 * TRIM_LEN | |
189 * define the spot segment after applying trimming | |
190 * trimming may be based upon technical segments and read quality; without trim productions the columns fall back to the whole spot ( start 0 or 1, length spot_len ) | |
191 */ | |
192 readonly column INSDC:coord:zero TRIM_START | |
193 = trim_start | |
194 | < INSDC:coord:zero> echo < 0 > (); | |
195 readonly column INSDC:coord:one TRIM_START | |
196 = ( INSDC:coord:one ) < I32 > sum < 1 > ( trim_start ) | |
197 | < INSDC:coord:one> echo < 1 > (); | |
198 readonly column INSDC:coord:len TRIM_LEN | |
199 = trim_len | |
200 | spot_len; | |
201 | |
202 | |
203 /* LABEL | |
204 * LABEL_START, LABEL_LEN | |
205 * column pair for writing read labels | |
206 * the label text for all reads is concatenated to form the LABEL row | |
207 * starting coordinates and lengths delineate labels by read | |
208 * | |
209 * NB - row length for LABEL_START/LEN === NREADS, | |
210 * row length for LABEL === SUM ( LABEL_LEN [ n ] ) for NREADS | |
211 */ | |
212 extern column ascii LABEL = out_label; | |
213 extern column INSDC:coord:zero LABEL_START = out_label_start; | |
214 extern column INSDC:coord:len LABEL_LEN = out_label_len; | |
215 | |
216 // unsigned 16-bit versions, cast from out_label_start / out_label_len | |
217 readonly column U16 LABEL_START = cast ( out_label_start ); | |
218 readonly column U16 LABEL_LEN = cast ( out_label_len ); | |
219 | |
220 | |
221 /* READ_TYPE | |
222 * binary values giving type of a read | |
223 * | |
224 * NB - row length === NREADS | |
225 */ | |
226 extern default column INSDC:SRA:xread_type READ_TYPE = out_read_type; | |
227 /* in_read_type - taken from READ_TYPE, falling back to the alternate tap _alt_in_read_type ( see history 1.0.2 ) */ | |
228 INSDC:SRA:xread_type in_read_type | |
229 = READ_TYPE | |
230 | _alt_in_read_type; | |
231 /* read_type view - out_read_type directly, else xread_type values 0..7 mapped onto their low bit ( 0 or 1 ) */ | |
232 readonly column INSDC:SRA:read_type READ_TYPE | |
233 = out_read_type | |
234 | < INSDC:SRA:xread_type, INSDC:SRA:read_type > map < [ 0,1,2,3,4,5,6,7 ], [ 0,1,0,1,0,1,0,1 ] > ( out_read_type ); | |
235 | |
236 | |
237 /* READ_START | |
238 * READ_LEN | |
239 * define starting coordinates and length of read segments | |
240 * | |
241 * NB - row length === NREADS | |
242 */ | |
243 extern default column INSDC:coord:zero READ_START | |
244 = out_read_start; | |
245 extern column INSDC:coord:one READ_START | |
246 = ( INSDC:coord:one ) < I32 > sum < 1 > ( out_read_start ); | |
247 extern column INSDC:coord:len READ_LEN = out_read_len; | |
248 | |
249 // unsigned 16-bit versions, cast from out_read_start / out_read_len | |
250 readonly column U16 READ_START = cast ( out_read_start ); | |
251 readonly column U16 READ_LEN = cast ( out_read_len ); | |
252 /* in_read_len - taken from READ_LEN, falling back to the alternate tap _alt_in_read_len ( see history 1.0.2 ) */ | |
253 INSDC:coord:len in_read_len | |
254 = READ_LEN | |
255 | _alt_in_read_len; | |
256 | |
257 | |
258 /* READ_FILTER | |
259 * bits indicate usability of sequence | |
260 * always available ( falls back to SRA_READ_FILTER_PASS when out_rd_filter is absent ) | |
261 */ | |
262 extern column INSDC:SRA:read_filter READ_FILTER | |
263 = out_rd_filter | |
264 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_start ); | |
265 | |
266 // RD_FILTER - only available if physical column is present | |
267 extern readonly column INSDC:SRA:read_filter RD_FILTER = out_rd_filter; | |
268 | |
269 | |
270 /* spot_len and fixed_spot_len are used internally; each lists its candidate productions in order of preference */ | |
271 INSDC:coord:len spot_len | |
272 = base_space_spot_len | |
273 | color_space_spot_len | |
274 | align_spot_len; | |
275 INSDC:coord:len fixed_spot_len | |
276 = static_fixed_spot_len | |
277 | base_space_fixed_spot_len | |
278 | color_space_fixed_spot_len; | |
279 | |
280 | |
281 /* INSDC:tbl:sequence inherited virtual productions | |
282 * out_2cs_packed | |
283 * out_2na_packed | |
284 */ | |
285 | |
286 /* INSDC:SRA:tbl:spotdesc productions | |
287 * trim_len | |
288 * out_label | |
289 * out_nreads | |
290 * trim_start | |
291 * out_read_len | |
292 * out_label_len | |
293 * out_rd_filter | |
294 * out_read_type | |
295 * out_read_start | |
296 * out_label_start | |
297 * static_fixed_spot_len | |
298 */ | |
299 }; | |
300 | |
301 /*-------------------------------------------------------------------------- | |
302 * stats | |
303 * run and spot-group statistics | |
304 * | |
305 * history: | |
306 * 1.1.0 - added CMP_BASE_COUNT | |
307 */ | |
308 table INSDC:SRA:tbl:stats #1.1 | |
309 { | |
310 readonly column INSDC:SRA:spotid_t MIN_SPOT_ID | |
311 = min_spot_id | |
312 | < INSDC:SRA:spotid_t > echo < 1 > (); /* fallback: constant 1 */ | |
313 readonly column INSDC:SRA:spotid_t MAX_SPOT_ID | |
314 = max_spot_id | |
315 | cast ( spot_count ); /* fallback: spot_count cast to spotid_t */ | |
316 readonly column U64 | |
317 SPOT_COUNT = spot_count; | |
318 readonly column U64 | |
319 BASE_COUNT = base_count; | |
320 readonly column U64 | |
321 BIO_BASE_COUNT = bio_base_count; | |
322 readonly column U64 CMP_BASE_COUNT | |
323 = cmp_base_count | |
324 | base_count; /* fallback: base_count */ | |
325 | |
326 U8 stats_dummy = in_stats_bin; /* NOTE(review): role of stats_dummy is not evident from this file - confirm */ | |
327 | |
328 /* INSDC:SRA:tbl:stats productions | |
329 * base_count | |
330 * spot_count | |
331 * max_spot_id | |
332 * min_spot_id | |
333 * in_stats_bin | |
334 * bio_base_count | |
335 * cmp_base_count | |
336 */ | |
337 }; | |
338 | |
339 /*-------------------------------------------------------------------------- | |
340 * sra | |
341 * the INSDC SRA table | |
342 * | |
343 * history: | |
344 * 1.0.1 - base explicitly upon spotname #1.0.1 | |
345 * 1.0.2 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 | |
346 * 1.0.3 - base upon spotdesc #1.0.2 | |
347 */ | |
348 | |
349 // platform constants from <insdc/sra.h>; these are the values of the INSDC:SRA:platform_id typed PLATFORM column | |
350 typedef U8 INSDC:SRA:platform_id; | |
351 const INSDC:SRA:platform_id SRA_PLATFORM_UNDEFINED = 0; | |
352 const INSDC:SRA:platform_id SRA_PLATFORM_454 = 1; | |
353 const INSDC:SRA:platform_id SRA_PLATFORM_ILLUMINA = 2; | |
354 const INSDC:SRA:platform_id SRA_PLATFORM_ABSOLID = 3; | |
355 const INSDC:SRA:platform_id SRA_PLATFORM_COMPLETE_GENOMICS = 4; | |
356 const INSDC:SRA:platform_id SRA_PLATFORM_HELICOS = 5; | |
357 const INSDC:SRA:platform_id SRA_PLATFORM_PACBIO_SMRT = 6; | |
358 const INSDC:SRA:platform_id SRA_PLATFORM_ION_TORRENT = 7; | |
359 const INSDC:SRA:platform_id SRA_PLATFORM_CAPILLARY = 8; | |
360 const INSDC:SRA:platform_id SRA_PLATFORM_OXFORD_NANOPORE = 9; | |
361 | |
362 table INSDC:SRA:tbl:sra #1.0.3 = | |
363 INSDC:tbl:sequence #1.0.1, INSDC:SRA:tbl:spotname #1.0.1, | |
364 INSDC:SRA:tbl:spotdesc #1.0.2, INSDC:SRA:tbl:stats #1.1.0 | |
365 { | |
366 /* PLATFORM | |
367 * platform description | |
368 * one version returns a constant defined above | |
369 * while the other returns a textual representation | |
370 */ | |
371 extern column INSDC:SRA:platform_id PLATFORM | |
372 = .PLATFORM | |
373 | out_platform; | |
374 readonly column ascii PLATFORM | |
375 = platform_name; | |
376 | |
377 physical column | |
378 < INSDC:SRA:platform_id > zip_encoding .PLATFORM = PLATFORM; | |
379 | |
380 | |
381 /* SPOT_ID | |
382 * reports spot id of current row ( from physical .SPOT_ID via add_row_id, else the row id itself ) | |
383 */ | |
384 extern column INSDC:SRA:spotid_t SPOT_ID | |
385 = < INSDC:SRA:spotid_t > add_row_id ( .SPOT_ID ) | |
386 | cast ( rowid_64 ); | |
387 I64 rowid_64 = row_id (); | |
388 | |
389 physical column < INSDC:SRA:spotid_t > izip_encoding .SPOT_ID | |
390 = < INSDC:SRA:spotid_t > sub_row_id ( SPOT_ID ); | |
391 | |
392 | |
393 /* SPOT_GROUP | |
394 * a name denoting group membership; defaults to '' ( empty ) when no other source exists | |
395 * used for "barcode" support | |
396 */ | |
397 extern column ascii SPOT_GROUP | |
398 = out_spot_group | |
399 | .SPOT_GROUP | |
400 | < ascii > echo < '' > (); | |
401 | |
402 ascii in_spot_group = SPOT_GROUP; | |
403 | |
404 physical column | |
405 < ascii > zip_encoding < Z_DEFAULT_STRATEGY, Z_BEST_SPEED > .SPOT_GROUP = in_spot_group; | |
406 | |
407 | |
408 /* INSDC:tbl:sequence inherited virtual productions | |
409 * cs_native | |
410 * in_cs_key | |
411 * out_cs_key | |
412 * out_signal | |
413 * in_dna_text | |
414 * out_2cs_bin | |
415 * out_2na_bin | |
416 * out_4na_bin | |
417 * out_dna_text | |
418 * out_x2cs_bin | |
419 * out_x2na_bin | |
420 * in_color_text | |
421 * out_2cs_packed | |
422 * out_2na_packed | |
423 * out_4na_packed | |
424 * out_color_text | |
425 * out_qual_phred | |
426 * out_color_matrix | |
427 */ | |
428 | |
429 /* INSDC:SRA:tbl:spotcoord inherited virtual productions | |
430 * out_x_coord | |
431 * out_y_coord | |
432 */ | |
433 | |
434 /* INSDC:SRA:tbl:spotname inherited virtual productions | |
435 * out_name_fmt | |
436 * out_spot_name | |
437 * spot_ids_found | |
438 */ | |
439 | |
440 /* INSDC:SRA:tbl:spotdesc inherited productions | |
441 * trim_len | |
442 * out_label | |
443 * out_nreads | |
444 * trim_start | |
445 * out_read_len | |
446 * out_label_len | |
447 * out_rd_filter | |
448 * out_read_type | |
449 * out_read_start | |
450 * out_label_start | |
451 * static_fixed_spot_len | |
452 */ | |
453 | |
454 /* INSDC:SRA:tbl:stats inherited productions | |
455 * base_count | |
456 * spot_count | |
457 * max_spot_id | |
458 * min_spot_id | |
459 * in_stats_bin | |
460 * bio_base_count | |
461 * cmp_base_count */ | |
462 | |
463 /* INSDC:SRA:tbl:sra productions | |
464 * out_platform | |
465 * platform_name | |
466 */ | |
467 }; | |