comparison libs/sratoolkit.2.8.0-centos_linux64/schema/insdc/sra.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * INSDC Sequence Read Archive schema
29 */
30 version 1;
31
32 include 'insdc/seq.vschema';
33
34
35 /*--------------------------------------------------------------------------
36 * types
37 */
38
39 /* spotid_t
40 * unique id given to every spot
   * declared as a U32
   * used in this file as the element type of the SPOT_ID column ( table sra )
   * and of the MIN_SPOT_ID / MAX_SPOT_ID columns ( table stats )
41 */
42 typedef U32 INSDC:SRA:spotid_t;
43
44
45 /* spot_ids_found
   * fixed-size vector of 4 U64 elements
   * element type of the read-only SPOT_IDS_FOUND column of table spotname,
   * which is described there as a lookup by the NAME column
   * NOTE(review): the meaning of each of the 4 elements is not stated
   * in this file - confirm against the code that fills the column
46 */
47 typedef U64 INSDC:SRA:spot_ids_found [ 4 ];
48
49
50 /*--------------------------------------------------------------------------
51 * functions
52 */
53
54
55 /* format_spot_name
56 * given a name format string, X, and Y
57 * produce a reconstructed spot name string
58 *
59 * "name_fmt" [ DATA ] - name format string ( see format explanation below )
60 *
61 * "X" [ DATA ] - X coordinate for spot
62 *
63 * "Y" [ DATA ] - Y coordinate for spot
64 *
65 * "spot_name" [ DATA, OPTIONAL ] - potential source of unformatted names
66 *
67 * SYNOPSIS:
68 * "name_fmt" may have any ASCII characters
69 * the special character '$' is an escape symbol
70 * when followed by a recognized format character,
71 * both the '$' and its format character will be
72 * replaced with a numeral generated from X and/or Y.
73 *
74 * when "spot_name" is present and the "name_fmt" row is empty,
75 * output is taken verbatim from "spot_name"
   *
   * NOTES:
   * the result and both string inputs are typed "ascii"; X and Y are typed I32
   *
   * in the declaration, "spot_name" follows the '*' separator, matching its
   * OPTIONAL marking above; table spotname calls this function both with
   * and without that fourth argument
   *
   * NOTE(review): the set of recognized format characters is not listed
   * in this file - confirm against the function's implementation
76 */
77 function
78 ascii INSDC:SRA:format_spot_name #1 ( ascii name_fmt , I32 X , I32 Y * ascii spot_name );
79
/* format_spot_name_no_coord
 * variant of format_spot_name that takes no X / Y coordinate inputs
 *
 * "name_fmt" [ DATA ] - name format string
 *
 * "spot_name" [ DATA, OPTIONAL ] - declared after the '*' separator,
 * in the same position as the optional input of format_spot_name
 *
 * used by table spotname as the third alternative when assembling NAME
 *
 * NOTE(review): how '$' escapes are treated without coordinates is not
 * described in this file - confirm against the function's implementation
 */
80 function
81 ascii INSDC:SRA:format_spot_name_no_coord #1 ( ascii name_fmt * ascii spot_name );
82
83
84 /*--------------------------------------------------------------------------
85 * spotcoord
86 * spot coordinate table
87 * gives X and Y and potentially other common coordinates
   *
   * the table body defines only columns and clipping rules; the coordinate
   * values themselves come from the virtual productions out_x_coord and
   * out_y_coord, which are listed at the end of the body and are not
   * defined here
88 */
89 table INSDC:SRA:tbl:spotcoord #1
90 {
91 /* X, Y
92 * 32 ( or 16 ) bit coordinates within plate region
93 * the coordinate system ( zero or one-based ) is unspecified
   * the default ( INSDC:coord:val ) columns read out_x_coord / out_y_coord directly
94 */
95 extern default column INSDC:coord:val X = out_x_coord;
96 extern default column INSDC:coord:val Y = out_y_coord;
97
98 // backward compatibility for 16-bit unsigned coordinates
   // these read-only U16 columns are cast from the clipped productions defined below
99 extern readonly column U16 X = cast ( x_clip_U16 );
100 extern readonly column U16 Y = cast ( y_clip_U16 );
101
102 // clip signed 32-bit coordinates to unsigned 16-bit
    // each value is limited to the range 0 .. 0xFFFF before the U16 cast above is applied
103 INSDC:coord:val x_clip_U16
104 = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_x_coord );
105 INSDC:coord:val y_clip_U16
106 = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_y_coord );
107
108
109 /* INSDC:SRA:tbl:spotcoord virtual productions
110 * out_x_coord
111 * out_y_coord
112 */
113 };
114
115
116 /*--------------------------------------------------------------------------
117 * spotname
118 * spot name table
119 * the name column is normally indexed
    * based upon spotcoord #1, whose out_x_coord / out_y_coord productions
    * are used below when assembling the name
120 *
121 * history:
122 * 1.0.1 - split X and Y into spotcoord table
123 */
124 table INSDC:SRA:tbl:spotname #1.0.1 = INSDC:SRA:tbl:spotcoord #1
125 {
126 /* NAME
127 * external name for spot
    * read from the _out_name production defined under "default rules"
128 */
129 extern column ascii NAME = _out_name;
130
131
132 /* SPOT_IDS_FOUND
133 * lookup by NAME column
    * read-only; value comes from the virtual production spot_ids_found
134 */
135 readonly column INSDC:SRA:spot_ids_found SPOT_IDS_FOUND
136 = spot_ids_found;
137
138
139 /* default rules */
140
141 // assemble NAME column output in order of preference
    // alternatives, most preferred first:
    //   1. format_spot_name with name format, X, Y and the optional spot name
    //   2. format_spot_name with name format, X and Y only
    //   3. format_spot_name_no_coord with the name format only
    //   4. the out_spot_name production taken as-is
142 ascii _out_name
143 = INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord, out_spot_name )
144 | INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord )
145 | INSDC:SRA:format_spot_name_no_coord (out_name_fmt)
146 | out_spot_name;
147
148
149 /* INSDC:SRA:tbl:spotcoord inherited virtual productions
150 * out_x_coord
151 * out_y_coord
152 */
153
154 /* INSDC:SRA:tbl:spotname virtual productions
155 * out_name_fmt
156 * out_spot_name
157 * spot_ids_found
158 */
159 };
160
161
162 /*--------------------------------------------------------------------------
163 * spotdesc
164 * spot descriptor table
    * describes how a spot is divided into reads: number of reads, their
    * types, start coordinates, lengths, labels, filter bits and trimming
165 *
166 * history:
167 * 1.0.1 - base explicitly upon sequence #1.0.1
168 * 1.0.2 - added alternate taps for in_read_type and in_read_len
169 */
170 table INSDC:SRA:tbl:spotdesc #1.0.2 = INSDC:tbl:sequence #1.0.1
171 {
172 /* NREADS
173 * describes the number of reads within spot
174 */
175 extern column U8 NREADS = out_nreads;
176
177
178 /* SPOT_LEN
179 * length of sequence
180 * FIXED_SPOT_LEN
181 * non-zero if sequence length is fixed throughout table
    * both are read-only and backed by the internal spot_len / fixed_spot_len
    * productions defined near the end of this table
182 */
183 readonly column INSDC:coord:len SPOT_LEN = spot_len;
184 readonly column INSDC:coord:len FIXED_SPOT_LEN = fixed_spot_len;
185
186
187 /* TRIM_START
188 * TRIM_LEN
189 * define the spot segment after applying trimming
190 * trimming may be based upon technical segments and read quality
    *
    * TRIM_START is offered zero-based and one-based:
    *  zero-based uses trim_start, with a constant 0 as the alternative
    *  one-based uses trim_start + 1, with a constant 1 as the alternative
    * TRIM_LEN uses trim_len, with spot_len as the alternative
191 */
192 readonly column INSDC:coord:zero TRIM_START
193 = trim_start
194 | < INSDC:coord:zero> echo < 0 > ();
195 readonly column INSDC:coord:one TRIM_START
196 = ( INSDC:coord:one ) < I32 > sum < 1 > ( trim_start )
197 | < INSDC:coord:one> echo < 1 > ();
198 readonly column INSDC:coord:len TRIM_LEN
199 = trim_len
200 | spot_len;
201
202
203 /* LABEL
204 * LABEL_START, LABEL_LEN
205 * column pair for writing read labels
206 * the label text for all reads is concatenated to form the LABEL row
207 * starting coordinates and lengths delineate labels by read
208 *
209 * NB - row length for LABEL_START/LEN === NREADS,
210 * row length for LABEL === SUM ( LABEL_LEN [ n ] ) for NREADS
211 */
212 extern column ascii LABEL = out_label;
213 extern column INSDC:coord:zero LABEL_START = out_label_start;
214 extern column INSDC:coord:len LABEL_LEN = out_label_len;
215
216 // 16-bit versions
    // read-only U16 casts of the same out_label_start / out_label_len productions
217 readonly column U16 LABEL_START = cast ( out_label_start );
218 readonly column U16 LABEL_LEN = cast ( out_label_len );
219
220
221 /* READ_TYPE
222 * binary values giving type of a read
223 *
224 * NB - row length === NREADS
    *
    * the default column is typed INSDC:SRA:xread_type; a read-only
    * INSDC:SRA:read_type column of the same name is also provided
225 */
226 extern default column INSDC:SRA:xread_type READ_TYPE = out_read_type;
227
    // input tap: takes the READ_TYPE column, with _alt_in_read_type as the alternative
    // ( the "alternate tap" mentioned in the 1.0.2 history entry )
228 INSDC:SRA:xread_type in_read_type
229 = READ_TYPE
230 | _alt_in_read_type;
231
    // read_type version: uses out_read_type directly, or else maps xread_type
    // values 0..7 onto 0,1,0,1,0,1,0,1 - even inputs give 0, odd inputs give 1
232 readonly column INSDC:SRA:read_type READ_TYPE
233 = out_read_type
234 | < INSDC:SRA:xread_type, INSDC:SRA:read_type > map < [ 0,1,2,3,4,5,6,7 ], [ 0,1,0,1,0,1,0,1 ] > ( out_read_type );
235
236
237 /* READ_START
238 * READ_LEN
239 * define starting coordinates and length of read segments
240 *
241 * NB - row length === NREADS
    *
    * READ_START defaults to zero-based; the one-based version is
    * out_read_start + 1
242 */
243 extern default column INSDC:coord:zero READ_START
244 = out_read_start;
245 extern column INSDC:coord:one READ_START
246 = ( INSDC:coord:one ) < I32 > sum < 1 > ( out_read_start );
247 extern column INSDC:coord:len READ_LEN = out_read_len;
248
249 // 16-bit versions
    // read-only U16 casts of out_read_start / out_read_len
250 readonly column U16 READ_START = cast ( out_read_start );
251 readonly column U16 READ_LEN = cast ( out_read_len );
252
    // input tap: takes the READ_LEN column, with _alt_in_read_len as the alternative
253 INSDC:coord:len in_read_len
254 = READ_LEN
255 | _alt_in_read_len;
256
257
258 /* READ_FILTER
259 * bits indicate usability of sequence
260 * always available
    * uses out_rd_filter, with SRA_READ_FILTER_PASS echoed as the alternative
    * NOTE(review): out_read_start is passed to echo, presumably so that the
    * generated row has one element per read - confirm
261 */
262 extern column INSDC:SRA:read_filter READ_FILTER
263 = out_rd_filter
264 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_start );
265
266 // RD_FILTER - only available if physical column is present
    // unlike READ_FILTER above, this column has no alternative after out_rd_filter
267 extern readonly column INSDC:SRA:read_filter RD_FILTER = out_rd_filter;
268
269
270 /* spot_len is used internally */
    // spot_len alternatives, in listed order: base space, color space, alignment
271 INSDC:coord:len spot_len
272 = base_space_spot_len
273 | color_space_spot_len
274 | align_spot_len;
    // fixed_spot_len alternatives, in listed order: static, base space, color space
275 INSDC:coord:len fixed_spot_len
276 = static_fixed_spot_len
277 | base_space_fixed_spot_len
278 | color_space_fixed_spot_len;
279
280
281 /* INSDC:tbl:sequence inherited virtual productions
282 * out_2cs_packed
283 * out_2na_packed
284 */
285
286 /* INSDC:SRA:tbl:spotdesc productions
287 * trim_len
288 * out_label
289 * out_nreads
290 * trim_start
291 * out_read_len
292 * out_label_len
293 * out_rd_filter
294 * out_read_type
295 * out_read_start
296 * out_label_start
297 * static_fixed_spot_len
    *
    * NOTE(review): the table body also references _alt_in_read_type,
    * _alt_in_read_len, base_space_spot_len, color_space_spot_len,
    * align_spot_len, base_space_fixed_spot_len and
    * color_space_fixed_spot_len, which are neither defined in this body
    * nor listed above - presumably supplied by INSDC:tbl:sequence or by
    * inheriting tables; confirm
298 */
299 };
300
301 /*--------------------------------------------------------------------------
302 * stats
303 * run and spot-group statistics
    * every column of this table is read-only
304 *
305 * history:
306 * 1.1.0 - added CMP_BASE_COUNT
307 */
308 table INSDC:SRA:tbl:stats #1.1
309 {
    // MIN_SPOT_ID uses min_spot_id, with a constant 1 as the alternative
310 readonly column INSDC:SRA:spotid_t MIN_SPOT_ID
311 = min_spot_id
312 | < INSDC:SRA:spotid_t > echo < 1 > ();
    // MAX_SPOT_ID uses max_spot_id, with spot_count cast to spotid_t as the alternative
313 readonly column INSDC:SRA:spotid_t MAX_SPOT_ID
314 = max_spot_id
315 | cast ( spot_count );
    // the three counters below are U64 and have no alternative source
316 readonly column U64
317 SPOT_COUNT = spot_count;
318 readonly column U64
319 BASE_COUNT = base_count;
320 readonly column U64
321 BIO_BASE_COUNT = bio_base_count;
    // CMP_BASE_COUNT ( added in 1.1.0 ) uses cmp_base_count, with base_count as the alternative
322 readonly column U64 CMP_BASE_COUNT
323 = cmp_base_count
324 | base_count;
325
    // NOTE(review): stats_dummy is not read by any column in this table; it appears
    // to exist only so that the in_stats_bin production is referenced - confirm
326 U8 stats_dummy = in_stats_bin;
327
328 /* INSDC:SRA:tbl:stats productions
329 * base_count
330 * spot_count
331 * max_spot_id
332 * min_spot_id
333 * in_stats_bin
334 * bio_base_count
335 * cmp_base_count
336 */
337 };
338
339 /*--------------------------------------------------------------------------
340 * sra
341 * the INSDC SRA table
342 *
343 * history:
344 * 1.0.1 - base explicitly upon spotname #1.0.1
345 * 1.0.2 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
346 * 1.0.3 - base upon spotdesc #1.0.2
347 */
348
349 // platform constants from <insdc/sra.h>
    // INSDC:SRA:platform_id is a U8; it is the element type of the PLATFORM column
    // of table INSDC:SRA:tbl:sra, which stores one of the constants below.
    // values run consecutively from 0 ( undefined ) to 9
    // NOTE(review): the numeric values are expected to match <insdc/sra.h> - confirm when adding a platform
350 typedef U8 INSDC:SRA:platform_id;
351 const INSDC:SRA:platform_id SRA_PLATFORM_UNDEFINED = 0;
352 const INSDC:SRA:platform_id SRA_PLATFORM_454 = 1;
353 const INSDC:SRA:platform_id SRA_PLATFORM_ILLUMINA = 2;
354 const INSDC:SRA:platform_id SRA_PLATFORM_ABSOLID = 3;
355 const INSDC:SRA:platform_id SRA_PLATFORM_COMPLETE_GENOMICS = 4;
356 const INSDC:SRA:platform_id SRA_PLATFORM_HELICOS = 5;
357 const INSDC:SRA:platform_id SRA_PLATFORM_PACBIO_SMRT = 6;
358 const INSDC:SRA:platform_id SRA_PLATFORM_ION_TORRENT = 7;
359 const INSDC:SRA:platform_id SRA_PLATFORM_CAPILLARY = 8;
360 const INSDC:SRA:platform_id SRA_PLATFORM_OXFORD_NANOPORE = 9;
361
    // the INSDC SRA table: combines sequence, spotname, spotdesc and stats,
    // and adds the PLATFORM, SPOT_ID and SPOT_GROUP columns with their physical storage
362 table INSDC:SRA:tbl:sra #1.0.3 =
363 INSDC:tbl:sequence #1.0.1, INSDC:SRA:tbl:spotname #1.0.1,
364 INSDC:SRA:tbl:spotdesc #1.0.2, INSDC:SRA:tbl:stats #1.1.0
365 {
366 /* PLATFORM
367 * platform description
368 * one version returns a constant defined above
369 * while the other returns a textual representation
    *
    * the platform_id version uses the physical .PLATFORM column, with the
    * out_platform production as the alternative
    * the ascii version is read-only and comes from platform_name
370 */
371 extern column INSDC:SRA:platform_id PLATFORM
372 = .PLATFORM
373 | out_platform;
374 readonly column ascii PLATFORM
375 = platform_name;
376
    // physical storage for PLATFORM, declared with zip_encoding
377 physical column
378 < INSDC:SRA:platform_id > zip_encoding .PLATFORM = PLATFORM;
379
380
381 /* SPOT_ID
382 * reports spot id of current row
    * uses add_row_id applied to the physical .SPOT_ID column, with the
    * row id ( rowid_64 ) cast to spotid_t as the alternative
383 */
384 extern column INSDC:SRA:spotid_t SPOT_ID
385 = < INSDC:SRA:spotid_t > add_row_id ( .SPOT_ID )
386 | cast ( rowid_64 );
387 I64 rowid_64 = row_id ();
388
    // physical storage for SPOT_ID, declared with izip_encoding
    // NOTE(review): sub_row_id on write mirrors add_row_id on read, so the stored
    // value is presumably the spot id relative to the row id - confirm
389 physical column < INSDC:SRA:spotid_t > izip_encoding .SPOT_ID
390 = < INSDC:SRA:spotid_t > sub_row_id ( SPOT_ID );
391
392
393 /* SPOT_GROUP
394 * a name denoting group membership, ''
395 * used for "barcode" support
    *
    * alternatives in listed order: out_spot_group, the physical
    * .SPOT_GROUP column, then a constant empty string ''
396 */
397 extern column ascii SPOT_GROUP
398 = out_spot_group
399 | .SPOT_GROUP
400 | < ascii > echo < '' > ();
401
    // input tap that feeds the physical column below
402 ascii in_spot_group = SPOT_GROUP;
403
    // physical storage for SPOT_GROUP, declared with zip_encoding
    // parameterized by Z_DEFAULT_STRATEGY and Z_BEST_SPEED
404 physical column
405 < ascii > zip_encoding < Z_DEFAULT_STRATEGY, Z_BEST_SPEED > .SPOT_GROUP = in_spot_group;
406
407
408 /* INSDC:tbl:sequence inherited virtual productions
409 * cs_native
410 * in_cs_key
411 * out_cs_key
412 * out_signal
413 * in_dna_text
414 * out_2cs_bin
415 * out_2na_bin
416 * out_4na_bin
417 * out_dna_text
418 * out_x2cs_bin
419 * out_x2na_bin
420 * in_color_text
421 * out_2cs_packed
422 * out_2na_packed
423 * out_4na_packed
424 * out_color_text
425 * out_qual_phred
426 * out_color_matrix
427 */
428
429 /* INSDC:SRA:tbl:spotcoord inherited virtual productions
430 * out_x_coord
431 * out_y_coord
432 */
433
434 /* INSDC:SRA:tbl:spotname inherited virtual productions
435 * out_name_fmt
436 * out_spot_name
437 * spot_ids_found
438 */
439
440 /* INSDC:SRA:tbl:spotdesc inherited productions
441 * trim_len
442 * out_label
443 * out_nreads
444 * trim_start
445 * out_read_len
446 * out_label_len
447 * out_rd_filter
448 * out_read_type
449 * out_read_start
450 * out_label_start
451 * static_fixed_spot_len
452 */
453
454 /* INSDC:SRA:tbl:stats inherited productions
455 * base_count
456 * spot_count
457 * max_spot_id
458 * min_spot_id
459 * in_stats_bin
460 * bio_base_count
    * cmp_base_count
461 */
462
463 /* INSDC:SRA:tbl:sra productions
464 * out_platform
465 * platform_name
466 */
467 };