Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/insdc/sra.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * INSDC Sequence Read Archive schema | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'insdc/seq.vschema'; | |
| 33 | |
| 34 | |
| 35 /*-------------------------------------------------------------------------- | |
| 36 * types | |
| 37 */ | |
| 38 | |
| 39 /* spotid_t | |
| 40 * unique id given to every spot, stored as U32 | |
| 41 */ | |
| 42 typedef U32 INSDC:SRA:spotid_t; | |
| 43 | |
| 44 | |
| 45 /* spot_ids_found - U64 [ 4 ]; type of the SPOT_IDS_FOUND column ( meaning of the 4 elements is not documented here ) | |
| 46 */ | |
| 47 typedef U64 INSDC:SRA:spot_ids_found [ 4 ]; | |
| 48 | |
| 49 | |
| 50 /*-------------------------------------------------------------------------- | |
| 51 * functions | |
| 52 */ | |
| 53 | |
| 54 | |
| 55 /* format_spot_name, format_spot_name_no_coord | |
| 56 * given a name format string, X, and Y ( the _no_coord variant takes no X or Y ) | |
| 57 * produce a reconstructed spot name string | |
| 58 * | |
| 59 * "name_fmt" [ DATA ] - name format string ( see SYNOPSIS below ) | |
| 60 * | |
| 61 * "X" [ DATA ] - X coordinate for spot | |
| 62 * | |
| 63 * "Y" [ DATA ] - Y coordinate for spot | |
| 64 * | |
| 65 * "spot_name" [ DATA, OPTIONAL ] - potential source of unformatted names ( listed after '*' in the signatures ) | |
| 66 * | |
| 67 * SYNOPSIS: | |
| 68 * "name_fmt" may have any ASCII characters | |
| 69 * the special character '$' is an escape symbol | |
| 70 * when followed by a recognized format character ( not enumerated in this file ), | |
| 71 * both the '$' and its format character will be | |
| 72 * replaced with a numeral generated from X and/or Y. | |
| 73 * | |
| 74 * when "spot_name" is present and the "name_fmt" row is empty, | |
| 75 * output is taken verbatim from "spot_name" | |
| 76 */ | |
| 77 function | |
| 78 ascii INSDC:SRA:format_spot_name #1 ( ascii name_fmt , I32 X , I32 Y * ascii spot_name ); | |
| 79 | |
| 80 function | |
| 81 ascii INSDC:SRA:format_spot_name_no_coord #1 ( ascii name_fmt * ascii spot_name ); | |
| 82 | |
| 83 | |
| 84 /*-------------------------------------------------------------------------- | |
| 85 * spotcoord | |
| 86 * spot coordinate table | |
| 87 * gives X and Y and potentially other common coordinates | |
| 88 */ | |
| 89 table INSDC:SRA:tbl:spotcoord #1 | |
| 90 { | |
| 91 /* X, Y | |
| 92 * 32 ( or 16 ) bit coordinates within plate region; the default columns are INSDC:coord:val, the U16 ones are read-only | |
| 93 * the coordinate system ( zero or one-based ) is unspecified | |
| 94 */ | |
| 95 extern default column INSDC:coord:val X = out_x_coord; | |
| 96 extern default column INSDC:coord:val Y = out_y_coord; | |
| 97 | |
| 98 // backward compatibility for 16-bit unsigned coordinates: casts of the clipped productions defined below | |
| 99 extern readonly column U16 X = cast ( x_clip_U16 ); | |
| 100 extern readonly column U16 Y = cast ( y_clip_U16 ); | |
| 101 | |
| 102 // clip signed 32-bit coordinates to unsigned 16-bit, i.e. to the bounds 0 .. 0xFFFF passed to clip | |
| 103 INSDC:coord:val x_clip_U16 | |
| 104 = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_x_coord ); | |
| 105 INSDC:coord:val y_clip_U16 | |
| 106 = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_y_coord ); | |
| 107 | |
| 108 | |
| 109 /* INSDC:SRA:tbl:spotcoord virtual productions ( referenced above, not defined in this table ) | |
| 110 * out_x_coord | |
| 111 * out_y_coord | |
| 112 */ | |
| 113 }; | |
| 114 | |
| 115 | |
| 116 /*-------------------------------------------------------------------------- | |
| 117 * spotname | |
| 118 * spot name table | |
| 119 * the name column is normally indexed | |
| 120 * | |
| 121 * history: | |
| 122 * 1.0.1 - split X and Y into spotcoord table | |
| 123 */ | |
| 124 table INSDC:SRA:tbl:spotname #1.0.1 = INSDC:SRA:tbl:spotcoord #1 | |
| 125 { | |
| 126 /* NAME | |
| 127 * external name for spot, produced by the _out_name rule below | |
| 128 */ | |
| 129 extern column ascii NAME = _out_name; | |
| 130 | |
| 131 | |
| 132 /* SPOT_IDS_FOUND | |
| 133 * lookup by NAME column; read-only, value comes from the spot_ids_found production | |
| 134 */ | |
| 135 readonly column INSDC:SRA:spot_ids_found SPOT_IDS_FOUND | |
| 136 = spot_ids_found; | |
| 137 | |
| 138 | |
| 139 /* default rules */ | |
| 140 | |
| 141 // assemble NAME column output in order of preference: format with X, Y and spot name; format with X and Y; format without coordinates; raw out_spot_name | |
| 142 ascii _out_name | |
| 143 = INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord, out_spot_name ) | |
| 144 | INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord ) | |
| 145 | INSDC:SRA:format_spot_name_no_coord (out_name_fmt) | |
| 146 | out_spot_name; | |
| 147 | |
| 148 | |
| 149 /* INSDC:SRA:tbl:spotcoord inherited virtual productions | |
| 150 * out_x_coord | |
| 151 * out_y_coord | |
| 152 */ | |
| 153 | |
| 154 /* INSDC:SRA:tbl:spotname virtual productions ( referenced above, not defined in this table ) | |
| 155 * out_name_fmt | |
| 156 * out_spot_name | |
| 157 * spot_ids_found | |
| 158 */ | |
| 159 }; | |
| 160 | |
| 161 | |
| 162 /*-------------------------------------------------------------------------- | |
| 163 * spotdesc | |
| 164 * spot descriptor table | |
| 165 * | |
| 166 * history: | |
| 167 * 1.0.1 - base explicitly upon sequence #1.0.1 | |
| 168 * 1.0.2 - added alternate taps for in_read_type and in_read_len | |
| 169 */ | |
| 170 table INSDC:SRA:tbl:spotdesc #1.0.2 = INSDC:tbl:sequence #1.0.1 | |
| 171 { | |
| 172 /* NREADS | |
| 173 * describes the number of reads within spot | |
| 174 */ | |
| 175 extern column U8 NREADS = out_nreads; | |
| 176 | |
| 177 | |
| 178 /* SPOT_LEN | |
| 179 * length of sequence | |
| 180 * FIXED_SPOT_LEN | |
| 181 * non-zero if sequence length is fixed throughout table | |
| 182 */ | |
| 183 readonly column INSDC:coord:len SPOT_LEN = spot_len; | |
| 184 readonly column INSDC:coord:len FIXED_SPOT_LEN = fixed_spot_len; | |
| 185 | |
| 186 | |
| 187 /* TRIM_START | |
| 188 * TRIM_LEN | |
| 189 * define the spot segment after applying trimming | |
| 190 * trimming may be based upon technical segments and read quality | |
| 191 */ | |
| 192 readonly column INSDC:coord:zero TRIM_START | |
| 193 = trim_start | |
| 194 | < INSDC:coord:zero> echo < 0 > (); // fallback: zero-based start of 0 | |
| 195 readonly column INSDC:coord:one TRIM_START | |
| 196 = ( INSDC:coord:one ) < I32 > sum < 1 > ( trim_start ) | |
| 197 | < INSDC:coord:one> echo < 1 > (); // fallback: one-based start of 1 | |
| 198 readonly column INSDC:coord:len TRIM_LEN | |
| 199 = trim_len | |
| 200 | spot_len; // fallback: the whole spot length | |
| 201 | |
| 202 | |
| 203 /* LABEL | |
| 204 * LABEL_START, LABEL_LEN | |
| 205 * column pair for writing read labels | |
| 206 * the label text for all reads is concatenated to form the LABEL row | |
| 207 * starting coordinates and lengths delineate labels by read | |
| 208 * | |
| 209 * NB - row length for LABEL_START/LEN === NREADS, | |
| 210 * row length for LABEL === SUM ( LABEL_LEN [ n ] ) for NREADS | |
| 211 */ | |
| 212 extern column ascii LABEL = out_label; | |
| 213 extern column INSDC:coord:zero LABEL_START = out_label_start; | |
| 214 extern column INSDC:coord:len LABEL_LEN = out_label_len; | |
| 215 | |
| 216 // 16-bit versions ( read-only casts of the same productions ) | |
| 217 readonly column U16 LABEL_START = cast ( out_label_start ); | |
| 218 readonly column U16 LABEL_LEN = cast ( out_label_len ); | |
| 219 | |
| 220 | |
| 221 /* READ_TYPE | |
| 222 * binary values giving type of a read | |
| 223 * | |
| 224 * NB - row length === NREADS | |
| 225 */ | |
| 226 extern default column INSDC:SRA:xread_type READ_TYPE = out_read_type; | |
| 227 | |
| 228 INSDC:SRA:xread_type in_read_type | |
| 229 = READ_TYPE | |
| 230 | _alt_in_read_type; // alternate tap ( see history 1.0.2 ) | |
| 231 | |
| 232 readonly column INSDC:SRA:read_type READ_TYPE | |
| 233 = out_read_type | |
| 234 | < INSDC:SRA:xread_type, INSDC:SRA:read_type > map < [ 0,1,2,3,4,5,6,7 ], [ 0,1,0,1,0,1,0,1 ] > ( out_read_type ); // maps xread_type 0..7 onto 0 / 1 ( even -> 0, odd -> 1 ) | |
| 235 | |
| 236 | |
| 237 /* READ_START | |
| 238 * READ_LEN | |
| 239 * define starting coordinates and length of read segments | |
| 240 * | |
| 241 * NB - row length === NREADS | |
| 242 */ | |
| 243 extern default column INSDC:coord:zero READ_START | |
| 244 = out_read_start; | |
| 245 extern column INSDC:coord:one READ_START | |
| 246 = ( INSDC:coord:one ) < I32 > sum < 1 > ( out_read_start ); // one-based: zero-based start + 1 | |
| 247 extern column INSDC:coord:len READ_LEN = out_read_len; | |
| 248 | |
| 249 // 16-bit versions ( read-only casts of the same productions ) | |
| 250 readonly column U16 READ_START = cast ( out_read_start ); | |
| 251 readonly column U16 READ_LEN = cast ( out_read_len ); | |
| 252 | |
| 253 INSDC:coord:len in_read_len | |
| 254 = READ_LEN | |
| 255 | _alt_in_read_len; // alternate tap ( see history 1.0.2 ) | |
| 256 | |
| 257 | |
| 258 /* READ_FILTER | |
| 259 * bits indicate usability of sequence | |
| 260 * always available: falls back to SRA_READ_FILTER_PASS when out_rd_filter cannot be resolved | |
| 261 */ | |
| 262 extern column INSDC:SRA:read_filter READ_FILTER | |
| 263 = out_rd_filter | |
| 264 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_start ); | |
| 265 | |
| 266 // RD_FILTER - only available if physical column is present ( no echo fallback, unlike READ_FILTER ) | |
| 267 extern readonly column INSDC:SRA:read_filter RD_FILTER = out_rd_filter; | |
| 268 | |
| 269 | |
| 270 /* spot_len and fixed_spot_len are used internally, behind the SPOT_LEN / FIXED_SPOT_LEN columns */ | |
| 271 INSDC:coord:len spot_len | |
| 272 = base_space_spot_len | |
| 273 | color_space_spot_len | |
| 274 | align_spot_len; | |
| 275 INSDC:coord:len fixed_spot_len | |
| 276 = static_fixed_spot_len | |
| 277 | base_space_fixed_spot_len | |
| 278 | color_space_fixed_spot_len; | |
| 279 | |
| 280 | |
| 281 /* INSDC:tbl:sequence inherited virtual productions | |
| 282 * out_2cs_packed | |
| 283 * out_2na_packed | |
| 284 */ | |
| 285 | |
| 286 /* INSDC:SRA:tbl:spotdesc productions ( referenced above, not defined in this table ) | |
| 287 * trim_len | |
| 288 * out_label | |
| 289 * out_nreads | |
| 290 * trim_start | |
| 291 * out_read_len | |
| 292 * out_label_len | |
| 293 * out_rd_filter | |
| 294 * out_read_type | |
| 295 * out_read_start | |
| 296 * out_label_start | |
| 297 * static_fixed_spot_len | |
| 298 */ | |
| 299 }; | |
| 300 | |
| 301 /*-------------------------------------------------------------------------- | |
| 302 * stats | |
| 303 * run and spot-group statistics | |
| 304 * | |
| 305 * history: | |
| 306 * 1.1.0 - added CMP_BASE_COUNT | |
| 307 */ | |
| 308 table INSDC:SRA:tbl:stats #1.1 | |
| 309 { | |
| 310 readonly column INSDC:SRA:spotid_t MIN_SPOT_ID | |
| 311 = min_spot_id | |
| 312 | < INSDC:SRA:spotid_t > echo < 1 > (); // fallback: 1 | |
| 313 readonly column INSDC:SRA:spotid_t MAX_SPOT_ID | |
| 314 = max_spot_id | |
| 315 | cast ( spot_count ); // fallback: spot_count cast to spotid_t | |
| 316 readonly column U64 | |
| 317 SPOT_COUNT = spot_count; | |
| 318 readonly column U64 | |
| 319 BIO_BASE_COUNT = bio_base_count; | |
| 320 readonly column U64 | |
| 321 BIO_BASE_COUNT = bio_base_count; | |
| 322 readonly column U64 CMP_BASE_COUNT | |
| 323 = cmp_base_count | |
| 324 | base_count; // fallback: base_count | |
| 325 | |
| 326 U8 stats_dummy = in_stats_bin; // NOTE(review): purpose not evident here - presumably forces in_stats_bin to be resolved; confirm | |
| 327 | |
| 328 /* INSDC:SRA:tbl:stats productions ( referenced above, not defined in this table ) | |
| 329 * base_count | |
| 330 * spot_count | |
| 331 * max_spot_id | |
| 332 * min_spot_id | |
| 333 * in_stats_bin | |
| 334 * bio_base_count | |
| 335 * cmp_base_count | |
| 336 */ | |
| 337 }; | |
| 338 | |
| 339 /*-------------------------------------------------------------------------- | |
| 340 * sra | |
| 341 * the INSDC SRA table | |
| 342 * | |
| 343 * history: | |
| 344 * 1.0.1 - base explicitly upon spotname #1.0.1 | |
| 345 * 1.0.2 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 | |
| 346 * 1.0.3 - base upon spotdesc #1.0.2 | |
| 347 */ | |
| 348 | |
| 349 // platform constants from <insdc/sra.h>; platform_id is the type of the non-text PLATFORM column of INSDC:SRA:tbl:sra | |
| 350 typedef U8 INSDC:SRA:platform_id; | |
| 351 const INSDC:SRA:platform_id SRA_PLATFORM_UNDEFINED = 0; | |
| 352 const INSDC:SRA:platform_id SRA_PLATFORM_454 = 1; | |
| 353 const INSDC:SRA:platform_id SRA_PLATFORM_ILLUMINA = 2; | |
| 354 const INSDC:SRA:platform_id SRA_PLATFORM_ABSOLID = 3; | |
| 355 const INSDC:SRA:platform_id SRA_PLATFORM_COMPLETE_GENOMICS = 4; | |
| 356 const INSDC:SRA:platform_id SRA_PLATFORM_HELICOS = 5; | |
| 357 const INSDC:SRA:platform_id SRA_PLATFORM_PACBIO_SMRT = 6; | |
| 358 const INSDC:SRA:platform_id SRA_PLATFORM_ION_TORRENT = 7; | |
| 359 const INSDC:SRA:platform_id SRA_PLATFORM_CAPILLARY = 8; | |
| 360 const INSDC:SRA:platform_id SRA_PLATFORM_OXFORD_NANOPORE = 9; | |
| 361 | |
| 362 table INSDC:SRA:tbl:sra #1.0.3 = | |
| 363 INSDC:tbl:sequence #1.0.1, INSDC:SRA:tbl:spotname #1.0.1, | |
| 364 INSDC:SRA:tbl:spotdesc #1.0.2, INSDC:SRA:tbl:stats #1.1.0 | |
| 365 { | |
| 366 /* PLATFORM | |
| 367 * platform description | |
| 368 * one version returns a constant defined above | |
| 369 * while the other returns a textual representation | |
| 370 */ | |
| 371 extern column INSDC:SRA:platform_id PLATFORM | |
| 372 = .PLATFORM | |
| 373 | out_platform; // fallback when the physical .PLATFORM column cannot be resolved | |
| 374 readonly column ascii PLATFORM | |
| 375 = platform_name; | |
| 376 | |
| 377 physical column | |
| 378 < INSDC:SRA:platform_id > zip_encoding .PLATFORM = PLATFORM; | |
| 379 | |
| 380 | |
| 381 /* SPOT_ID | |
| 382 * reports spot id of current row | |
| 383 */ | |
| 384 extern column INSDC:SRA:spotid_t SPOT_ID | |
| 385 = < INSDC:SRA:spotid_t > add_row_id ( .SPOT_ID ) | |
| 386 | cast ( rowid_64 ); // fallback: the row id itself, cast to spotid_t | |
| 387 I64 rowid_64 = row_id (); | |
| 388 | |
| 389 physical column < INSDC:SRA:spotid_t > izip_encoding .SPOT_ID | |
| 390 = < INSDC:SRA:spotid_t > sub_row_id ( SPOT_ID ); | |
| 391 | |
| 392 | |
| 393 /* SPOT_GROUP | |
| 394 * a name denoting group membership, '' ( empty ) when no other source resolves | |
| 395 * used for "barcode" support | |
| 396 */ | |
| 397 extern column ascii SPOT_GROUP | |
| 398 = out_spot_group | |
| 399 | .SPOT_GROUP | |
| 400 | < ascii > echo < '' > (); | |
| 401 | |
| 402 ascii in_spot_group = SPOT_GROUP; | |
| 403 | |
| 404 physical column | |
| 405 < ascii > zip_encoding < Z_DEFAULT_STRATEGY, Z_BEST_SPEED > .SPOT_GROUP = in_spot_group; | |
| 406 | |
| 407 | |
| 408 /* INSDC:tbl:sequence inherited virtual productions | |
| 409 * cs_native | |
| 410 * in_cs_key | |
| 411 * out_cs_key | |
| 412 * out_signal | |
| 413 * in_dna_text | |
| 414 * out_2cs_bin | |
| 415 * out_2na_bin | |
| 416 * out_4na_bin | |
| 417 * out_dna_text | |
| 418 * out_x2cs_bin | |
| 419 * out_x2na_bin | |
| 420 * in_color_text | |
| 421 * out_2cs_packed | |
| 422 * out_2na_packed | |
| 423 * out_4na_packed | |
| 424 * out_color_text | |
| 425 * out_qual_phred | |
| 426 * out_color_matrix | |
| 427 */ | |
| 428 | |
| 429 /* INSDC:SRA:tbl:spotcoord inherited virtual productions | |
| 430 * out_x_coord | |
| 431 * out_y_coord | |
| 432 */ | |
| 433 | |
| 434 /* INSDC:SRA:tbl:spotname inherited virtual productions | |
| 435 * out_name_fmt | |
| 436 * out_spot_name | |
| 437 * spot_ids_found | |
| 438 */ | |
| 439 | |
| 440 /* INSDC:SRA:tbl:spotdesc inherited productions | |
| 441 * trim_len | |
| 442 * out_label | |
| 443 * out_nreads | |
| 444 * trim_start | |
| 445 * out_read_len | |
| 446 * out_label_len | |
| 447 * out_rd_filter | |
| 448 * out_read_type | |
| 449 * out_read_start | |
| 450 * out_label_start | |
| 451 * static_fixed_spot_len | |
| 452 */ | |
| 453 | |
| 454 /* INSDC:SRA:tbl:stats inherited productions | |
| 455 * base_count | |
| 456 * spot_count | |
| 457 * max_spot_id | |
| 458 * min_spot_id | |
| 459 * in_stats_bin | |
| 460 * bio_base_count | |
| 461 * cmp_base_count */ | |
| 462 | |
| 463 /* INSDC:SRA:tbl:sra productions ( referenced above, not defined in this table ) | |
| 464 * out_platform | |
| 465 * platform_name | |
| 466 */ | |
| 467 }; |
