Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/schema/insdc/sra.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/insdc/sra.vschema Mon Nov 27 11:21:07 2017 -0500 @@ -0,0 +1,467 @@ +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. +* +* =========================================================================== +* +*/ + +/*========================================================================== + * INSDC Sequence Read Archive schema + */ +version 1; + +include 'insdc/seq.vschema'; + + +/*-------------------------------------------------------------------------- + * types + */ + +/* spotid_t + * unique id given to every spot + */ +typedef U32 INSDC:SRA:spotid_t; + + +/* spot_ids_found + */ +typedef U64 INSDC:SRA:spot_ids_found [ 4 ]; + + +/*-------------------------------------------------------------------------- + * functions + */ + + +/* format_spot_name + * given a name format string, X, and Y + * produce a reconstructed spot name string + * + * "name_fmt" [ DATA ] - name format string ( see format explanation below ) + * + * "X" [ DATA ] - X coordinate for spot + * + * "Y" [ DATA ] - Y coordinate for spot + * + * "spot_name" [ DATA, OPTIONAL ] - potential source of unformatted names + * + * SYNOPSIS: + * "name_fmt" may have any ASCII characters + * the special character '$' is an escape symbol + * when followed by a recognized format character, + * both the '$' and its format character will be + * replaced with a numeral generated from X and/or Y. + * + * when "spot_name" is present and the "name_fmt" row is empty, + * output is taken verbatim from "spot_name" + */ +function +ascii INSDC:SRA:format_spot_name #1 ( ascii name_fmt , I32 X , I32 Y * ascii spot_name ); + +function +ascii INSDC:SRA:format_spot_name_no_coord #1 ( ascii name_fmt * ascii spot_name ); + + +/*-------------------------------------------------------------------------- + * spotcoord + * spot coordinate table + * gives X and Y and potentially other common coordinates + */ +table INSDC:SRA:tbl:spotcoord #1 +{ + /* X, Y + * 32 ( or 16 ) bit coordinates within plate region + * the coordinate system ( zero or one-based ) is unspecified + */ + extern default column INSDC:coord:val X = out_x_coord; + extern default column INSDC:coord:val Y = out_y_coord; + + // backward compatibility for 16-bit unsigned coordinates + extern readonly column U16 X = cast ( x_clip_U16 ); + extern readonly column U16 Y = cast ( y_clip_U16 ); + + // clip signed 32-bit coordinates to unsigned 16-bit + INSDC:coord:val x_clip_U16 + = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_x_coord ); + INSDC:coord:val y_clip_U16 + = < INSDC:coord:val > clip < 0, 0xFFFF > ( out_y_coord ); + + + /* INSDC:SRA:tbl:spotcoord virtual productions + * out_x_coord + * out_y_coord + */ +}; + + +/*-------------------------------------------------------------------------- + * spotname + * spot name table + * the name column is normally indexed + * + * history: + * 1.0.1 - split X and Y into spotcoord table + */ +table INSDC:SRA:tbl:spotname #1.0.1 = INSDC:SRA:tbl:spotcoord #1 +{ + /* NAME + * external name for spot + */ + extern column ascii NAME = _out_name; + + + /* SPOT_IDS_FOUND + * lookup by NAME column + */ + readonly column INSDC:SRA:spot_ids_found SPOT_IDS_FOUND + = spot_ids_found; + + + /* default rules */ + + // assemble NAME column output in order of preference + ascii _out_name + = INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord, out_spot_name ) + | INSDC:SRA:format_spot_name ( out_name_fmt, out_x_coord, out_y_coord ) + | INSDC:SRA:format_spot_name_no_coord (out_name_fmt) + | out_spot_name; + + + /* INSDC:SRA:tbl:spotcoord inherited virtual productions + * out_x_coord + * out_y_coord + */ + + /* INSDC:SRA:tbl:spotname virtual productions + * out_name_fmt + * out_spot_name + * spot_ids_found + */ +}; + + +/*-------------------------------------------------------------------------- + * spotdesc + * spot descriptor table + * + * history: + * 1.0.1 - base explicitly upon sequence #1.0.1 + * 1.0.2 - added alternate taps for in_read_type and in_read_len + */ +table INSDC:SRA:tbl:spotdesc #1.0.2 = INSDC:tbl:sequence #1.0.1 +{ + /* NREADS + * describes the number of reads within spot + */ + extern column U8 NREADS = out_nreads; + + + /* SPOT_LEN + * length of sequence + * FIXED_SPOT_LEN + * non-zero if sequence length is fixed throughout table + */ + readonly column INSDC:coord:len SPOT_LEN = spot_len; + readonly column INSDC:coord:len FIXED_SPOT_LEN = fixed_spot_len; + + + /* TRIM_START + * TRIM_LEN + * define the spot segment after applying trimming + * trimming may be based upon technical segments and read quality + */ + readonly column INSDC:coord:zero TRIM_START + = trim_start + | < INSDC:coord:zero> echo < 0 > (); + readonly column INSDC:coord:one TRIM_START + = ( INSDC:coord:one ) < I32 > sum < 1 > ( trim_start ) + | < INSDC:coord:one> echo < 1 > (); + readonly column INSDC:coord:len TRIM_LEN + = trim_len + | spot_len; + + + /* LABEL + * LABEL_START, LABEL_LEN + * column pair for writing read labels + * the label text for all reads is concatenated to form the LABEL row + * starting coordinates and lengths delineate labels by read + * + * NB - row length for LABEL_START/LEN === NREADS, + * row length for LABEL === SUM ( LABEL_LEN [ n ] ) for NREADS + */ + extern column ascii LABEL = out_label; + extern column INSDC:coord:zero LABEL_START = out_label_start; + extern column INSDC:coord:len LABEL_LEN = out_label_len; + + // 16-bit versions + readonly column U16 LABEL_START = cast ( out_label_start ); + readonly column U16 LABEL_LEN = cast ( out_label_len ); + + + /* READ_TYPE + * binary values giving type of a read + * + * NB - row length === NREADS + */ + extern default column INSDC:SRA:xread_type READ_TYPE = out_read_type; + + INSDC:SRA:xread_type in_read_type + = READ_TYPE + | _alt_in_read_type; + + readonly column INSDC:SRA:read_type READ_TYPE + = out_read_type + | < INSDC:SRA:xread_type, INSDC:SRA:read_type > map < [ 0,1,2,3,4,5,6,7 ], [ 0,1,0,1,0,1,0,1 ] > ( out_read_type ); + + + /* READ_START + * READ_LEN + * define starting coordinates and length of read segments + * + * NB - row length === NREADS + */ + extern default column INSDC:coord:zero READ_START + = out_read_start; + extern column INSDC:coord:one READ_START + = ( INSDC:coord:one ) < I32 > sum < 1 > ( out_read_start ); + extern column INSDC:coord:len READ_LEN = out_read_len; + + // 16-bit versions + readonly column U16 READ_START = cast ( out_read_start ); + readonly column U16 READ_LEN = cast ( out_read_len ); + + INSDC:coord:len in_read_len + = READ_LEN + | _alt_in_read_len; + + + /* READ_FILTER + * bits indicate usability of sequence + * always available + */ + extern column INSDC:SRA:read_filter READ_FILTER + = out_rd_filter + | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_start ); + + // RD_FILTER - only available if physical column is present + extern readonly column INSDC:SRA:read_filter RD_FILTER = out_rd_filter; + + + /* spot_len is used internally */ + INSDC:coord:len spot_len + = base_space_spot_len + | color_space_spot_len + | align_spot_len; + INSDC:coord:len fixed_spot_len + = static_fixed_spot_len + | base_space_fixed_spot_len + | color_space_fixed_spot_len; + + + /* INSDC:tbl:sequence inherited virtual productions + * out_2cs_packed + * out_2na_packed + */ + + /* INSDC:SRA:tbl:spotdesc productions + * trim_len + * out_label + * out_nreads + * trim_start + * out_read_len + * out_label_len + * out_rd_filter + * out_read_type + * out_read_start + * out_label_start + * static_fixed_spot_len + */ +}; + +/*-------------------------------------------------------------------------- + * stats + * run and spot-group statistics + * + * history: + * 1.1.0 - added CMP_BASE_COUNT + */ +table INSDC:SRA:tbl:stats #1.1 +{ + readonly column INSDC:SRA:spotid_t MIN_SPOT_ID + = min_spot_id + | < INSDC:SRA:spotid_t > echo < 1 > (); + readonly column INSDC:SRA:spotid_t MAX_SPOT_ID + = max_spot_id + | cast ( spot_count ); + readonly column U64 + SPOT_COUNT = spot_count; + readonly column U64 + BASE_COUNT = base_count; + readonly column U64 + BIO_BASE_COUNT = bio_base_count; + readonly column U64 CMP_BASE_COUNT + = cmp_base_count + | base_count; + + U8 stats_dummy = in_stats_bin; + + /* INSDC:SRA:tbl:stats productions + * base_count + * spot_count + * max_spot_id + * min_spot_id + * in_stats_bin + * bio_base_count + * cmp_base_count + */ +}; + +/*-------------------------------------------------------------------------- + * sra + * the INSDC SRA table + * + * history: + * 1.0.1 - base explicitly upon spotname #1.0.1 + * 1.0.2 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 + * 1.0.3 - base upon spotdesc #1.0.2 + */ + +// platform constants from <insdc/sra.h> +typedef U8 INSDC:SRA:platform_id; +const INSDC:SRA:platform_id SRA_PLATFORM_UNDEFINED = 0; +const INSDC:SRA:platform_id SRA_PLATFORM_454 = 1; +const INSDC:SRA:platform_id SRA_PLATFORM_ILLUMINA = 2; +const INSDC:SRA:platform_id SRA_PLATFORM_ABSOLID = 3; +const INSDC:SRA:platform_id SRA_PLATFORM_COMPLETE_GENOMICS = 4; +const INSDC:SRA:platform_id SRA_PLATFORM_HELICOS = 5; +const INSDC:SRA:platform_id SRA_PLATFORM_PACBIO_SMRT = 6; +const INSDC:SRA:platform_id SRA_PLATFORM_ION_TORRENT = 7; +const INSDC:SRA:platform_id SRA_PLATFORM_CAPILLARY = 8; +const INSDC:SRA:platform_id SRA_PLATFORM_OXFORD_NANOPORE = 9; + +table INSDC:SRA:tbl:sra #1.0.3 = + INSDC:tbl:sequence #1.0.1, INSDC:SRA:tbl:spotname #1.0.1, + INSDC:SRA:tbl:spotdesc #1.0.2, INSDC:SRA:tbl:stats #1.1.0 +{ + /* PLATFORM + * platform description + * one version returns a constant defined above + * while the other returns a textual representation + */ + extern column INSDC:SRA:platform_id PLATFORM + = .PLATFORM + | out_platform; + readonly column ascii PLATFORM + = platform_name; + + physical column + < INSDC:SRA:platform_id > zip_encoding .PLATFORM = PLATFORM; + + + /* SPOT_ID + * reports spot id of current row + */ + extern column INSDC:SRA:spotid_t SPOT_ID + = < INSDC:SRA:spotid_t > add_row_id ( .SPOT_ID ) + | cast ( rowid_64 ); + I64 rowid_64 = row_id (); + + physical column < INSDC:SRA:spotid_t > izip_encoding .SPOT_ID + = < INSDC:SRA:spotid_t > sub_row_id ( SPOT_ID ); + + + /* SPOT_GROUP + * a name denoting group membership, '' + * used for "barcode" support + */ + extern column ascii SPOT_GROUP + = out_spot_group + | .SPOT_GROUP + | < ascii > echo < '' > (); + + ascii in_spot_group = SPOT_GROUP; + + physical column + < ascii > zip_encoding < Z_DEFAULT_STRATEGY, Z_BEST_SPEED > .SPOT_GROUP = in_spot_group; + + + /* INSDC:tbl:sequence inherited virtual productions + * cs_native + * in_cs_key + * out_cs_key + * out_signal + * in_dna_text + * out_2cs_bin + * out_2na_bin + * out_4na_bin + * out_dna_text + * out_x2cs_bin + * out_x2na_bin + * in_color_text + * out_2cs_packed + * out_2na_packed + * out_4na_packed + * out_color_text + * out_qual_phred + * out_color_matrix + */ + + /* INSDC:SRA:tbl:spotcoord inherited virtual productions + * out_x_coord + * out_y_coord + */ + + /* INSDC:SRA:tbl:spotname inherited virtual productions + * out_name_fmt + * out_spot_name + * spot_ids_found + */ + + /* INSDC:SRA:tbl:spotdesc inherited productions + * trim_len + * out_label + * out_nreads + * trim_start + * out_read_len + * out_label_len + * out_rd_filter + * out_read_type + * out_read_start + * out_label_start + * static_fixed_spot_len + */ + + /* INSDC:SRA:tbl:stats inherited productions + * base_count + * spot_count + * max_spot_id + * min_spot_id + * in_stats_bin + * bio_base_count + */ + + /* INSDC:SRA:tbl:sra productions + * out_platform + * platform_name + */ +};