Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/schema/sra/illumina.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/sra/illumina.vschema Mon Nov 27 11:21:07 2017 -0500 @@ -0,0 +1,408 @@ +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. 
+* +* =========================================================================== +* +*/ + +/*========================================================================== + * NCBI Illumina Sequence Read Archive schema + */ +version 1; + +include 'ncbi/sra.vschema'; +include 'ncbi/spotname.vschema'; + + +/*-------------------------------------------------------------------------- + * types + */ + +typedef INSDC:quality:log_odds NCBI:qual4 [ 4 ]; +typedef NCBI:qual4 NCBI:SRA:rotated_qual4, NCBI:SRA:swapped_qual4; + + +/*-------------------------------------------------------------------------- + * functions + */ + +/* tokenize_spot_name + * scans name on input + * tokenizes into parts + */ +extern function NCBI:SRA:spot_name_token + NCBI:SRA:Illumina:tokenize_spot_name #1 ( ascii name ); + + +/*-------------------------------------------------------------------------- + * NCBI:SRA:Illumina:qual4 + * 4-channel log-odds-ish quality + */ + +/* history: + * 1.0.1 - base explicitly upon updated ancestry + */ +table NCBI:SRA:Illumina:qual4_nocol #1.0.1 + = INSDC:tbl:sequence #1.0.1 + , NCBI:tbl:log_odds_quality_nocol #1.0.1 +{ + /* QUALITY + * 4-channel quality column + */ + readonly column NCBI:qual4 QUALITY = out_qual4; + + NCBI:qual4 out_qual4 + = < NCBI:qual4 > NCBI:SRA:swap ( out_qual4_swapped, read_unpack ) + | < NCBI:qual4 > NCBI:SRA:rotate < false > ( out_qual4_rotated, read_unpack ); + + + /* single-channel output + * convert 4-channel log-odds to single channel + * must retain n-encoding, which was intended to be the 4-channel pattern + * ( -5, -5, -5, -5 ) and a base of 'A' + */ + + // first, extract quality for called base + INSDC:quality:log_odds out_qual1_ch0 + = < INSDC:quality:log_odds> cut < 0 > ( out_qual4_swapped ) + | < INSDC:quality:log_odds> cut < 0 > ( out_qual4_rotated ); + + // clip it to the range [ -5, 127 ] + INSDC:quality:log_odds out_qual1_clip + = < INSDC:quality:log_odds > clip < -5, 127 > ( out_qual1_ch0 ); + + // convert 4 channel to single 32-bit 
value + U32 out_qual4_32 + = redimension ( out_qual4_swapped ) + | redimension ( out_qual4_rotated ); + + // detect ( -5, -5, -5, -5 ), i.e. four 0xFB bytes viewed as one U32, and introduce a -6 value into log-odds + // this is treated as an 'N', but still not ready + INSDC:quality:log_odds out_qual1_fives + = < U32, INSDC:quality:log_odds > map < 0xFBFBFBFB, -6 > ( out_qual4_32, out_qual1_clip ); + + // now slam zeros into anything that doesn't correspond to an A + // essentially this leaves all of the A qualities. any having -6 are really N. + INSDC:quality:log_odds out_qual1_n + = < U8, INSDC:quality:log_odds > map < [ 1, 2, 3 ], [ 0, 0, 0 ] > ( read_unpack, out_qual1_fives ); + + // finally, produce log-odds with n-encoded as -6 + INSDC:quality:log_odds out_qual_log_odds + = < INSDC:quality:log_odds, INSDC:quality:log_odds > map < -6, -6 > ( out_qual1_n, out_qual1_clip ); + + + /* NCBI:tbl:n_encoding inherited productions + * read_unpack + */ + + /* NCBI:SRA:Illumina:qual4_nocol productions + * out_qual4_rotated + * out_qual4_swapped + */ +}; + + +/* 4-channel log-odds compression + */ + +// encoded type - a single byte code for 4-channel pattern +typedef B8 NCBI:SRA:encoded_qual4; + +// decoding function +extern function +NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_decode #1 ( NCBI:SRA:encoded_qual4 in ); + +// encoding function +extern function +NCBI:SRA:encoded_qual4 NCBI:SRA:qual4_encode #1 ( NCBI:SRA:swapped_qual4 in ); + +// compression rules +physical NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_encoding #1 +{ + encode + { + // produce codes + NCBI:SRA:encoded_qual4 encoded = NCBI:SRA:qual4_encode ( @ ); + + // compress the codes with zip < Z_RLE, Z_BEST_SPEED > + return zip < Z_RLE, Z_BEST_SPEED > ( encoded ); + } + + decode + { + // undo the zip step to recover the codes + NCBI:SRA:encoded_qual4 unzipped = unzip ( @ ); + + // decode the codes back to swapped 4-channel quality + return NCBI:SRA:qual4_decode ( unzipped ); + } +} + +/* history: + * 1.0.1 - base upon updated qual4_nocol + */ +table NCBI:SRA:Illumina:qual4 #1.0.1 = NCBI:SRA:Illumina:qual4_nocol #1.0.1 +{ + // read directly as swapped, n-encoded 
log_odds + NCBI:SRA:swapped_qual4 out_qual4_swapped = .QUALITY; + + /* NCBI:tbl:n_encoding inherited virtual productions + * read_unpack + */ +}; + +/* history: + * 2.0.2 - base upon updated ancestry + * 2.0.3 - base upon updated ancestry + * 2.0.4 - base upon updated ancestry + * 2.1.0 - base upon updated ancestry, added in_qual_log_odds + */ +table NCBI:SRA:Illumina:qual4 #2.1.0 + = NCBI:tbl:base_space #2.0.3 + , NCBI:tbl:log_odds_quality_nocol #2.1.0 +{ + /* QUALITY + * 4-channel log-odds + */ + extern column NCBI:qual4 QUALITY = out_qual4; + + NCBI:SRA:swapped_qual4 in_qual4 + = ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_x2na_bin ) + | ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_2na_bin ); + + NCBI:qual4 out_qual4 + = < NCBI:SRA:swapped_qual4 > NCBI:SRA:swap ( .QUALITY, out_x2na_bin ); + + physical column NCBI:SRA:qual4_encoding .QUALITY = in_qual4; + + // feed to compressed statistics + NCBI:qual4 in_stats_qual = in_qual4; + + // single channel: cut element 0 out of the swapped 4-channel value + INSDC:quality:log_odds in_qual_log_odds + = < INSDC:quality:log_odds > cut < 0 > ( in_qual4 ); + INSDC:quality:log_odds out_qual_log_odds + = < INSDC:quality:log_odds > cut < 0 > ( .QUALITY ); +}; + + +/*-------------------------------------------------------------------------- + * NCBI:SRA:Illumina + * Illumina SRA Platform + */ + + +/* NCBI:SRA:Illumina:common #1 + * basic table interface based upon Illumina's pipelines + * + * history: + * 1.0.1 - explicitly base upon sra #1.0.1 + * 1.0.2 - base explicitly upon sra #1.0.2 + * 1.0.3 - base explicitly upon sra #1.0.3 + */ +table NCBI:SRA:Illumina:common #1.0.3 = INSDC:SRA:tbl:sra #1.0.3 +{ + // platform name is always 'ILLUMINA' + ascii platform_name + = < ascii > echo < "ILLUMINA" > (); + + /* TRIMMED SEQUENCE + * need to find the 0-based trim_start and trim_len + */ + INSDC:coord:zero bio_start = NCBI:SRA:bio_start ( out_read_start, out_read_type ); + INSDC:coord:zero trim_start = bio_start; + U32 trim_left = ( 
U32 ) trim_start; + INSDC:coord:len trim_len = (INSDC:coord:len) < U32 > diff ( spot_len, trim_left ); + + /* COORDINATES + * in addition to X and Y, + * Illumina has LANE and TILE + */ + readonly column INSDC:coord:val LANE = out_lane_coord; + readonly column INSDC:coord:val TILE = out_tile_coord; +}; + + +/*-------------------------------------------------------------------------- + * NCBI:SRA:Illumina:tbl:v2 #1 + * normalized v2 table + * still has variants based upon quality type + * + * history: + * 1.0.1 - explicitly base upon sra #1.0.1 and related tables + * 1.0.2 - updated ancestry + * 1.0.3 - updated ancestry + */ + +physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:SIGNAL #2 +{ + decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); } + encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } +} + +physical NCBI:fsamp4 NCBI:SRA:Illumina:encoding:NOISE #2 +{ + decode + { + F32 dcmp = funzip ( @ ); + return redimension ( dcmp ); + } + encode + { + F32 ncmp = redimension ( @ ); + return fzip < 10 > ( ncmp ); + } +} + +physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:INTENSITY #2 +{ + decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); } + encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } +} + +// v2 base table - declared at #1.0.4; the history list above stops at 1.0.3 +table NCBI:SRA:Illumina:tbl:v2 #1.0.4 + = NCBI:SRA:tbl:sra #2.1.3 + , NCBI:tbl:base_space #2.0.3 + , NCBI:SRA:Illumina:common #1.0.3 +{ + /* NAME tokenizing and coordinates + * most work happens in skeyname table + * we still obtain LANE and TILE from name + */ + INSDC:coord:val out_lane_coord = ( INSDC:coord:val ) + NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( _out_name, out_spot_name_tok ); + INSDC:coord:val out_tile_coord = ( INSDC:coord:val ) + NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok ); + NCBI:SRA:spot_name_token out_spot_name_tok + = NCBI:SRA:Illumina:tokenize_spot_name ( _out_name ); + + NCBI:SRA:spot_name_token in_spot_name_tok + = 
NCBI:SRA:Illumina:tokenize_spot_name ( NAME ); + + /* SIGNAL + * optional, no longer archived + */ + extern column NCBI:fsamp4 SIGNAL + { + read = out_signal; + validate = < NCBI:fsamp4 > no_compare #1 ( in_signal, out_signal ); + } + NCBI:fsamp4 in_signal = SIGNAL; + NCBI:fsamp4 out_signal + = < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .SIGNAL, out_x2na_bin ); + + physical column NCBI:SRA:Illumina:encoding:SIGNAL #2 .SIGNAL + = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_x2na_bin ) + | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_2na_bin ); + + /* NOISE + * optional, no longer archived + */ + extern column NCBI:fsamp4 NOISE + { + read = out_noise; + validate = < NCBI:fsamp4 > no_compare #1 ( in_noise, out_noise ); + } + NCBI:fsamp4 in_noise = NOISE; + NCBI:fsamp4 out_noise = .NOISE; + + physical column NCBI:SRA:Illumina:encoding:NOISE #2 .NOISE = in_noise; + + /* INTENSITY + * optional, no longer archived; written as normalize then swap, read back as swap then denormalize + */ + extern column NCBI:fsamp4 INTENSITY + { + read = out_intensity; + validate = < NCBI:fsamp4 > no_compare #1 ( in_intensity, out_intensity ); + } + NCBI:fsamp4 in_intensity = INTENSITY; + NCBI:fsamp4 out_intensity + = < NCBI:fsamp4 > NCBI:SRA:denormalize ( out_norm_intensity, out_x2na_bin ); + NCBI:fsamp4 out_norm_intensity + = ( NCBI:fsamp4 ) < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .INTENSITY, out_x2na_bin ); + NCBI:fsamp4 in_norm_intensity + = < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_x2na_bin ) + | < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_2na_bin ); + physical column NCBI:SRA:Illumina:encoding:INTENSITY #2 .INTENSITY + = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_x2na_bin ) + | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_2na_bin ); + + /* INSDC:tbl:sequence inherited virtual productions + * out_qual_phred + */ + + /* INSDC:SRA:tbl:spotdesc inherited productions + * static_fixed_spot_len + */ 
+}; + +/* 4-channel log-odds qualities + * + * history: + * 1.0.2 - updated ancestry + * 1.0.3 - updated ancestry + * 1.0.4 - updated ancestry + * 1.1.0 - updated ancestry + */ +table NCBI:SRA:Illumina:tbl:q4:v2 #1.1.0 + = NCBI:SRA:Illumina:tbl:v2 #1.0.4 + , NCBI:SRA:Illumina:qual4 #2.1.0 +{ + /* INSDC:SRA:tbl:spotdesc inherited virtual productions + * static_fixed_spot_len + */ +}; + +/* 1-channel log-odds qualities + * + * history: + * 1.0.2 - updated ancestry + * 1.0.3 - updated ancestry + * 1.0.4 - updated ancestry + * 1.1.0 - updated ancestry (NOTE(review): the declaration below writes this as #1.1 - presumably the same version; confirm) + */ +table NCBI:SRA:Illumina:tbl:q1:v2 #1.1 + = NCBI:SRA:Illumina:tbl:v2 #1.0.4 + , NCBI:tbl:log_odds_quality #2.1.0 +{ + /* INSDC:SRA:tbl:spotdesc inherited productions + * static_fixed_spot_len + */ +}; + +/* phred qualities + * + * history: + * 1.0.2 - updated ancestry + * 1.0.3 - updated ancestry + * 1.0.4 - updated ancestry + */ +table NCBI:SRA:Illumina:tbl:phred:v2 #1.0.4 + = NCBI:SRA:Illumina:tbl:v2 #1.0.4 + , NCBI:tbl:phred_quality #2.0.3 +{ + /* INSDC:SRA:tbl:spotdesc inherited virtual productions + * static_fixed_spot_len + */ +};