Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/seq.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/seq.vschema Mon Nov 27 11:21:07 2017 -0500 @@ -0,0 +1,894 @@ +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. 
+* +* =========================================================================== +* +*/ + +/*========================================================================== + * Sequence schema implementation tables + */ +version 1; + +include 'vdb/vdb.vschema'; +include 'ncbi/ncbi.vschema'; +include 'insdc/sra.vschema'; + + +/*-------------------------------------------------------------------------- + * n_encoding - implementation + * introduces common virtual productions + */ +table NCBI:tbl:n_encoding #1 +{ + U8 n_encoding_dummy + = read_unpack + | read_ndecode; +}; + + +/*-------------------------------------------------------------------------- + * seqloc + * NCBI sequence locator table + */ +table NCBI:tbl:seqloc #1.0 +{ + /* SEQ_ID + * a FASTA-style SeqId + */ + extern column < ascii > zip_encoding SEQ_ID; + + /* SEQ_START + * provided in both 1 ( default ) and 0-based coordinates + */ + extern default column < INSDC:coord:one > izip_encoding SEQ_START; + readonly column INSDC:coord:zero SEQ_START + = ( INSDC:coord:zero ) < INSDC:coord:one > diff < 1 > ( .SEQ_START ); + + /* SEQ_LEN + */ + extern column < INSDC:coord:len > izip_encoding SEQ_LEN; +}; + + +/*-------------------------------------------------------------------------- + * base_space - implementation + * READ column rules + */ + +/* color_from_dna + * use starting keys and color matrix to convert individual reads + * to base space. + */ +extern function +INSDC:x2cs:bin NCBI:color_from_dna #1 ( INSDC:x2na:bin bin_x2na, + INSDC:coord:zero read_start, INSDC:coord:len read_len, + INSDC:dna:text cs_key, U8 color_matrix ); + + +/* dcmp_base_space + * table to introduce common virtual productions + */ +table NCBI:tbl:dcmp_base_space #1 +{ + // rules to introduce purely virtual productions + // never expected to resolve... 
+ INSDC:dna:text dcmp_virtual_productions + = out_dcmp_4na_bin + | out_dcmp_x2na_bin + | out_dcmp_2na_bin + | out_dcmp_2na_packed; +} + +/* history: + * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 + * 1.0.2 - spotdesc #1.0.2 + * 1.0.3 - base upon dcmp_base_space for "out_dcmp_2na_bin" + */ +table NCBI:tbl:base_space_common #1.0.3 + = INSDC:tbl:sequence #1.0.1 + , INSDC:SRA:tbl:spotdesc #1.0.2 + , INSDC:SRA:tbl:stats #1.1.0 + , NCBI:tbl:dcmp_base_space #1.0.0 +{ + /* INSDC:tbl:sequence inherited virtual productions + */ + + // cs_native - tells user color space is not native + bool cs_native = < bool > echo < false > (); + + // in_cs_key is not writable in base_space + + // color-space key is completely artificial + INSDC:dna:text out_cs_key + = .CS_KEY + | < INSDC:dna:text > echo < 'T' > ( out_read_type ) + | < INSDC:dna:text > echo < 'T' > ( out_read_len ) + | < INSDC:dna:text > echo < 'T' > (); + + // unambiguous synthesized 2cs + INSDC:2cs:bin out_2cs_bin + = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin ); + + // unambiguous unpacked 2na + INSDC:2na:bin out_2na_bin + = out_dcmp_2na_bin + | ( INSDC:2na:bin ) unpack ( out_2na_packed ); + + // synthesized color sequence + INSDC:x2cs:bin out_x2cs_bin + = NCBI:color_from_dna ( out_x2na_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); + + // synthesized packed 2cs + INSDC:2cs:packed out_2cs_packed + = ( INSDC:2cs:packed ) pack ( out_2cs_bin ); + + // synthesized packed 4na + INSDC:4na:packed out_4na_packed + = ( INSDC:4na:packed ) pack ( out_4na_bin ); + + // synthesized color text + INSDC:color:text out_color_text + = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); + + // published color matrix + U8 out_color_matrix + = < U8 > echo < INSDC:color:default_matrix > (); + + // spot_len and fixed_spot_len + INSDC:coord:len base_space_spot_len + = ( INSDC:coord:len ) row_len 
( out_2na_packed ); + INSDC:coord:len base_space_fixed_spot_len + = ( INSDC:coord:len ) fixed_row_len ( out_2na_packed ); + + + /* INSDC:tbl:sequence inherited productions + * out_signal + * in_dna_text + * out_4na_bin + * out_dna_text + * out_x2na_bin + * out_2na_packed + */ + + /* INSDC:SRA:tbl:stats inherited productions + * in_stats_bin + */ + + /* NCBI:tbl:dcmp_base_space inherited productions + * out_dcmp_2na_bin + * out_dcmp_4na_bin + * out_dcmp_x2na_bin + * out_dcmp_2na_packed + */ +}; + + +/* base_space_nocol + * this table describes viewing rules + * but omits writing rules and physical column description + * in order to support older tables + * + * history: + * 1.0.1 - base explicitly upon base_space_common #1.0.1 + * 1.0.2 - base explicitly upon base_space_common #1.0.2 + * 1.0.3 - " " 1.0.3 + */ +table NCBI:tbl:base_space_nocol #1.0.3 + = NCBI:tbl:base_space_common #1.0.3 + , NCBI:tbl:n_encoding #1 +{ + // incoming is disabled + + // synthesized dna text + INSDC:dna:text out_dna_text + = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); + + // synthesized 4na + INSDC:4na:bin out_4na_bin + = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); + + // unpacked 2na with ambiguities + INSDC:x2na:bin out_x2na_bin + = ( INSDC:x2na:bin ) read_ndecode; + + // interface with n-encoded qualities + U8 read_unpack = out_2na_bin; + + /* INSDC:tbl:sequence inherited productions + * out_signal + * out_2na_packed + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_ndecode + */ +}; + +/* base_space #1 + * this schema brings in standard .READ column for v1 tables + * + * history: + * 1.0.1 - base explicitly upon base_space_nocol #1.0.1 + * 1.0.2 - base explicitly upon base_space_nocol #1.0.2 + * 1.0.3 - base explicitly upon base_space_nocol #1.0.3 + */ +table NCBI:tbl:base_space #1.0.3 = NCBI:tbl:base_space_nocol #1.0.3 +{ + // 2-bit 2na representation 
(0..3) + INSDC:2na:packed out_2na_packed = .READ; + + // no rules for writing to .READ + + /* INSDC:tbl:sequence inherited productions + * out_signal + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_ndecode + */ +}; + + +/* base_space #2 + * standard current base-space table + * + * history: + * 2.0.2 - base_space_common #1.0.2 + * 2.0.3 - base_space_common #1.0.3 now has dcmp_base_space as well + */ +table NCBI:tbl:base_space #2.0.3 + = NCBI:tbl:base_space_common #1.0.3 + , NCBI:tbl:dcmp_base_space #1 +{ + /* input rules + */ + + // input text + INSDC:dna:text in_dna_text + = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ ); + + // input 4na bin + INSDC:4na:bin in_4na_bin + = < INSDC:4na:bin > range_validate < 0, 15 > ( READ ) + | ( INSDC:4na:bin ) unpack ( in_4na_packed ) + | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text ) + | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_x2na_bin ); + + // input 4na packed + INSDC:4na:packed in_4na_packed = READ; + + // input x2na bin + INSDC:x2na:bin in_x2na_bin + = < INSDC:x2na:bin > range_validate < 0, 4 > ( READ ) + | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_4na_bin ); + + // input 2na bin + INSDC:2na:bin in_2na_bin + = < INSDC:2na:bin > range_validate < 0, 3 > ( READ ) + | ( INSDC:2na:bin ) unpack ( in_2na_packed ) + | INSDC:SEQ:rand_4na_2na ( in_4na_bin ); + + // input 2na packed + INSDC:2na:packed in_2na_packed = READ; + + // input 4na alt-read ( ambiguities ) + INSDC:4na:bin in_alt_4na_bin + = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin ); + + // preparing a feed into stats column + U8 in_stats_bin = in_2na_bin; + + + /* physical columns + */ + + physical column INSDC:2na:packed .READ + = in_2na_packed + | ( INSDC:2na:packed ) pack ( 
in_2na_bin ); + + physical column < INSDC:4na:bin > zip_encoding .ALTREAD + = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin ); + + + /* output rules + */ + + // output 2na packed + INSDC:2na:packed out_2na_packed + = .READ + | out_dcmp_2na_packed; + + // output x2na bin + INSDC:x2na:bin out_x2na_bin + = out_dcmp_x2na_bin + | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ); + + // output 2na->4na bin + INSDC:4na:bin out_2na_4na_bin + = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin ); + + // output 4na bin + INSDC:4na:bin out_4na_bin + = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD ) + | out_dcmp_4na_bin + | out_2na_4na_bin; + + // output text + INSDC:dna:text out_dna_text + = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); + + + /* INSDC:tbl:sequence inherited productions + * out_signal + */ + + /* NCBI:tbl:dcmp_base_space inherited productions + * out_dcmp_2na_bin + * out_dcmp_4na_bin + * out_dcmp_x2na_bin + * out_dcmp_2na_packed + */ +}; + + + + +/*-------------------------------------------------------------------------- + * color_space - implementation + * nucleotide sequences in color space + */ + +extern function +INSDC:x2na:bin NCBI:dna_from_color #1 ( INSDC:x2cs:bin color_bin, + INSDC:coord:zero read_start, INSDC:coord:len read_len, + INSDC:dna:text cs_key, U8 color_matrix ); + + +/* dcmp_color_space + * declares common virtual productions + */ +table NCBI:tbl:dcmp_color_space #1 +{ + // rules to introduce purely virtual productions + // never expected to resolve... 
+ INSDC:dna:text dcmp_virtual_productions + = out_dcmp_x2cs_bin + | out_dcmp_2cs_bin + | out_dcmp_2cs_packed; +} + +/* history: + * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 + * 1.0.2 - spotdesc #1.0.2 + * 1.0.3 - base upon dcmp_color_space for "out_dcmp_2cs_bin" + */ +table NCBI:tbl:color_space_common #1.0.3 + = INSDC:tbl:sequence #1.0.1 + , INSDC:SRA:tbl:spotdesc #1.0.2 + , INSDC:SRA:tbl:stats #1.1.0 + , NCBI:tbl:dcmp_color_space #1.0.0 +{ + // cs_native - tells user color space is native + bool cs_native = < bool > echo < true > (); + + // unambiguous unpacked 2cs + INSDC:2cs:bin out_2cs_bin + = out_dcmp_2cs_bin + | ( INSDC:2cs:bin ) unpack ( out_2cs_packed ); + + // unambiguous synthesized 2na + INSDC:2na:bin out_2na_bin + = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ); + + // synthesized unpacked 4na + INSDC:4na:bin out_4na_bin + = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); + + // synthesized dna text + INSDC:dna:text out_dna_text + = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); + + // synthesized dna sequence + INSDC:x2na:bin out_x2na_bin + = NCBI:dna_from_color ( out_x2cs_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); + + // synthesized packed 2na + INSDC:2na:packed out_2na_packed + = ( INSDC:2na:packed ) pack ( out_2na_bin ); + + // synthesized packed 4na + INSDC:4na:packed out_4na_packed + = ( INSDC:4na:packed ) pack ( out_4na_bin ); + + // synthesized color text + INSDC:color:text out_color_text + = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); + + // spot_len and fixed_spot_len + INSDC:coord:len color_space_spot_len + = ( INSDC:coord:len ) row_len ( out_2cs_packed ); + INSDC:coord:len color_space_fixed_spot_len + = ( INSDC:coord:len ) fixed_row_len ( out_2cs_packed ); + + /*
INSDC:tbl:sequence inherited productions + * in_cs_key + * out_cs_key + * out_signal + * out_x2cs_bin + * in_color_text + * out_2cs_packed + * out_color_matrix + */ + + /* INSDC:SRA:tbl:stats inherited productions + * in_stats_bin + */ + + /* NCBI:tbl:dcmp_color_space inherited productions + * out_dcmp_2cs_bin + * out_dcmp_x2cs_bin + * out_dcmp_2cs_packed + */ +}; + +/* color_space_nocol + * this table describes viewing rules + * but omits writing rules and physical column description + * in order to support older tables + * + * history: + * 1.0.1 - base explicitly upon color_space_common #1.0.1 + * 1.0.2 - color_space_common #1.0.2 + * 1.0.3 - color_space_common #1.0.3 + */ +table NCBI:tbl:color_space_nocol #1.0.3 + = NCBI:tbl:color_space_common #1.0.3 + , NCBI:tbl:n_encoding #1 +{ + // incoming is disabled + + // v1 color matrix was stored in metadata + U8 out_color_matrix + = < U8 > meta:read < "COLOR_MATRIX" > () + | < U8 > echo < INSDC:color:default_matrix > (); + + // unpacked 2cs with ambiguities + INSDC:x2cs:bin out_x2cs_bin + = ( INSDC:x2cs:bin ) read_ndecode; + + // interface with n-encoded qualities + U8 read_unpack = out_2cs_bin; + + /* INSDC:tbl:sequence inherited productions + * out_cs_key + * out_signal + * out_2cs_packed + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_ndecode + */ +}; + +/* color_space #1 + * this schema brings in .CSREAD and .CS_KEY columns for v1 tables + * + * history: + * 1.0.1 - base explicitly upon color_space_nocol #1.0.1 + * 1.0.2 - color_space_nocol #1.0.2 + * 1.0.3 - color_space_nocol #1.0.3 + */ +table NCBI:tbl:color_space #1.0.3 = NCBI:tbl:color_space_nocol #1.0.3 +{ + // stored as text + INSDC:dna:text out_cs_key = .CS_KEY; + + // stored color sequence + INSDC:2cs:packed out_2cs_packed = .CSREAD; + + /* INSDC:tbl:sequence inherited productions + * out_signal + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_ndecode + */ +}; + +/* color_space #2 + * standard current color-space table + * + * 
history: + * 2.0.1 - base explicitly upon color_space_common #1.0.1 + * 2.0.2 - base explicitly upon color_space_common #1.0.2 + * 2.1.0 - introduce hooks for compressed color space + */ +table NCBI:tbl:color_space #2.1 + = NCBI:tbl:color_space_common #1.0.3 + , NCBI:tbl:dcmp_color_space #1.0.0 +{ + /* input rules + */ + + // input text is not modified + // illegal values are not detected here + INSDC:color:text in_color_text = CSREAD; + + // input x2cs bin + // illegal values will be caught here + INSDC:x2cs:bin in_x2cs_bin + = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CSREAD ) + | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_color_text ); + + // input 2cs bin + INSDC:2cs:bin in_2cs_bin + = < INSDC:2cs:bin > range_validate < 0, 3 > ( CSREAD ) + | ( INSDC:2cs:bin ) unpack ( in_2cs_packed ) + | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_x2cs_bin ); + + // input 2cs packed + INSDC:2cs:packed in_2cs_packed = CSREAD; + + // input x2cs alt-csread ( ambiguity ) + INSDC:x2cs:bin in_alt_x2cs_bin + = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_x2cs_bin ); + + // color-space keys ARE modified on input + INSDC:dna:text in_cs_key + = < INSDC:dna:text, INSDC:dna:text > map < 'acgt', 'ACGT' > ( CS_KEY ); + + // color matrix + U8 in_color_matrix = < U8 > range_validate < 0, 4 > ( COLOR_MATRIX ); + + // preparing a feed into stats column + U8 in_stats_bin = in_2cs_bin; + + + /* physical columns + */ + + physical column INSDC:2cs:packed .CSREAD + = in_2cs_packed + | ( INSDC:2cs:packed ) pack ( in_2cs_bin ); + + physical column < INSDC:x2cs:bin > zip_encoding .ALTCSREAD + = < INSDC:x2cs:bin > trim < 0, 0 > ( in_alt_x2cs_bin ); + + physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key; + + physical column < U8 > zip_encoding .COLOR_MATRIX = in_color_matrix; + + + /* output rules + */ + + // output 2cs packed + INSDC:2cs:packed
out_2cs_packed + = .CSREAD + | out_dcmp_2cs_packed; + + // unpacked 2cs with ambiguity + INSDC:x2cs:bin out_x2cs_bin + = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, .ALTCSREAD ) + | out_dcmp_x2cs_bin + | ( INSDC:x2cs:bin ) out_2cs_bin; + + // read directly from physical column + INSDC:dna:text out_cs_key = .CS_KEY; + + // color matrix may be synthesized + U8 out_color_matrix + = .COLOR_MATRIX + | < U8 > echo < INSDC:color:default_matrix > (); + + + /* INSDC:tbl:sequence inherited productions + * out_signal + */ + + /* NCBI:tbl:dcmp_color_space inherited productions + * out_dcmp_2cs_bin + * out_dcmp_x2cs_bin + * out_dcmp_2cs_packed + */ +}; + + +/*-------------------------------------------------------------------------- + * protein + */ +table NCBI:tbl:protein #1 = INSDC:tbl:protein +{ + /* upper-case letters */ + INSDC:protein:text in_protein_text = < INSDC:protein:text, INSDC:protein:text > + map < 'abcdefghijklmnopqrstvwxyzu','ABCDEFGHIJKLMNOPQRSTVWXYZU' > ( PROTEIN ); + + /* std aa */ + INSDC:aa:bin in_aa_bin + = < INSDC:aa:bin > range_validate < 1, 27 > ( PROTEIN ) + | < INSDC:protein:text, INSDC:aa:bin > map < INSDC:aa:map:CHARSET, INSDC:aa:map:BINSET > ( in_protein_text ); + + /* physical column */ + physical column < INSDC:aa:bin > zip_encoding .PROTEIN = in_aa_bin; + + /* output rules */ + INSDC:aa:bin out_aa_bin = .PROTEIN; + INSDC:protein:text out_protein_text = < INSDC:aa:bin, INSDC:protein:text > + map < INSDC:aa:map:BINSET, INSDC:aa:map:CHARSET > ( out_aa_bin ); +}; + + +/*-------------------------------------------------------------------------- + * phred + * standard phred quality representation + * limits values on input to 1..63 + * reserves value 0 as ambiguity symbol for reads + */ + + +/* history: + * 1.0.1 - base explicitly upon sequence #1.0.1 + */ +table NCBI:tbl:phred_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1 +{ + /* [CS]READ - decoding + */ + U8 read_ndecode + = < INSDC:quality:phred, U8 > 
map < 0, 4 > ( out_qual_phred, read_unpack ); + + /* INSDC:tbl:sequence inherited productions + * out_qual_phred + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_unpack + */ +}; + +/* history: + * 1.0.1 - base explicitly upon phred_quality_nocol #1.0.1 + */ +table NCBI:tbl:phred_quality #1.0.1 = NCBI:tbl:phred_quality_nocol #1.0.1 +{ + // read directly as n-encoded phred is compatible with phred + NCBI:quality:n_encoded:phred out_qual_phred = .QUALITY; + + /* INSDC:tbl:sequence inherited productions + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_unpack + */ +}; + +/* history: + * 2.0.1 - added feed of in_stats_qual + * 2.0.2 - added input of text encodings + * 2.0.3 - base explicitly upon sequence #1.0.1 + * 2.0.4 - change compression from izip to zip + * 2.0.5 - (pending, not yet enabled; table is still #2.0.4) change from zip to delta_average_zip + */ +table NCBI:tbl:phred_quality #2.0.4 = INSDC:tbl:sequence #1.0.1 +{ + // read directly quality as phred + INSDC:quality:phred out_qual_phred = .QUALITY; + + // input rules + INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY; + INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY; + + INSDC:quality:phred in_qual_phred + = QUALITY + | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 ) + | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 ); + + // physical storage +/*** next line is for future change in production, but we have to wait until supporting code is released to the public ***/ +// physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY = in_qual_phred; +/*** NB *** MUST change table version to 2.0.5 and propagate to all derived tables ***/ + physical column < INSDC:quality:phred > zip_encoding .QUALITY = in_qual_phred; + + // feed to compressed statistics + INSDC:quality:phred in_stats_qual = in_qual_phred; + + /* INSDC:tbl:sequence inherited
productions + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ +}; + + + +/*-------------------------------------------------------------------------- + * log_odds + * log-odds quality score support + * + * conversion from log-odds to phred is via formula + * 10 * log ( 1 + pow ( 10, x / 10 ) ) / log ( 10 ) + 0.499 + * for x = -4..40 : when x = -5, phred = 0 + */ + +// the map function requires two lookup tables: +// the first table detects every legal value... +const INSDC:quality:log_odds NCBI:quality:from:log_odds = +[ + -6,-5,-4,-3,-2,-1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9,10, + 11,12,13,14,15,16,17,18,19,20, + 21,22,23,24,25,26,27,28,29,30, + 31,32,33,34,35,36,37,38,39,40 +]; + +// ...the second table gives positional translations +const INSDC:quality:phred NCBI:quality:to:phred = +[ + 0, 1, 1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, 7, 8, 9,10,10, + 11,12,13,14,15,16,17,18,19,20, + 21,22,23,24,25,26,27,28,29,30, + 31,32,33,34,35,36,37,38,39,40 +]; + +function +INSDC:quality:phred NCBI:log_odds_to_phred #1 ( INSDC:quality:log_odds qual_log_odds ) +{ + // this range enforcement may not be required + INSDC:quality:log_odds log_odds_clip + = < INSDC:quality:log_odds > clip < -6, 40 > ( qual_log_odds ); + + // use the tables above to map from log-odds to phred + return < INSDC:quality:log_odds, INSDC:quality:phred > + map < NCBI:quality:from:log_odds, NCBI:quality:to:phred > ( log_odds_clip ); +} + +/* history: + * 1.0.1 - base explicitly upon sequence #1.0.1 + */ +table NCBI:tbl:log_odds_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1 +{ + /* READ - decoding + */ + U8 read_ndecode + = < INSDC:quality:log_odds, U8 > map < -6, 4 > ( out_qual_log_odds, read_unpack ); + + /* QUALITY + * declared in INSDC:tbl:sequence as phred + * introduce here as log-odds + */ + extern column INSDC:quality:log_odds QUALITY = out_qual_log_odds; + + // resolve for phred + INSDC:quality:phred out_qual_phred + = out_qual2_phred + | NCBI:log_odds_to_phred ( 
out_qual_log_odds ); + + /* INSDC:tbl:sequence inherited productions + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_unpack + */ + + /* NCBI:tbl:log_odds_quality_nocol productions + * out_qual2_phred + * out_qual_log_odds + */ +}; + +/* history: + * 1.0.1 - base explicitly upon log_odds_quality_nocol #1.0.1 + */ +table NCBI:tbl:log_odds_quality #1.0.1 = NCBI:tbl:log_odds_quality_nocol #1.0.1 +{ + // read directly as n-encoded log_odds is compatible with log_odds + NCBI:quality:n_encoded:log_odds out_qual_log_odds = .QUALITY; + + /* INSDC:tbl:sequence inherited productions + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ + + /* NCBI:tbl:n_encoding inherited productions + * read_unpack + */ + + /* NCBI:tbl:log_odds_quality_nocol inherited productions + * out_qual2_phred + */ +}; + +/* history: + * 2.0.1 - base explicitly upon sequence #1.0.1 + * 2.1.0 - added production of in_qual_phred + */ +table NCBI:tbl:log_odds_quality_nocol #2.1.0 = INSDC:tbl:sequence #1.0.1 +{ + /* QUALITY + * declared in INSDC:tbl:sequence as phred + * introduce here as log-odds + */ + extern column INSDC:quality:log_odds QUALITY + = out_qual_log_odds; + + // resolve for phred + INSDC:quality:phred in_qual_phred + = NCBI:log_odds_to_phred ( in_qual_log_odds ); + + INSDC:quality:phred out_qual_phred + = NCBI:log_odds_to_phred ( out_qual_log_odds ); + + + /* INSDC:tbl:sequence inherited productions + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ + + /* NCBI:tbl:log_odds_quality_nocol productions + * out_qual_log_odds + */ +}; + +/* history: + * 2.0.1 - added feed of in_stats_qual + * 2.0.2 - added input of text encodings + * 2.0.3 - base explicitly upon log_odds_quality_nocol #2.0.1 + * 2.0.4 - changed compression from izip to zip + * 2.1.0 - base explicitly upon log_odds_quality_nocol #2.1.0 + */ +table NCBI:tbl:log_odds_quality #2.1.0 = NCBI:tbl:log_odds_quality_nocol #2.1.0 +{ + INSDC:quality:log_odds 
out_qual_log_odds= .QUALITY; + + extern column INSDC:quality:text:log_odds_64 QUALITY + = out_qual_text_log_odds_64 + | ( INSDC:quality:text:log_odds_64 ) < B8 > sum < 64 > ( out_qual_log_odds ); + + // input rules + INSDC:quality:text:log_odds_64 in_qual_text_log_odds_64 = QUALITY; + + INSDC:quality:log_odds in_qual_log_odds + = QUALITY + | ( INSDC:quality:log_odds ) < B8 > diff < 64 > ( in_qual_text_log_odds_64 ); + + physical column < INSDC:quality:log_odds > zip_encoding .QUALITY + = in_qual_log_odds; + + // feed to compressed statistics + INSDC:quality:log_odds in_stats_qual = in_qual_log_odds; + + + /* INSDC:tbl:sequence inherited productions + * out_qual_text_phred_33 + * out_qual_text_phred_64 + */ + + /* NCBI:tbl:log_odds_quality productions + * out_qual_text_log_odds_64 + */ +};