/* Mercurial > repos > charles_s_test > seqsero2 */

/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * Sequence schema implementation tables
 */
version 1;

include 'vdb/vdb.vschema';
include 'ncbi/ncbi.vschema';
include 'insdc/sra.vschema';


/*--------------------------------------------------------------------------
 * n_encoding - implementation
 *  introduces common virtual productions
 */
table NCBI:tbl:n_encoding #1
{
    // Placeholder production: its only purpose is to mention the names
    // "read_unpack" and "read_ndecode" so that they exist as virtual
    // productions for derived tables to define.
    // Within this file, read_unpack is defined by base_space_nocol and
    // color_space_nocol, and read_ndecode by phred_quality_nocol and
    // log_odds_quality_nocol #1.
    U8 n_encoding_dummy
        = read_unpack
        | read_ndecode;
};


/*--------------------------------------------------------------------------
 * seqloc
 *  NCBI sequence locator table
 */
table NCBI:tbl:seqloc #1.0
{
    /* SEQ_ID
     *  a FASTA-style SeqId
     *  ascii column declared with zip_encoding
     */
    extern column < ascii > zip_encoding SEQ_ID;

    /* SEQ_START
     *  provided in both 1 ( default ) and 0-based coordinates
     *  the 1-based form is the default column and the one given an
     *  encoding; the 0-based form is readonly and is computed from
     *  .SEQ_START with diff < 1 >
     */
    extern default column < INSDC:coord:one > izip_encoding SEQ_START;
    readonly column INSDC:coord:zero SEQ_START
        = ( INSDC:coord:zero ) < INSDC:coord:one > diff < 1 > ( .SEQ_START );

    /* SEQ_LEN
     *  INSDC:coord:len column declared with izip_encoding
     */
    extern column < INSDC:coord:len > izip_encoding SEQ_LEN;
};


/*--------------------------------------------------------------------------
 * base_space - implementation
 *  READ column rules
 */

/* color_from_dna
 *  use starting keys and color matrix to convert individual reads
 *  to base space.
 */
// Declaration only ( extern ): no body is given in this file.
// Inputs are x2na bases, per-read start and length, the color-space key
// and a color matrix; the result type is INSDC:x2cs:bin.
// Referenced by NCBI:tbl:base_space_common to produce out_x2cs_bin.
extern function
INSDC:x2cs:bin NCBI:color_from_dna #1 ( INSDC:x2na:bin bin_x2na,
    INSDC:coord:zero read_start, INSDC:coord:len read_len,
    INSDC:dna:text cs_key, U8 color_matrix );


/* dcmp_base_space
 *  table to introduce common virtual productions
 */
table NCBI:tbl:dcmp_base_space #1
{
    // rules to introduce purely virtual productions
    // never expected to resolve...
    //
    // The four out_dcmp_* names are only mentioned here; nothing in this
    // file defines them.  base_space_common and base_space #2 list them
    // as alternatives inside their output rules.
    INSDC:dna:text dcmp_virtual_productions
        = out_dcmp_4na_bin
        | out_dcmp_x2na_bin
        | out_dcmp_2na_bin
        | out_dcmp_2na_packed;
}

/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
 *  1.0.2 - spotdesc #1.0.2
 *  1.0.3 - base upon dcmp_base_space for "out_dcmp_2na_bin"
 */
table NCBI:tbl:base_space_common #1.0.3
    = INSDC:tbl:sequence #1.0.1
    , INSDC:SRA:tbl:spotdesc #1.0.2
    , INSDC:SRA:tbl:stats #1.1.0
    , NCBI:tbl:dcmp_base_space #1.0.0
{
    /* Output-side rules shared by every base-space table in this file.
     *  No physical column and no in_* production is defined here;
     *  color-space outputs are synthesized from the base-space ones.
     */

    /* INSDC:tbl:sequence inherited virtual productions
     */

    // cs_native - tells user color space is not native
    bool cs_native = < bool > echo < false > ();

    // in_cs_key is not writable in base_space

    // color-space key is completely artificial
    // alternatives, in the order written: a .CS_KEY column, then the
    // constant 'T' echoed against out_read_type, against out_read_len,
    // and finally with no argument
    // NOTE(review): presumably the argument to echo controls how many 'T'
    // are produced per row - confirm against the echo documentation
    INSDC:dna:text out_cs_key
        = .CS_KEY
        | < INSDC:dna:text > echo < 'T' > ( out_read_type )
        | < INSDC:dna:text > echo < 'T' > ( out_read_len )
        | < INSDC:dna:text > echo < 'T' > ();

    // unambiguous synthesized 2cs
    // five-entry target table: the fifth x2cs BINSET entry is mapped to 0
    // ( presumably the ambiguity code - confirm against INSDC:x2cs:map:BINSET )
    INSDC:2cs:bin out_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin );

    // unambiguous unpacked 2na
    // out_dcmp_2na_bin is listed first, then unpacking of out_2na_packed
    INSDC:2na:bin out_2na_bin
        = out_dcmp_2na_bin
        | ( INSDC:2na:bin ) unpack ( out_2na_packed );

    // synthesized color sequence
    // computed by the extern function from bases, read layout,
    // color-space key and color matrix
    INSDC:x2cs:bin out_x2cs_bin
        = NCBI:color_from_dna ( out_x2na_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix );

    // synthesized packed 2cs
    INSDC:2cs:packed out_2cs_packed
        = ( INSDC:2cs:packed ) pack ( out_2cs_bin );

    // synthesized packed 4na
    INSDC:4na:packed out_4na_packed
        = ( INSDC:4na:packed ) pack ( out_4na_bin );

    // synthesized color text
    // x2cs codes translated through the x2cs BINSET -> CHARSET tables
    INSDC:color:text out_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map <  INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin );

    // published color matrix
    // always the constant INSDC:color:default_matrix in base space
    U8 out_color_matrix
        = < U8 > echo < INSDC:color:default_matrix > ();

    // spot_len and fixed_spot_len
    // both are taken from out_2na_packed via row_len / fixed_row_len
    INSDC:coord:len base_space_spot_len
        = ( INSDC:coord:len ) row_len ( out_2na_packed );
    INSDC:coord:len base_space_fixed_spot_len
        = ( INSDC:coord:len ) fixed_row_len ( out_2na_packed );


    /* INSDC:tbl:sequence inherited productions
     * ( referenced but not defined in this table )
     *  out_signal
     *  in_dna_text
     *  out_4na_bin
     *  out_dna_text
     *  out_x2na_bin
     *  out_2na_packed
     */

    /* INSDC:SRA:tbl:stats inherited productions
     *  in_stats_bin
     */

    /* NCBI:tbl:dcmp_base_space inherited productions
     *  out_dcmp_2na_bin
     *  out_dcmp_4na_bin
     *  out_dcmp_x2na_bin
     *  out_dcmp_2na_packed
     */
};


/* base_space_nocol
 *  this table describes viewing rules
 *  but omits writing rules and physical column description
 *  in order to support older tables
 *
 * history:
 *  1.0.1 - base explicitly upon base_space_common #1.0.1
 *  1.0.2 - base explicitly upon base_space_common #1.0.2
 *  1.0.3 - base explicitly upon base_space_common #1.0.3
 */
table NCBI:tbl:base_space_nocol #1.0.3
    = NCBI:tbl:base_space_common #1.0.3
    , NCBI:tbl:n_encoding #1
{
    // incoming is disabled
    // ( this table defines no in_* production and no physical column )

    // synthesized dna text
    // x2na codes translated through the x2na BINSET -> CHARSET tables
    INSDC:dna:text out_dna_text
        = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin );

    // synthesized 4na
    // five-entry target table: the first four x2na BINSET entries become
    // the one-bit codes 1, 2, 4, 8 and the fifth becomes 15
    INSDC:4na:bin out_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin );

    // unpacked 2na with ambiguities
    // read_ndecode is not defined here; in this file it is defined by
    // phred_quality_nocol #1 and log_odds_quality_nocol #1
    INSDC:x2na:bin out_x2na_bin
        = ( INSDC:x2na:bin ) read_ndecode;

    // interface with n-encoded qualities
    // read_unpack is the second input of read_ndecode in those tables
    U8 read_unpack = out_2na_bin;

    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     *  out_2na_packed
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};

/* base_space #1
 *  this schema brings in standard .READ column for v1 tables
 *
 * history:
 *  1.0.1 - base explicitly upon base_space_nocol #1.0.1
 *  1.0.2 - base explicitly upon base_space_nocol #1.0.2
 *  1.0.3 - base explicitly upon base_space_nocol #1.0.3
 */
table NCBI:tbl:base_space #1.0.3 = NCBI:tbl:base_space_nocol #1.0.3
{
    // 2-bit 2na representation (0..3)
    // the only thing added over base_space_nocol: out_2na_packed is
    // read straight from the .READ column
    INSDC:2na:packed out_2na_packed = .READ;

    // no rules for writing to .READ

    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};


/* base_space #2
 *  standard current base-space table
 *
 * history:
 *  2.0.2 - base_space_common #1.0.2
 *  2.0.3 - base_space_common #1.0.3 now has dcmp_base_space as well
 */
table NCBI:tbl:base_space #2.0.3
    = NCBI:tbl:base_space_common #1.0.3
    , NCBI:tbl:dcmp_base_space #1
{
    /* Writable base-space table.
     *  READ ( no leading dot ) is used only by the input rules;
     *  .READ and .ALTREAD are the physical columns declared below.
     */

    /* input rules
     *  each in_* production accepts READ in one representation and/or
     *  converts from another in_* production
     */

    // input text
    // '.' becomes 'N'; lower-case letters of the listed alphabet
    // become upper-case
    INSDC:dna:text in_dna_text
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ );

    // input 4na bin
    // alternatives, in the order written:
    //  READ validated against 0..15
    //  unpacked in_4na_packed
    //  in_dna_text translated through the 4na CHARSET -> BINSET tables
    //  in_x2na_bin translated with the target table [ 1, 2, 4, 8, 15 ]
    INSDC:4na:bin in_4na_bin
        = < INSDC:4na:bin > range_validate < 0, 15 > ( READ )
        | ( INSDC:4na:bin ) unpack ( in_4na_packed )
        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text )
        | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_x2na_bin );

    // input 4na packed
    INSDC:4na:packed in_4na_packed = READ;

    // input x2na bin
    // either READ validated against 0..4, or a translation of in_4na_bin
    // assuming INSDC:4na:map:BINSET lists 0..15 in order ( TODO confirm ),
    // the 16-entry table sends 1, 2, 4, 8 to 0, 1, 2, 3 and every other
    // code to 4
    INSDC:x2na:bin in_x2na_bin
        = < INSDC:x2na:bin > range_validate < 0, 4 > ( READ )
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_4na_bin );

    // input 2na bin
    // alternatives, in the order written:
    //  READ validated against 0..3
    //  unpacked in_2na_packed
    //  INSDC:SEQ:rand_4na_2na applied to in_4na_bin
    // NOTE(review): rand_4na_2na is declared outside this file; its
    // name suggests a randomized 4na -> 2na reduction - confirm
    INSDC:2na:bin in_2na_bin
        = < INSDC:2na:bin > range_validate < 0, 3 > ( READ )
        | ( INSDC:2na:bin ) unpack ( in_2na_packed )
        | INSDC:SEQ:rand_4na_2na ( in_4na_bin );

    // input 2na packed
    INSDC:2na:packed in_2na_packed = READ;

    // input 4na alt-read ( ambiguities )
    // assuming INSDC:4na:map:BINSET lists 0..15 in order ( TODO confirm ),
    // the 16-entry table sends 0 to 15, sends the one-bit codes
    // 1, 2, 4, 8 to 0, and leaves every other code unchanged
    INSDC:4na:bin in_alt_4na_bin
        = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin );

    // preparing a feed into stats column
    U8 in_stats_bin = in_2na_bin;


    /* physical columns
     */

    // .READ stores packed 2na: in_2na_packed is listed first,
    // then packing of in_2na_bin
    physical column INSDC:2na:packed .READ
        = in_2na_packed
        | ( INSDC:2na:packed ) pack ( in_2na_bin );

    // .ALTREAD stores the ambiguity overlay after trim < 0, 0 >
    // NOTE(review): presumably trim drops a run of 0 values from one
    // end of the row - confirm against the trim documentation
    physical column < INSDC:4na:bin > zip_encoding .ALTREAD
        = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin );


    /* output rules
     */

    // output 2na packed
    // the stored column is listed first, then out_dcmp_2na_packed
    INSDC:2na:packed out_2na_packed
        = .READ
        | out_dcmp_2na_packed;

    // output x2na bin
    // out_dcmp_x2na_bin is listed first, then a translation of
    // out_4na_bin using the same 16-entry table as in_x2na_bin
    INSDC:x2na:bin out_x2na_bin
        = out_dcmp_x2na_bin
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin );

    // output 2na->4na bin
    // the four 2na BINSET entries become the one-bit codes 1, 2, 4, 8
    INSDC:4na:bin out_2na_4na_bin
        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin );

    // output 4na bin
    // alternatives, in the order written:
    //  bit_or of the expanded 2na with the .ALTREAD overlay
    //  out_dcmp_4na_bin
    //  the expanded 2na on its own
    INSDC:4na:bin out_4na_bin
        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD )
        | out_dcmp_4na_bin
        | out_2na_4na_bin;

    // output text
    // 4na codes translated through the 4na BINSET -> CHARSET tables
    INSDC:dna:text out_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin );


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:dcmp_base_space inherited productions
     *  out_dcmp_2na_bin
     *  out_dcmp_4na_bin
     *  out_dcmp_x2na_bin
     *  out_dcmp_2na_packed
     */
};


/*--------------------------------------------------------------------------
 * color_space - implementation
 *  nucleotide sequences in color space
 */

// Declaration only ( extern ): no body is given in this file.
// Inputs are x2cs colors, per-read start and length, the color-space key
// and a color matrix; the result type is INSDC:x2na:bin.
// Referenced by NCBI:tbl:color_space_common to produce out_x2na_bin.
extern function
INSDC:x2na:bin NCBI:dna_from_color #1 ( INSDC:x2cs:bin color_bin,
     INSDC:coord:zero read_start, INSDC:coord:len read_len,
     INSDC:dna:text cs_key, U8 color_matrix );


/* dcmp_color_space
 *  declares common virtual productions
 */
table NCBI:tbl:dcmp_color_space #1
{
    // rules to introduce purely virtual productions
    // never expected to resolve...
    //
    // The three out_dcmp_* names are only mentioned here; nothing in this
    // file defines them.  color_space_common and color_space #2 list them
    // as alternatives inside their output rules.
    INSDC:dna:text dcmp_virtual_productions
        = out_dcmp_x2cs_bin
        | out_dcmp_2cs_bin
        | out_dcmp_2cs_packed;
}

/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
 *  1.0.2 - spotdesc #1.0.2
 *  1.0.3 - base upon dcmp_color_space for "out_dcmp_2cs_bin"
 */
table NCBI:tbl:color_space_common #1.0.3
    = INSDC:tbl:sequence #1.0.1
    , INSDC:SRA:tbl:spotdesc #1.0.2
    , INSDC:SRA:tbl:stats #1.1.0
    , NCBI:tbl:dcmp_color_space #1.0.0
{
    /* Output-side rules shared by every color-space table in this file.
     *  No physical column and no in_* production is defined here;
     *  base-space outputs are synthesized from the color-space ones.
     */

    // cs_native - tells user color space is native
    bool cs_native = < bool > echo < true > ();

    // unambiguous unpacked 2cs
    // out_dcmp_2cs_bin is listed first, then unpacking of out_2cs_packed
    INSDC:2cs:bin out_2cs_bin
        = out_dcmp_2cs_bin
        | ( INSDC:2cs:bin ) unpack ( out_2cs_packed );

    // unambiguous synthesized 2na
    // five-entry target table: the fifth x2na BINSET entry is mapped to 0
    // ( presumably the ambiguity code - confirm against INSDC:x2na:map:BINSET )
    INSDC:2na:bin out_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin );

    // synthesized unpacked 4na
    // the first four x2na BINSET entries become the one-bit codes
    // 1, 2, 4, 8 and the fifth becomes 15
    INSDC:4na:bin out_4na_bin
        = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin );

    // synthesized dna text
    // x2na codes translated through the x2na BINSET -> CHARSET tables
    INSDC:dna:text out_dna_text
        = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin );

    // synthesized dna sequence
    // computed by the extern function from colors, read layout,
    // color-space key and color matrix
    INSDC:x2na:bin out_x2na_bin
        = NCBI:dna_from_color ( out_x2cs_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix );

    // synthesized packed 2na
    INSDC:2na:packed out_2na_packed
        = ( INSDC:2na:packed ) pack ( out_2na_bin );

    // synthesized packed 4na
    INSDC:4na:packed out_4na_packed
        = ( INSDC:4na:packed ) pack ( out_4na_bin );

    // synthesized color text
    // x2cs codes translated through the x2cs BINSET -> CHARSET tables
    INSDC:color:text out_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map <  INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin );

    // spot_len and fixed_spot_len
    // both are taken from out_2cs_packed via row_len / fixed_row_len
    INSDC:coord:len color_space_spot_len
        = ( INSDC:coord:len ) row_len ( out_2cs_packed );
    INSDC:coord:len color_space_fixed_spot_len
        = ( INSDC:coord:len ) fixed_row_len ( out_2cs_packed );

    /* INSDC:tbl:sequence inherited productions
     * ( referenced but not defined in this table )
     *  in_cs_key
     *  out_cs_key
     *  out_signal
     *  out_x2cs_bin
     *  in_color_text
     *  out_2cs_packed
     *  out_color_matrix
     */

    /* INSDC:SRA:tbl:stats inherited productions
     *  in_stats_bin
     */

    /* NCBI:tbl:dcmp_color_space inherited productions
     *  out_dcmp_2cs_bin
     *  out_dcmp_x2cs_bin
     *  out_dcmp_2cs_packed
     */
};

/* color_space_nocol
 *  this table describes viewing rules
 *  but omits writing rules and physical column description
 *  in order to support older tables
 *
 * history:
 *  1.0.1 - base explicitly upon color_space_common #1.0.1
 *  1.0.2 - color_space_common #1.0.2
 *  1.0.3 - color_space_common #1.0.3
 */
table NCBI:tbl:color_space_nocol #1.0.3
    = NCBI:tbl:color_space_common #1.0.3
    , NCBI:tbl:n_encoding #1
{
    // incoming is disabled
    // ( this table defines no in_* production and no physical column )

    // v1 color matrix was stored in metadata
    // the "COLOR_MATRIX" metadata node is listed first, then the
    // constant INSDC:color:default_matrix
    U8 out_color_matrix
        = < U8 > meta:read < "COLOR_MATRIX" > ()
        | < U8 > echo < INSDC:color:default_matrix > ();

    // unpacked 2cs with ambiguities
    // read_ndecode is not defined here; in this file it is defined by
    // phred_quality_nocol #1 and log_odds_quality_nocol #1
    INSDC:x2cs:bin out_x2cs_bin
        = ( INSDC:x2cs:bin ) read_ndecode;

    // interface with n-encoded qualities
    // read_unpack is the second input of read_ndecode in those tables
    U8 read_unpack = out_2cs_bin;

    /* INSDC:tbl:sequence inherited productions
     *  out_cs_key
     *  out_signal
     *  out_2cs_packed
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};

/* color_space #1
 *  this schema brings in .CSREAD and .CS_KEY columns for v1 tables
 *
 * history:
 *  1.0.1 - base explicitly upon color_space_nocol #1.0.1
 *  1.0.2 - color_space_nocol #1.0.2
 *  1.0.3 - color_space_nocol #1.0.3
 */
table NCBI:tbl:color_space #1.0.3 = NCBI:tbl:color_space_nocol #1.0.3
{
    // Adds only the two column bindings that color_space_nocol leaves
    // open; there are no rules for writing either column.

    // stored as text
    INSDC:dna:text out_cs_key = .CS_KEY;

    // stored color sequence
    INSDC:2cs:packed out_2cs_packed = .CSREAD;

    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_ndecode
     */
};

/* color_space #2
 *  standard current color-space table
 *
 * history:
 *  2.0.1 - base explicitly upon color_space_common #1.0.1
 *  2.0.2 - base explicitly upon color_space_common #1.0.2
 *  2.1.0 - introduce hooks for compressed color space
 */
table NCBI:tbl:color_space #2.1
    = NCBI:tbl:color_space_common #1.0.3
    , NCBI:tbl:dcmp_color_space #1.0.0
{
    /* Writable color-space table.
     *  CSREAD, CS_KEY and COLOR_MATRIX ( no leading dot ) are used only
     *  by the input rules; the dotted names are the physical columns
     *  declared below.
     */

    /* input rules
     */

    // input text is not modified
    // illegal values are not detected here
    INSDC:color:text in_color_text = CSREAD;

    // input x2cs bin
    // illegal values will be caught here
    // either CSREAD validated against 0..4, or in_color_text translated
    // through the x2cs CHARSET -> BINSET tables
    INSDC:x2cs:bin in_x2cs_bin
        = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CSREAD )
        | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_color_text );

    // input 2cs bin
    // alternatives, in the order written:
    //  CSREAD validated against 0..3
    //  unpacked in_2cs_packed
    //  in_x2cs_bin with the fifth BINSET entry mapped to 0
    INSDC:2cs:bin in_2cs_bin
        = < INSDC:2cs:bin > range_validate < 0, 3 > ( CSREAD )
        | ( INSDC:2cs:bin ) unpack ( in_2cs_packed )
        | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_x2cs_bin );

    // input 2cs packed
    INSDC:2cs:packed in_2cs_packed = CSREAD;

    // input x2cs alt-csread ( ambiguity )
    // the first four x2cs BINSET entries become 0 and the fifth becomes 4,
    // so only the ambiguity marker survives in this overlay
    INSDC:x2cs:bin in_alt_x2cs_bin
        = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_x2cs_bin );

    // color-space keys ARE modified on input
    // lower-case 'acgt' becomes upper-case 'ACGT'
    INSDC:dna:text in_cs_key
        = < INSDC:dna:text, INSDC:dna:text > map < 'acgt', 'ACGT' > ( CS_KEY );

    // color matrix
    // values are checked against 0..4
    U8 in_color_matrix = < U8 > range_validate < 0, 4 > ( COLOR_MATRIX );

    // preparing a feed into stats column
    U8 in_stats_bin = in_2cs_bin;


    /* physical columns
     */

    // .CSREAD stores packed 2cs: in_2cs_packed is listed first,
    // then packing of in_2cs_bin
    physical column INSDC:2cs:packed .CSREAD
        = in_2cs_packed
        | ( INSDC:2cs:packed ) pack ( in_2cs_bin );

    // .ALTCSREAD stores the ambiguity overlay after trim < 0, 0 >
    // NOTE(review): presumably trim drops a run of 0 values from one
    // end of the row - confirm against the trim documentation
    physical column < INSDC:x2cs:bin > zip_encoding .ALTCSREAD
        = < INSDC:x2cs:bin > trim < 0, 0 > ( in_alt_x2cs_bin );

    physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key;

    physical column < U8 > zip_encoding .COLOR_MATRIX = in_color_matrix;


    /* output rules
     */

    // output 2cs packed
    // the stored column is listed first, then out_dcmp_2cs_packed
    INSDC:2cs:packed out_2cs_packed
        = .CSREAD
        | out_dcmp_2cs_packed;

    // unpacked 2cs with ambiguity
    // alternatives, in the order written:
    //  bit_or of out_2cs_bin with the .ALTCSREAD overlay
    //  out_dcmp_x2cs_bin
    //  out_2cs_bin cast to x2cs on its own
    INSDC:x2cs:bin out_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, .ALTCSREAD )
        | out_dcmp_x2cs_bin
        | ( INSDC:x2cs:bin ) out_2cs_bin;

    // read directly from physical column
    INSDC:dna:text out_cs_key = .CS_KEY;

    // color matrix may be synthesized
    // the stored column is listed first, then the constant
    // INSDC:color:default_matrix
    U8 out_color_matrix
        = .COLOR_MATRIX
        | < U8 > echo < INSDC:color:default_matrix > ();


    /* INSDC:tbl:sequence inherited productions
     *  out_signal
     */

    /* NCBI:tbl:dcmp_color_space inherited productions
     *  out_dcmp_2cs_bin
     *  out_dcmp_x2cs_bin
     *  out_dcmp_2cs_packed
     */
};


/*--------------------------------------------------------------------------
 * protein
 */
table NCBI:tbl:protein #1 = INSDC:tbl:protein
{
    /* upper-case letters
     *  each lower-case letter in the first string is replaced by the
     *  letter at the same position in the second string; both strings
     *  place 'u' / 'U' after 'z' / 'Z'
     */
    INSDC:protein:text in_protein_text = < INSDC:protein:text, INSDC:protein:text >
        map < 'abcdefghijklmnopqrstvwxyzu','ABCDEFGHIJKLMNOPQRSTVWXYZU' > ( PROTEIN );

    /* std aa
     *  either PROTEIN validated against 1..27, or the upper-cased text
     *  translated through the aa CHARSET -> BINSET tables
     */
    INSDC:aa:bin in_aa_bin
        = < INSDC:aa:bin > range_validate < 1, 27 > ( PROTEIN )
        | < INSDC:protein:text, INSDC:aa:bin > map < INSDC:aa:map:CHARSET, INSDC:aa:map:BINSET > ( in_protein_text );

    /* physical column
     *  storage is always the binary aa form
     */
    physical column < INSDC:aa:bin > zip_encoding .PROTEIN = in_aa_bin;

    /* output rules
     *  binary comes straight from .PROTEIN; text is the
     *  BINSET -> CHARSET translation of it
     */
    INSDC:aa:bin out_aa_bin = .PROTEIN;
    INSDC:protein:text out_protein_text = < INSDC:aa:bin, INSDC:protein:text >
        map < INSDC:aa:map:BINSET, INSDC:aa:map:CHARSET > ( out_aa_bin );
};


/*--------------------------------------------------------------------------
 * phred
 *  standard phred quality representation
 *  limits values on input to 1..63
 *  reserves value 0 as ambiguity symbol for reads
 */


/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1
 */
table NCBI:tbl:phred_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1
{
    /* [CS]READ - decoding
     *  two-input map keyed on the quality value 0 with result 4;
     *  read_unpack comes from base_space_nocol / color_space_nocol
     *  NOTE(review): presumably positions whose quality is 0 yield 4 and
     *  all other positions pass read_unpack through - confirm against
     *  the two-input form of map
     */
    U8 read_ndecode
        = < INSDC:quality:phred, U8 > map < 0, 4 > ( out_qual_phred, read_unpack );

    /* INSDC:tbl:sequence inherited productions
     * ( out_qual_phred is left for a derived table to define )
     *  out_qual_phred
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */
};

/* history:
 *  1.0.1 - base explicitly upon phred_quality_nocol #1.0.1
 */
table NCBI:tbl:phred_quality #1.0.1 = NCBI:tbl:phred_quality_nocol #1.0.1
{
    // read directly as n-encoded phred is compatible with phred
    // the production is typed NCBI:quality:n_encoded:phred and is bound
    // to the .QUALITY column; no rule for writing .QUALITY is given
    NCBI:quality:n_encoded:phred out_qual_phred = .QUALITY;

    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */
};

/* history:
 *  2.0.1 - added feed of in_stats_qual
 *  2.0.2 - added input of text encodings
 *  2.0.3 - base explicitly upon sequence #1.0.1
 *  2.0.4 - change compression from izip to zip
 *  2.0.5 - ( pending, not yet in effect ) change from zip to delta_average_zip
 */
table NCBI:tbl:phred_quality #2.0.4 = INSDC:tbl:sequence #1.0.1
{
    // read directly quality as  phred
    INSDC:quality:phred out_qual_phred = .QUALITY;

    // input rules
    // QUALITY may arrive as phred+33 text, phred+64 text or binary phred
    INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY;
    INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY;

    // binary phred: QUALITY itself is listed first, then the text forms
    // with their offset ( 33 or 64 ) removed via diff
    INSDC:quality:phred in_qual_phred
        = QUALITY
        | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 )
        | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 );

    // physical storage
    // the active declaration is the zip_encoding one; the
    // delta_average_zip_encoding line is commented out on purpose
/*** next line is  for future change in production, but we have to wait until supporting code is released to the public ***/
// physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY = in_qual_phred;
/*** NB *** MUST change table version to 2.0.5 and propagate to all derived tables ***/
    physical column < INSDC:quality:phred > zip_encoding .QUALITY = in_qual_phred;

    // feed to compressed statistics
    INSDC:quality:phred in_stats_qual = in_qual_phred;

    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */
};


/*--------------------------------------------------------------------------
 * log_odds
 *  log-odds quality score support
 *
 *  conversion from log-odds to phred is via formula
 *    10 * log ( 1 + pow ( 10, x / 10 ) ) / log ( 10 ) + 0.499
 *  for x = -5..40 ; the lookup tables below additionally map x = -6 to phred 0
 */

// the map function requires two lookup tables:
// the first table detects every legal value...
// 47 entries: every integer from -6 through 40 in ascending order.
// The entry at a given position here pairs with the entry at the same
// position in NCBI:quality:to:phred.
const INSDC:quality:log_odds NCBI:quality:from:log_odds =
[
             -6,-5,-4,-3,-2,-1, 0,
     1, 2, 3, 4, 5, 6, 7, 8, 9,10,
    11,12,13,14,15,16,17,18,19,20,
    21,22,23,24,25,26,27,28,29,30,
    31,32,33,34,35,36,37,38,39,40
];

// ...the second table gives positional translations
// 47 entries, laid out position-for-position against
// NCBI:quality:from:log_odds ( e.g. -6 -> 0, -5 -> 1, 0 -> 3, 10 -> 10 );
// from log-odds 11 upward the phred value equals the log-odds value.
const INSDC:quality:phred NCBI:quality:to:phred =
[
              0, 1, 1, 2, 2, 3, 3,
     4, 4, 5, 5, 6, 7, 8, 9,10,10,
    11,12,13,14,15,16,17,18,19,20,
    21,22,23,24,25,26,27,28,29,30,
    31,32,33,34,35,36,37,38,39,40
];

function
INSDC:quality:phred NCBI:log_odds_to_phred #1 ( INSDC:quality:log_odds qual_log_odds )
{
    /* step 1: confine the incoming score to -6 .. 40, the span listed
     * in NCBI:quality:from:log_odds ( possibly redundant, kept as a guard )
     */
    INSDC:quality:log_odds bounded
        = < INSDC:quality:log_odds > clip < -6, 40 > ( qual_log_odds );

    /* step 2: table lookup - NCBI:quality:from:log_odds supplies the
     * source values, NCBI:quality:to:phred the matching results
     */
    return
        < INSDC:quality:log_odds, INSDC:quality:phred > map < NCBI:quality:from:log_odds, NCBI:quality:to:phred > ( bounded );
}

/* history:
 *  1.0.1 - base explicitly upon sequence #1.0.1
 */
table NCBI:tbl:log_odds_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1
{
    /* READ - decoding
     *  two-input map keyed on the quality value -6 with result 4;
     *  read_unpack comes from base_space_nocol / color_space_nocol
     *  NOTE(review): presumably positions whose quality is -6 yield 4 and
     *  all other positions pass read_unpack through - confirm against
     *  the two-input form of map
     */
    U8 read_ndecode
        = < INSDC:quality:log_odds, U8 > map < -6, 4 > ( out_qual_log_odds, read_unpack );

    /* QUALITY
     *  declared in INSDC:tbl:sequence as phred
     *  introduce here as log-odds
     */
    extern column INSDC:quality:log_odds QUALITY = out_qual_log_odds;

    // resolve for phred
    // out_qual2_phred is listed first, then the table-driven conversion
    // of the log-odds values
    INSDC:quality:phred out_qual_phred
        = out_qual2_phred
        | NCBI:log_odds_to_phred ( out_qual_log_odds );

    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */

    /* NCBI:tbl:log_odds_quality_nocol productions
     * ( referenced above but not defined in this table )
     *  out_qual2_phred
     *  out_qual_log_odds
     */
};

/* history:
 *  1.0.1 - base explicitly upon log_odds_quality_nocol #1.0.1
 */
table NCBI:tbl:log_odds_quality #1.0.1 = NCBI:tbl:log_odds_quality_nocol #1.0.1
{
    // read directly as n-encoded log_odds is compatible with log_odds
    // the production is typed NCBI:quality:n_encoded:log_odds and is bound
    // to the .QUALITY column; no rule for writing .QUALITY is given
    NCBI:quality:n_encoded:log_odds out_qual_log_odds = .QUALITY;

    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:n_encoding inherited productions
     *  read_unpack
     */

    /* NCBI:tbl:log_odds_quality_nocol inherited productions
     * ( still undefined at this level )
     *  out_qual2_phred
     */
};

/* history:
 *  2.0.1 - base explicitly upon sequence #1.0.1
 *  2.1.0 - added production of in_qual_phred
 */
table NCBI:tbl:log_odds_quality_nocol #2.1.0 = INSDC:tbl:sequence #1.0.1
{
    /* QUALITY
     *  declared in INSDC:tbl:sequence as phred
     *  introduce here as log-odds
     */
    extern column INSDC:quality:log_odds QUALITY
        = out_qual_log_odds;

    // resolve for phred
    // both directions use the same table-driven conversion;
    // in_qual_log_odds and out_qual_log_odds are not defined in this
    // table ( log_odds_quality #2.1.0 defines both )
    INSDC:quality:phred in_qual_phred
        = NCBI:log_odds_to_phred ( in_qual_log_odds );

    INSDC:quality:phred out_qual_phred
        = NCBI:log_odds_to_phred ( out_qual_log_odds );


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:log_odds_quality_nocol productions
     *  out_qual_log_odds
     */
};

/* history:
 *  2.0.1 - added feed of in_stats_qual
 *  2.0.2 - added input of text encodings
 *  2.0.3 - base explicitly upon log_odds_quality_nocol #2.0.1
 *  2.0.4 - changed compression from izip to zip
 *  2.1.0 - base explicitly upon log_odds_quality_nocol #2.1.0
 */
table NCBI:tbl:log_odds_quality #2.1.0 = NCBI:tbl:log_odds_quality_nocol #2.1.0
{
    // binary log-odds is read straight from the physical column
    INSDC:quality:log_odds out_qual_log_odds= .QUALITY;

    // text view of QUALITY with offset 64:
    // out_qual_text_log_odds_64 is listed first ( not defined in this
    // table ), then the binary values with 64 added via sum
    extern column INSDC:quality:text:log_odds_64 QUALITY
        = out_qual_text_log_odds_64
        | ( INSDC:quality:text:log_odds_64 ) < B8 > sum < 64 > ( out_qual_log_odds );

    // input rules
    // QUALITY may arrive as log-odds+64 text or as binary log-odds
    INSDC:quality:text:log_odds_64 in_qual_text_log_odds_64 = QUALITY;

    // binary log-odds: QUALITY itself is listed first, then the text
    // form with its offset of 64 removed via diff
    INSDC:quality:log_odds in_qual_log_odds
        = QUALITY
        | ( INSDC:quality:log_odds ) < B8 > diff < 64 > ( in_qual_text_log_odds_64 );

    // storage is always binary log-odds
    physical column < INSDC:quality:log_odds > zip_encoding .QUALITY
        = in_qual_log_odds;

    // feed to compressed statistics
    INSDC:quality:log_odds in_stats_qual = in_qual_log_odds;


    /* INSDC:tbl:sequence inherited productions
     *  out_qual_text_phred_33
     *  out_qual_text_phred_64
     */

    /* NCBI:tbl:log_odds_quality productions
     *  out_qual_text_log_odds_64
     */
};
/* repository page metadata ( not part of the schema ):
 *  author    charles_s_test
 *  date      Mon, 27 Nov 2017 11:21:07 -0500
 *  parents
 *  children
 */