Mercurial > repos > charles_s_test > seqsero2

/*===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
*/

/*==========================================================================
 * Sequence schema
 */
version 1;

include 'vdb/vdb.vschema';
include 'ncbi/seq.vschema';


/* cmp_base_space
 *  table representing compressed reads in base space,
 *  where the bases are only stored for unaligned reads
 *
 *  Parents: INSDC:tbl:sequence #1.0.1 and NCBI:tbl:dcmp_base_space #1.
 *
 *  Data flow declared in this table:
 *    CMP_READ ( written in any supported representation )
 *      -> in_cmp_* productions
 *      -> physical .CMP_READ ( packed 2na ) and .CMP_ALTREAD ( 4na )
 *      -> out_cmp_* productions
 *      -> CMP_READ ( read in any supported representation )
 *
 *  The out_dcmp_* productions at the end are derived from
 *  out_dcmp_4na_bin, which is not defined in this table.
 */
table NCBI:align:tbl:cmp_base_space #1
    = INSDC:tbl:sequence #1.0.1
    , NCBI:tbl:dcmp_base_space #1
{
    /* CMP_READ
     *  read compressed against a reference sequence
     *
     *  The same column name is declared once per representation;
     *  each declaration reads from the matching out_cmp_* production.
     */

    // default is IUPAC character representation
    // reads come from out_cmp_dna_text; "validate" compares the processed
    // input text ( in_cmp_dna_text ) against that same output production
    extern default column INSDC:dna:text CMP_READ
    {
        read = out_cmp_dna_text;
        validate = < INSDC:dna:text > compare ( in_cmp_dna_text, out_cmp_dna_text );
    }

    // 4na representation
    extern column INSDC:4na:bin CMP_READ = out_cmp_4na_bin;
    extern column INSDC:4na:packed CMP_READ = out_cmp_4na_packed;

    // x2na representation - 2na with ambiguity
    extern column INSDC:x2na:bin CMP_READ = out_cmp_x2na_bin;

    // 2na representation - 2na with no ambiguity
    extern column INSDC:2na:bin CMP_READ = out_cmp_2na_bin;
    extern column INSDC:2na:packed CMP_READ = out_cmp_2na_packed;


    /* input processing rules
     *  each in_cmp_* production lists one or more alternative sources
     *  separated by '|'
     *  NOTE(review): presumably the first alternative that can be
     *  resolved is the one used - confirm against the VDB schema docs
     */

    // compressed input text
    // normalizes the text: '.' becomes 'N' and each lower-case IUPAC
    // letter becomes its upper-case equivalent
    INSDC:dna:text in_cmp_dna_text
        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( CMP_READ );

    // compressed input 4na bin
    // sources: CMP_READ written as 4na bin ( range-checked to 0..15 ),
    // unpacked 4na packed input, text mapped through the 4na
    // CHARSET -> BINSET tables, or x2na mapped with [ 1, 2, 4, 8, 15 ]
    INSDC:4na:bin in_cmp_4na_bin
        = < INSDC:4na:bin > range_validate < 0, 15 > ( CMP_READ )
        | ( INSDC:4na:bin ) unpack ( in_cmp_4na_packed )
        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_cmp_dna_text )
        | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_cmp_x2na_bin );

    // compressed input 4na packed
    INSDC:4na:packed in_cmp_4na_packed = CMP_READ;

    // compressed input x2na bin
    // sources: CMP_READ written as x2na bin ( range-checked to 0..4 ),
    // or 4na bin mapped through a 16-entry table
    // NOTE(review): assuming INSDC:4na:map:BINSET is 0..15 in order, the
    // table sends 4na 1, 2, 4, 8 to 0, 1, 2, 3 and every other code to 4
    INSDC:x2na:bin in_cmp_x2na_bin
        = < INSDC:x2na:bin > range_validate < 0, 4 > ( CMP_READ )
        | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_cmp_4na_bin );

    // compressed input 2na bin
    // sources: CMP_READ written as 2na bin ( range-checked to 0..3 ),
    // unpacked 2na packed input, or 4na bin converted by
    // INSDC:SEQ:rand_4na_2na ( defined outside this file )
    INSDC:2na:bin in_cmp_2na_bin
        = < INSDC:2na:bin > range_validate < 0, 3 > ( CMP_READ )
        | ( INSDC:2na:bin ) unpack ( in_cmp_2na_packed )
        | INSDC:SEQ:rand_4na_2na ( in_cmp_4na_bin );

    // compressed input 2na packed
    INSDC:2na:packed in_cmp_2na_packed = CMP_READ;

    // input 4na alt-read ( ambiguities )
    // NOTE(review): assuming INSDC:4na:map:BINSET is 0..15 in order, the
    // table sends 4na 1, 2, 4, 8 to 0, sends 0 to 15 and leaves all
    // other codes unchanged
    INSDC:4na:bin in_cmp_alt_4na_bin
        = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_cmp_4na_bin );

    // preparing a feed into stats column
    // in_cmp_stats_bin is not referenced again in this table
    U8 in_cmp_stats_bin = in_cmp_2na_bin;


    /* physical columns
     */

    // stored bases: packed 2na, taken directly from packed input
    // or produced by packing in_cmp_2na_bin
    physical column INSDC:2na:packed .CMP_READ
        = in_cmp_2na_packed
        | ( INSDC:2na:packed ) pack ( in_cmp_2na_bin );

    // stored ambiguity data: in_cmp_alt_4na_bin passed through
    // trim < 0, 0 > and written with zip_encoding
    // NOTE(review): the meaning of the trim arguments is defined outside
    // this file - confirm before relying on it
    physical column < INSDC:4na:bin > zip_encoding .CMP_ALTREAD
        = < INSDC:4na:bin > trim < 0, 0 > ( in_cmp_alt_4na_bin );


    /* output processing rules
     *  everything below is derived from the two physical columns
     */

    // output 2na packed
    INSDC:2na:packed out_cmp_2na_packed = .CMP_READ;

    // unambiguous unpacked 2na
    INSDC:2na:bin out_cmp_2na_bin
        = ( INSDC:2na:bin ) unpack ( out_cmp_2na_packed );

    // output x2na bin
    // uses the same 16-entry 4na -> x2na table as in_cmp_x2na_bin
    INSDC:x2na:bin out_cmp_x2na_bin
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_cmp_4na_bin );

    // output 2na->4na bin
    // each 2na value is mapped to one of the 4na codes 1, 2, 4, 8
    INSDC:4na:bin out_cmp_2na_4na_bin
        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_cmp_2na_bin );

    // output 4na bin
    // bit-wise OR of the 2na-derived codes with .CMP_ALTREAD;
    // the second alternative uses the 2na-derived codes alone
    // NOTE(review): presumably the fallback applies when .CMP_ALTREAD
    // is not available - confirm
    INSDC:4na:bin out_cmp_4na_bin
        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_cmp_2na_4na_bin, .CMP_ALTREAD )
        | out_cmp_2na_4na_bin;

    // synthesized packed 4na
    INSDC:4na:packed out_cmp_4na_packed
        = ( INSDC:4na:packed ) pack ( out_cmp_4na_bin );

    // output text
    // 4na codes mapped back to characters through the 4na
    // BINSET -> CHARSET tables
    INSDC:dna:text out_cmp_dna_text
        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_cmp_4na_bin );


    /* decompressed sequences
     *   source is out_dcmp_4na_bin - a virtual production
     */

    // synthesize x2na_bin, 2na_bin and 2na_packed
    // 4na -> x2na uses the same 16-entry table as above;
    // x2na -> 2na keeps 0..3 and maps 4 to 0
    INSDC:x2na:bin out_dcmp_x2na_bin
        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin );
    INSDC:2na:bin out_dcmp_2na_bin
        = < INSDC:x2na:bin, INSDC:2na:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2na_bin );
    INSDC:2na:packed out_dcmp_2na_packed
        = ( INSDC:2na:packed ) pack ( out_dcmp_2na_bin );


	/* INSDC:tbl:sequence inherited productions
	 *  cs_native
	 *  out_cs_key
	 *  out_signal
	 *  out_2cs_bin
	 *  out_2na_bin
	 *  out_4na_bin
	 *  out_dna_text
	 *  out_x2cs_bin
	 *  out_x2na_bin
	 *  out_2cs_packed
	 *  out_2na_packed
	 *  out_4na_packed
	 *  out_color_text
	 *  out_color_matrix
	 */

	/* NCBI:tbl:dcmp_base_space inherited productions
	 *  out_dcmp_4na_bin
	 */
}


/* cmp_color_space
 *  table representing compressed reads in color space,
 *  where the colors are only stored for unaligned reads
 *
 *  Parents: INSDC:tbl:sequence #1.0.1 and NCBI:tbl:dcmp_color_space #1.
 *
 *  Data flow declared in this table:
 *    CMP_CSREAD ( written in any supported representation )
 *      -> in_cmp_* productions
 *      -> physical .CMP_CSREAD ( packed 2cs ) and .CMP_ALTCSREAD ( x2cs )
 *      -> out_cmp_* productions
 *      -> CMP_CSREAD ( read in any supported representation )
 *
 *  The out_dcmp_* productions at the end are derived from
 *  out_dcmp_x2cs_bin, which is not defined in this table.
 */
table NCBI:align:tbl:cmp_color_space #1 =
    INSDC:tbl:sequence #1.0.1, NCBI:tbl:dcmp_color_space #1
{
    /* CMP_CSREAD
     *  read compressed against a reference sequence
     *
     *  The same column name is declared once per representation;
     *  each declaration reads from the matching out_cmp_* production.
     */

    // default is color-space text representation
    extern default column INSDC:color:text CMP_CSREAD = out_cmp_color_text;

    // x2cs representation - 2cs with ambiguity
    extern column INSDC:x2cs:bin CMP_CSREAD = out_cmp_x2cs_bin;

    // 2cs representation - 2cs with no ambiguity
    extern column INSDC:2cs:bin CMP_CSREAD = out_cmp_2cs_bin;
    extern column INSDC:2cs:packed CMP_CSREAD = out_cmp_2cs_packed;


    /* input processing rules
     *  each in_cmp_* production lists one or more alternative sources
     *  separated by '|'
     *  NOTE(review): presumably the first alternative that can be
     *  resolved is the one used - confirm against the VDB schema docs
     */

    // compressed input text
    // taken from CMP_CSREAD as written, with no character mapping
    INSDC:color:text in_cmp_color_text = CMP_CSREAD;

    // compressed input x2cs bin
    // sources: CMP_CSREAD written as x2cs bin ( range-checked to 0..4 ),
    // or text mapped through the x2cs CHARSET -> BINSET tables
    INSDC:x2cs:bin in_cmp_x2cs_bin
        = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CMP_CSREAD )
        | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_cmp_color_text );

    // compressed input 2cs bin
    // sources: CMP_CSREAD written as 2cs bin ( range-checked to 0..3 ),
    // unpacked 2cs packed input, or x2cs mapped with [ 0, 1, 2, 3, 0 ]
    // NOTE(review): assuming INSDC:x2cs:map:BINSET is 0..4 in order, the
    // table keeps 0..3 and sends 4 to 0
    INSDC:2cs:bin in_cmp_2cs_bin
        = < INSDC:2cs:bin > range_validate < 0, 3 > ( CMP_CSREAD )
        | ( INSDC:2cs:bin ) unpack ( in_cmp_2cs_packed )
        | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin );

    // compressed input 2cs packed
    INSDC:2cs:packed in_cmp_2cs_packed = CMP_CSREAD;

    // compressed input x2cs alt-read ( ambiguities )
    // NOTE(review): assuming INSDC:x2cs:map:BINSET is 0..4 in order, the
    // table sends 0..3 to 0 and leaves 4 as 4
    INSDC:x2cs:bin in_cmp_alt_x2cs_bin
        = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin );

    // preparing a feed into stats column
    // in_cmp_stats_bin is not referenced again in this table
    U8 in_cmp_stats_bin = in_cmp_2cs_bin;


    /* physical columns
     */

    // stored colors: packed 2cs, taken directly from packed input
    // or produced by packing in_cmp_2cs_bin
    physical column INSDC:2cs:packed .CMP_CSREAD
        = in_cmp_2cs_packed
        | ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin );

    // stored ambiguity data: in_cmp_alt_x2cs_bin passed through
    // trim < 0, 0 > and written with zip_encoding
    // NOTE(review): the meaning of the trim arguments is defined outside
    // this file - confirm before relying on it
    physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD
        = < INSDC:x2cs:bin > trim < 0, 0 > ( in_cmp_alt_x2cs_bin );


    /* output processing rules
     *  everything below is derived from the two physical columns
     */

    // compressed output 2cs packed
    INSDC:2cs:packed out_cmp_2cs_packed = .CMP_CSREAD;

    // unambiguous unpacked 2cs
    INSDC:2cs:bin out_cmp_2cs_bin
        = ( INSDC:2cs:bin ) unpack ( out_cmp_2cs_packed );

    // unpacked 2cs with ambiguity
    // bit-wise OR ( computed as U8, then cast to x2cs ) of the 2cs values
    // with .CMP_ALTCSREAD; the second alternative casts the 2cs values alone
    // NOTE(review): presumably the fallback applies when .CMP_ALTCSREAD
    // is not available - confirm
    INSDC:x2cs:bin out_cmp_x2cs_bin
        = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, .CMP_ALTCSREAD )
        | ( INSDC:x2cs:bin ) out_cmp_2cs_bin;

    // output text
    // x2cs values mapped back to characters through the x2cs
    // BINSET -> CHARSET tables
    INSDC:color:text out_cmp_color_text
        = < INSDC:x2cs:bin, INSDC:color:text > map <  INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin );


    /* decompressed sequences
     *   source is out_dcmp_x2cs_bin - a virtual production
     */

    // synthesize 2cs_bin and 2cs_packed
    // x2cs -> 2cs keeps 0..3 and maps 4 to 0
    INSDC:2cs:bin out_dcmp_2cs_bin
        = < INSDC:x2cs:bin, INSDC:2cs:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2cs_bin );
    INSDC:2cs:packed out_dcmp_2cs_packed
        = ( INSDC:2cs:packed ) pack ( out_dcmp_2cs_bin );


	/* INSDC:tbl:sequence inherited productions
	 *  cs_native
	 *  out_cs_key
	 *  out_signal
	 *  out_2cs_bin
	 *  out_2na_bin
	 *  out_4na_bin
	 *  out_dna_text
	 *  out_x2cs_bin
	 *  out_x2na_bin
	 *  out_2cs_packed
	 *  out_2na_packed
	 *  out_4na_packed
	 *  out_color_text
	 *  out_qual_phred
	 *  out_color_matrix
	 *  out_qual_text_phred_33
	 *  out_qual_text_phred_64
	 */

	/* NCBI:tbl:dcmp_color_space inherited productions
	 *  out_dcmp_x2cs_bin
	 */
}
author	charles_s_test
date	Mon, 27 Nov 2017 11:21:07 -0500
parents
children