diff libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/reference.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/reference.vschema	Mon Nov 27 11:21:07 2017 -0500
@@ -0,0 +1,245 @@
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE
+*               National Center for Biotechnology Information
+*
+*  This software/database is a "United States Government Work" under the
+*  terms of the United States Copyright Act.  It was written as part of
+*  the author's official duties as a United States Government employee and
+*  thus cannot be copyrighted.  This software/database is freely available
+*  to the public for use. The National Library of Medicine and the U.S.
+*  Government have not placed any restriction on its use or reproduction.
+*
+*  Although all reasonable efforts have been taken to ensure the accuracy
+*  and reliability of the software and data, the NLM and the U.S.
+*  Government do not and cannot warrant the performance or results that
+*  may be obtained by using this software or data. The NLM and the U.S.
+*  Government disclaim all warranties, express or implied, including
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.
+*
+*  Please cite the author in any work or product based on this material.
+*
+* ===========================================================================
+*
+*/
+
+/*==========================================================================
+ * VDB Alignment types, functions and tables
+ */
+version 1;
+
+include 'vdb/vdb.vschema';
+include 'csra2/stats.vschema';
+
+
+/*--------------------------------------------------------------------------
+ * tables
+ */
+table NCBI:csra2:tbl:reference #1.0
+    = NCBI:csra2:tbl:read_stats #1
+{
+    /* CHUNK_SIZE
+     *  describes the maximum number of bases in any cell
+     */
+    extern column INSDC:coord:len CHUNK_SIZE;
+
+    /* CIRCULAR
+     *  true if the reference is circular
+     */
+    extern column bool CIRCULAR;
+
+    /* CANONICAL_NAME
+     *  this should be an accessioned proper name
+     */
+    extern column utf8 CANONICAL_NAME;
+
+    /* COMMON_NAME
+     *  this name may be ambiguous or missing entirely
+     */
+    extern column utf8 COMMON_NAME;
+
+    /* LOCAL_SEQUENCE
+     *  supports name overloading by type
+     */
+    extern default column INSDC:dna:text LOCAL_SEQUENCE
+    {
+        read = out_local_dna_text;
+        validate = < INSDC:dna:text > compare ( in_local_dna_text, out_local_dna_text );
+    }
+    extern column INSDC:4na:bin LOCAL_SEQUENCE = out_local_4na_bin;
+
+    /* PRIMARY_ALIGNMENT_IDS
+     * SECONDARY_ALIGNMENT_IDS
+     *  an index to rows in the PRIMARY_ALIGNMENT and
+     *  SECONDARY_ALIGNMENT tables having alignments 
+     *  STARTING within this chunk
+     *
+     *  the indicies MUST be sorted in clustered order,
+     *  meaning that they are in ascending numeric order
+     */
+    extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS;
+    extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS;
+
+    /* OVERLAP_REF_POS
+     *  min ( REF_POS ) for all alignments intersecting this chunk
+     *  but starting in a previous chunk, where the stored position
+     *  is in reference coordinates.
+     *
+     *  a value of 0 indicates that no alignments starting to
+     *  the left of this chunk also intersect with it.
+     */
+    extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS;
+
+    /* OVERLAP_REF_LEN
+     *  max ( REF_POS + REF_LEN - CHUNK_START ) % CHUNK_SIZE
+     *  for all alignments intersecting this chunk but starting
+     *  in a previous chunk.
+     *
+     *  indicates the amount of this chunk that is needed by
+     *  alignments not starting within chunk. so if a slice on
+     *  this reference were to start at 100 bases into this chunk,
+     *  for example, and the OVERLAP_REF_LEN were 100 or less, then
+     *  there are no alignments from prior chunks that need to be
+     *  considered.
+     */
+    extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN;
+
+    /* COVERAGE
+     *  graphing statistics for the chunk
+     */
+
+    // clipped at 255
+    extern column < U8 > izip_encoding CGRAPH_HIGH;
+    extern column < U8 > izip_encoding CGRAPH_LOW;
+
+    // count of the number of mismatches in the chunk
+    extern column < U32 > izip_encoding CGRAPH_MISMATCHES;
+
+    // count of the number of inserts and deletes in the chunk
+    extern column < U32 > izip_encoding CGRAPH_INDELS;
+
+
+    /* writing rules */
+    INSDC:dna:text in_local_dna_text
+        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( LOCAL_SEQUENCE );
+        ;
+    INSDC:4na:bin in_local_4na_bin
+        = < INSDC:4na:bin > range_validate < 0, 15 > ( LOCAL_SEQUENCE )
+        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_local_dna_text )
+        ;
+    INSDC:2na:bin in_local_2na_bin
+        = INSDC:SEQ:rand_4na_2na ( in_local_4na_bin )
+        ;
+    INSDC:4na:bin in_ambig_4na_bin
+        = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_local_4na_bin );
+        ;
+
+    INSDC:4na:bin in_stats_seq = in_local_4na_bin;
+
+    /* physical columns for sequence */
+    physical column INSDC:2na:packed .LOCAL_SEQUENCE
+        = ( INSDC:2na:packed ) pack ( in_local_2na_bin )
+        ;
+    physical column < INSDC:4na:bin > zip_encoding .LOCAL_AMBIGUITY
+        = < INSDC:4na:bin > trim < 0, 0 > ( in_ambig_4na_bin )
+        ;
+
+    /* reading rules */
+    INSDC:2na:packed out_local_2na_packed
+        = .LOCAL_SEQUENCE
+        ;
+    INSDC:2na:bin out_local_2na_bin
+        = ( INSDC:2na:bin ) unpack ( out_local_2na_packed )
+        ;
+    INSDC:4na:bin out_local_2na_4na_bin
+        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_local_2na_bin );
+        ;
+    INSDC:4na:bin out_local_4na_bin
+        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_local_2na_4na_bin, .LOCAL_AMBIGUITY )
+        ;
+    INSDC:dna:text out_local_dna_text
+        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_local_4na_bin )
+        ;
+
+
+    INSDC:coord:len in_local_read_len
+        = ( INSDC:coord:len ) row_len ( in_local_2na_bin )
+        ;
+    INSDC:SRA:xread_type in_local_read_type
+        = < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > ()
+        ;
+}
+
+
+/*--------------------------------------------------------------------------
+ * "views"
+ */
+table NCBI:csra2:view:reference #1.0
+    = NCBI:csra2:tbl:reference #1.0
+{
+    /* EXTERNAL
+     *  may need to be a function
+     *  it can test the CANONICAL_NAME as in cSRA.v1,
+     *  but if internal it can also check row_length of bases
+     */
+    readonly column bool EXTERNAL
+        = < bool > exists < false > ( .LOCAL_SEQUENCE )
+        | < bool > echo < true > ()
+        ;
+
+    /* SEQUENCE
+     *  available as text, 4na, x2na, 2na
+     */
+    default readonly column INSDC:dna:text SEQUENCE
+        = out_dna_text
+        ;
+    readonly column INSDC:4na:bin SEQUENCE
+        = out_4na_bin
+        ;
+    readonly column INSDC:4na:packed SEQUENCE
+        = ( INSDC:4na:packed ) pack ( out_4na_bin )
+        ;
+    readonly column INSDC:x2na:bin SEQUENCE
+        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin )
+        ;
+    readonly column INSDC:2na:bin SEQUENCE
+        = out_2na_bin
+        ;
+    readonly column INSDC:2na:packed SEQUENCE
+        = pack ( out_2na_bin )
+        ;
+
+    /* QUALITY
+     * This is fake column for compatibility
+     */
+    readonly column INSDC:quality:phred QUALITY
+        = out_qual_phred
+        ;
+
+    /* column aliases */
+    readonly column INSDC:coord:len MAX_SEQ_LEN = .CHUNK_SIZE;
+    readonly column ascii SEQ_ID = cast ( .CANONICAL_NAME );
+
+    /* sequence productions */
+    INSDC:4na:bin out_4na_bin
+        = out_local_4na_bin
+    // TODO:  | sub-select from external table
+        ;
+
+    INSDC:dna:text out_dna_text
+        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin )
+        ;
+
+    INSDC:2na:bin out_2na_bin
+        = INSDC:SEQ:rand_4na_2na ( out_4na_bin )
+        ;
+
+    /* quality productions */
+    INSDC:quality:phred out_qual_phred
+        = < INSDC:quality:phred > echo < 30 > ( SEQUENCE )
+        ;
+
+    INSDC:quality:phred in_stats_qual_phred = out_qual_phred;
+}
+