diff libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/read.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/read.vschema	Mon Nov 27 11:21:07 2017 -0500
@@ -0,0 +1,255 @@
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE
+*               National Center for Biotechnology Information
+*
+*  This software/database is a "United States Government Work" under the
+*  terms of the United States Copyright Act.  It was written as part of
+*  the author's official duties as a United States Government employee and
+*  thus cannot be copyrighted.  This software/database is freely available
+*  to the public for use. The National Library of Medicine and the U.S.
+*  Government have not placed any restriction on its use or reproduction.
+*
+*  Although all reasonable efforts have been taken to ensure the accuracy
+*  and reliability of the software and data, the NLM and the U.S.
+*  Government do not and cannot warrant the performance or results that
+*  may be obtained by using this software or data. The NLM and the U.S.
+*  Government disclaim all warranties, express or implied, including
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.
+*
+*  Please cite the author in any work or product based on this material.
+*
+* ===========================================================================
+*
+*/
+
+/*==========================================================================
+ * General read table which will be inherited by others
+ */
+version 1;
+
+include 'vdb/vdb.vschema';
+include 'insdc/insdc.vschema';
+include 'csra2/stats.vschema';
+
+
+/*--------------------------------------------------------------------------
+ * tables
+ */
+table NCBI:csra2:tbl:read #1.0 = NCBI:csra2:tbl:read_stats #1
+{
+    /* CHUNK_SZ
+     *  describes the maximum number of bases in any row
+     *
+     *  if present, allows a single sequence to be broken into multiple rows
+     *  where this value gives the limit on the number of bases in any row.
+     *
+     *  the sequence will be split across some number of rows, depending upon
+     *  the value of CHUNK_SZ. if length ( seq ) > CHUNK_SZ, then there will
+     *  be multiple rows, where all but the last will have a length of CHUNK_SZ.
+     *  the last ( or only ) row will have a length of length(seq)%CHUNK_SIZE.
+     */
+    extern column INSDC:coord:len CHUNK_SZ;
+
+
+    /* READ
+     *  base calls
+     */
+
+    // textual representation
+    extern default column INSDC:dna:text READ
+    {
+        read = out_dna_text;
+        validate = < INSDC:dna:text > compare ( in_dna_text, out_dna_text );
+    }
+
+    // 4na representation - unpacked
+    extern column INSDC:4na:bin READ
+        = out_4na_bin
+        ;
+
+
+    /* QUALITY
+     *  phred-score quality values
+     */
+    extern default column INSDC:quality:phred QUALITY
+        = out_qual_phred
+        ;
+    extern column INSDC:quality:text:phred_33 QUALITY
+        = ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( out_qual_phred )
+        ;
+    extern column INSDC:quality:text:phred_64 QUALITY
+        = ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( out_qual_phred )
+        ;
+
+    /* ---------------------------- optional columns ---------------------------- */
+
+    /* RD_ID
+     * RD_GROUP
+     *  reports group and id of current row
+     */
+    extern column I64 RD_ID;
+    extern column ascii RD_GROUP;
+
+    /* RD_FILTER
+     *  records filter value if used
+     */
+    extern column INSDC:SRA:read_filter RD_FILTER;
+
+
+    /* ---------------------------- input rules ---------------------------- */
+
+    // input text
+    INSDC:dna:text in_dna_text
+        = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ )
+        ;
+
+    // input 4na bin
+    INSDC:4na:bin in_4na_bin
+        = < INSDC:4na:bin > range_validate < 0, 15 > ( READ )
+        | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text )
+        ;
+
+    // input 2na bin
+    INSDC:2na:bin in_2na_bin
+        = INSDC:SEQ:rand_4na_2na ( in_4na_bin )
+        ;
+
+    // input 4na alt-read ( ambiguities )
+    INSDC:4na:bin in_alt_4na_bin
+        = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin )
+        ;
+
+    // feed the statistics
+    INSDC:4na:bin in_stats_seq = in_4na_bin;
+    
+    // quality
+    INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY;
+    INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY;
+
+    INSDC:quality:phred in_qual_phred
+        = QUALITY
+        | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 )
+        | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 )
+        ;
+
+    // feed the statistics
+    INSDC:quality:phred in_stats_qual_phred = in_qual_phred;
+
+    ascii in_stats_read_group
+        = in_stats_spot_group
+        | RD_GROUP
+        ;
+
+
+    /* ---------------------------- physical columns ---------------------------- */
+
+    physical column INSDC:2na:packed .READ
+        = ( INSDC:2na:packed ) pack ( in_2na_bin )
+        ;
+
+    physical column < INSDC:4na:bin > zip_encoding .ALTREAD
+        = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin )
+        ;
+
+    physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY
+        = in_qual_phred
+        ;
+
+
+    /* ---------------------------- output rules ---------------------------- */
+
+    // output 2na packed
+    INSDC:2na:packed out_2na_packed
+        = .READ
+        ;
+
+    // output 2na bin
+    INSDC:2na:bin out_2na_bin
+        = ( INSDC:2na:bin ) unpack ( out_2na_packed )
+        ;
+
+    // output 2na->4na bin
+    INSDC:4na:bin out_2na_4na_bin
+        = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin )
+        ;
+
+    // output 4na bin
+    INSDC:4na:bin out_4na_bin
+        = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD )
+        | out_2na_4na_bin
+        ;
+    
+    // output text
+    INSDC:dna:text out_dna_text
+        = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin )
+        ;
+    
+    // output quality
+    INSDC:quality:phred out_qual_phred
+        = .QUALITY
+        | < INSDC:quality:phred > echo < 30 > ( out_4na_bin )
+        ;
+}
+
+
+/*--------------------------------------------------------------------------
+ * views
+ */
+table NCBI:csra2:view:read #1.0 =
+    NCBI:csra2:tbl:read #1.0
+{
+    /* CHUNK_SIZE
+     *  describes the maximum number of bases in any row
+     *
+     *  if present, allows a single sequence to be broken into multiple rows
+     *  where this value gives the limit on the number of bases in any row.
+     *
+     *  the sequence will be split across some number of rows, depending upon
+     *  the value of CHUNK_SIZE. if length ( seq ) > CHUNK_SIZE, then there will
+     *  be multiple rows, where all but the last will have a length of CHUNK_SIZE.
+     *  the last ( or only ) row will have a length of length(seq)%CHUNK_SIZE.
+     */
+    readonly column INSDC:coord:len CHUNK_SIZE
+        = .CHUNK_SZ
+        | < INSDC:coord:len > echo < 0xFFFFFFFF > ()
+        ;
+    
+    /* READ
+     *  generate remaining 4 types
+     */
+    readonly column INSDC:4na:packed READ
+        = ( INSDC:4na:packed ) pack ( out_4na_bin )
+        ;
+    readonly column INSDC:x2na:bin READ
+        = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin )
+        ;
+    readonly column INSDC:2na:bin READ
+        = out_2na_bin
+        ;
+    readonly column INSDC:2na:packed READ
+        = out_2na_packed
+        ;
+
+    /* READ_ID
+     * READ_GROUP
+     *  reports group and id of current row
+     */
+    readonly column I64 READ_ID
+        = .RD_ID
+        | row_id ()
+        ;
+    readonly column ascii READ_GROUP
+        = .RD_GROUP
+        | < ascii > echo < '' > ()
+        ;
+
+    /* READ_FILTER
+     *  records filter value if used
+     */
+    readonly column INSDC:SRA:read_filter READ_FILTER
+        = .RD_FILTER
+        | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ()
+        ;
+}