diff libs/sratoolkit.2.8.0-centos_linux64/schema/sra/pacbio.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/sra/pacbio.vschema	Mon Nov 27 11:21:07 2017 -0500
@@ -0,0 +1,307 @@
+/*===========================================================================
+ *
+ *                            PUBLIC DOMAIN NOTICE
+ *               National Center for Biotechnology Information
+ *
+ *  This software/database is a "United States Government Work" under the
+ *  terms of the United States Copyright Act.  It was written as part of
+ *  the author's official duties as a United States Government employee and
+ *  thus cannot be copyrighted.  This software/database is freely available
+ *  to the public for use. The National Library of Medicine and the U.S.
+ *  Government have not placed any restriction on its use or reproduction.
+ *
+ *  Although all reasonable efforts have been taken to ensure the accuracy
+ *  and reliability of the software and data, the NLM and the U.S.
+ *  Government do not and cannot warrant the performance or results that
+ *  may be obtained by using this software or data. The NLM and the U.S.
+ *  Government disclaim all warranties, express or implied, including
+ *  warranties of performance, merchantability or fitness for any particular
+ *  purpose.
+ *
+ *  Please cite the author in any work or product based on this material.
+ *
+ * ===========================================================================
+ *
+ */
+
+/*==========================================================================
+ * NCBI PacBio Fastq Sequence Read Archive schema
+ */
+version 1;
+
+include 'insdc/sra.vschema';
+include 'ncbi/sra.vschema';
+
+
+/*--------------------------------------------------------------------------
+ * NCBI:SRA:PacBio
+ *  Pacific Biosciences SRA Platform
+ *
+ * history:
+ *  1.0.2 - updated ancestry
+ *  1.0.3 - updated ancestry
+ */
+table NCBI:SRA:PacBio:common #1.0.3 = NCBI:SRA:tbl:sra #2.1.3
+{   // declares no members of its own; parent of NCBI:SRA:PacBio:smrt:fastq
+}
+
+/* history ( NCBI:SRA:PacBio:smrt:fastq ):
+ *  1.0.2 - updated ancestry
+ *  1.0.3 - updated ancestry
+ */
+table NCBI:SRA:PacBio:smrt:fastq #1.0.3
+    = NCBI:SRA:PacBio:common #1.0.3
+    , NCBI:tbl:base_space #2.0.3
+    , NCBI:tbl:phred_quality #2.0.3
+{
+    /* PLATFORM
+     *  platform name is always "PACBIO_SMRT"
+     */
+    ascii platform_name
+        = < ascii > echo < "PACBIO_SMRT" > ();
+
+    /* TRIMMED SEQUENCE
+     *  need to find the 0-based trim_start and trim_len
+     */
+    INSDC:coord:zero bio_start
+        = NCBI:SRA:bio_start ( out_read_start, out_read_type ); // derived from read starts and read types
+
+    INSDC:coord:zero trim_start = bio_start; // trimmed region begins at bio_start
+
+    U32 trim_left = ( U32 ) trim_start; // U32 form of trim_start, matching the < U32 > diff below
+    INSDC:coord:len trim_len = ( INSDC:coord:len )
+        < U32 > diff ( spot_len, trim_left ); // presumably spot_len - trim_left - confirm against vdb diff
+}
+
+/*--------------------------------------------------------------------------
+ * NCBI:SRA:PacBio:smrt:db
+ *  Pacific Biosciences SRA Platform - tables behind the smrt:db database
+ */
+table NCBI:SRA:PacBio:smrt:indelsubst #1 // QV/tag columns; inherited by NCBI:SRA:PacBio:smrt:basecalls
+{
+    // probability that the current base is an insertion
+    column < U8 > zip_encoding INSERTION_QV;
+
+    // probability of a deletion error following current base
+    // and identity of deleted base, if it exists
+    column < U8 > zip_encoding DELETION_QV;
+    column < INSDC:dna:text > zip_encoding DELETION_TAG; // the deleted base, as DNA text
+
+    // probability of a substitution error
+    // and most likely alternative base call
+    column < U8 > zip_encoding SUBSTITUTION_QV;
+    column < INSDC:dna:text > zip_encoding SUBSTITUTION_TAG; // the alternative base call, as DNA text
+};
+
+typedef U8 PacBio:hole:status; // element type of the HOLE_STATUS column in smrt:basecalls
+const PacBio:hole:status PacBio:hole:SEQUENCING  = 0; // named status codes 0..8 follow
+const PacBio:hole:status PacBio:hole:ANTIHOLE    = 1;
+const PacBio:hole:status PacBio:hole:FIDUCIAL    = 2;
+const PacBio:hole:status PacBio:hole:SUSPECT     = 3;
+const PacBio:hole:status PacBio:hole:ANTIMIRROR  = 4;
+const PacBio:hole:status PacBio:hole:FDZMW       = 5;
+const PacBio:hole:status PacBio:hole:FBZMW       = 6;
+const PacBio:hole:status PacBio:hole:ANTIBEAMLET = 7;
+const PacBio:hole:status PacBio:hole:OUTSIDEFOV  = 8; // other statuses go in HOLE_STATUS_VALUE
+
+/* history ( NCBI:SRA:PacBio:smrt:basecalls ):
+ *  1.0.1 - updated ancestry
+ *  1.0.2 - updated ancestry
+ */
+table NCBI:SRA:PacBio:smrt:basecalls #1.0.2
+    = INSDC:SRA:tbl:spotcoord #1
+    , NCBI:tbl:base_space #2.0.3
+    , NCBI:tbl:phred_quality #2.0.3
+    , NCBI:SRA:PacBio:smrt:indelsubst #1
+{
+    /* PLATFORM
+     *  platform name is always "PACBIO_SMRT"
+     */
+    ascii platform_name
+        = < ascii > echo < "PACBIO_SMRT" > ();
+
+    // basecalls will be routed to READ column
+    readonly column INSDC:dna:text BASECALL
+        = out_dna_text; // read-only view; no storage declared here
+
+    // quality value for each base
+    readonly column INSDC:quality:phred QUALITY_VALUE
+        = out_qual_phred; // read-only view; no storage declared here
+
+    // zero-based hole number
+    column < U32 > izip_encoding HOLE_NUMBER;
+
+    // hole status
+    column < PacBio:hole:status > zip_encoding HOLE_STATUS;
+
+    // optional column pair to describe hole status
+    // when/if it does not line up with our constants above
+    column < ascii > zip_encoding HOLE_STATUS_VALUE;
+    column < INSDC:coord:len > izip_encoding HOLE_STATUS_VALUE_LEN;
+
+    // hole ( X,Y ) pair will be split and sent to X and Y columns
+    column I16 [ 2 ] HOLE_XY
+        = < I16 > paste ( x_clip_I16, y_clip_I16 ); // pair built from the two I16 values below
+    I16 x_clip_I16 = cast ( out_x_coord ); // x coordinate cast to I16
+    I16 y_clip_I16 = cast ( out_y_coord ); // y coordinate cast to I16
+
+    I16 in_x16_coord = < I16 > cut < 0 > ( HOLE_XY ); // element 0 of the pair: x
+    I16 in_y16_coord = < I16 > cut < 1 > ( HOLE_XY ); // element 1 of the pair: y
+
+    INSDC:coord:val in_x_coord = cast ( in_x16_coord ); // cast from I16 to INSDC:coord:val
+    INSDC:coord:val in_y_coord = cast ( in_y16_coord ); // cast from I16 to INSDC:coord:val
+
+    // the number of bases in ZMW
+    readonly column INSDC:coord:len NUM_EVENT
+        = base_space_spot_len;
+};
+
+/* history ( NCBI:SRA:PacBio:smrt:sequence ):
+ *  1.0.1 - updated ancestry
+ *  1.0.2 - updated ancestry
+ */
+table NCBI:SRA:PacBio:smrt:sequence #1.0.2
+    = NCBI:SRA:PacBio:smrt:basecalls #1.0.2
+    , NCBI:SRA:tbl:sra_nopos #2.1.3
+{
+    // pulse information
+    column < U16 > izip_encoding PRE_BASE_FRAMES;
+    column < U16 > izip_encoding WIDTH_IN_FRAMES;
+
+    // spot to pulse map
+    default column INSDC:position:zero PULSE_INDEX
+        = .PULSE_INDEX; // default form: zero-based, taken from the physical column
+    readonly column INSDC:position:one PULSE_INDEX
+        = out_position; // read-only one-based form
+    INSDC:position:one out_position
+        = ( INSDC:position:one ) < INSDC:position:zero > sum < 1 > ( .PULSE_INDEX ); // presumably adds 1 - confirm
+
+    column NCBI:SRA:pos16 PULSE_INDEX
+        = cast ( .PULSE_INDEX ); // NCBI:SRA:pos16 form, cast from the physical column
+    NCBI:SRA:pos16 in_pulse_index16
+        = PULSE_INDEX;
+
+    INSDC:position:zero in_pulse_index32
+        = PULSE_INDEX // first alternative: PULSE_INDEX as INSDC:position:zero
+        | cast ( in_pulse_index16 ); // second alternative: cast up from the pos16 form
+
+    physical column < INSDC:position:zero > izip_encoding .PULSE_INDEX
+        = in_pulse_index32; // stored zero-based, whichever alternative supplied it
+
+    /* clip quality */
+    extern column < INSDC:coord:zero > izip_encoding CLIP_QUALITY_LEFT;
+    extern column < INSDC:coord:one > izip_encoding CLIP_QUALITY_RIGHT;
+
+    /* TRIMMED SEQUENCE
+     *  need to find the 0-based trim_start and trim_len
+     */
+    INSDC:coord:zero trim_start
+        = .CLIP_QUALITY_LEFT // first alternative: the stored left clip
+        | NCBI:SRA:bio_start ( out_read_start, out_read_type ); // otherwise derived from read starts/types
+
+    U32 trim_right
+        = ( U32 ) .CLIP_QUALITY_RIGHT // first alternative: the stored right clip ( INSDC:coord:one )
+        | spot_len; // otherwise the spot length
+
+    U32 trim_left = ( U32 ) trim_start; // U32 form of trim_start, matching the < U32 > diff below
+    INSDC:coord:len trim_len = ( INSDC:coord:len )
+        < U32 > diff ( trim_right, trim_left ); // presumably trim_right - trim_left - confirm against vdb diff
+};
+
+/* history ( NCBI:SRA:PacBio:smrt:cons ):
+ *  1.0.1 - updated ancestry
+ *  1.0.2 - updated ancestry
+ */
+table NCBI:SRA:PacBio:smrt:cons #1.0.2
+    = NCBI:SRA:PacBio:smrt:basecalls #1.0.2
+    , NCBI:SRA:tbl:sra #2.1.3
+{
+    // documented in both hdf5 and xsd as signed...
+    column < I32 > izip_encoding NUM_PASSES;
+
+    /* TRIMMED SEQUENCE
+     *  need to find the 0-based trim_start and trim_len
+     */
+    INSDC:coord:zero trim_start
+        = NCBI:SRA:bio_start ( out_read_start, out_read_type ); // no clip columns in this table
+
+    U32 trim_left = ( U32 ) trim_start; // U32 form of trim_start, matching the < U32 > diff below
+    INSDC:coord:len trim_len = ( INSDC:coord:len )
+        < U32 > diff ( spot_len, trim_left ); // presumably spot_len - trim_left - confirm against vdb diff
+};
+
+/* these encoding rules attempt to compress the channels individually,
+   although they may compress fine interleaved as they are... */
+physical
+F32 [ 4 ] NCBI:SRA:PacBio:smrt:F32_4ch_encoding #1.0 < U32 mantissa >
+{
+    decode
+    {
+        fzip_fmt cmp0 = split < 0 > ( @ ); // take compressed stream 0 from the rule input ( @ )
+        fzip_fmt cmp1 = split < 1 > ( @ ); // stream 1
+        fzip_fmt cmp2 = split < 2 > ( @ ); // stream 2
+        fzip_fmt cmp3 = split < 3 > ( @ ); // stream 3
+
+        F32 ch0 = funzip ( cmp0 ); // decompress each stream to F32
+        F32 ch1 = funzip ( cmp1 );
+        F32 ch2 = funzip ( cmp2 );
+        F32 ch3 = funzip ( cmp3 );
+
+        return < F32 > paste ( ch0, ch1, ch2, ch3 ); // recombine the four channels in index order
+    }
+
+    encode
+    {
+        F32 ch0 = < F32 > cut < 0 > ( @ ); // take channel 0 from the rule input ( @ )
+        F32 ch1 = < F32 > cut < 1 > ( @ ); // channel 1
+        F32 ch2 = < F32 > cut < 2 > ( @ ); // channel 2
+        F32 ch3 = < F32 > cut < 3 > ( @ ); // channel 3
+
+        fzip_fmt cmp0 = fzip < mantissa > ( ch0 ); // compress each channel with the rule's mantissa parameter
+        fzip_fmt cmp1 = fzip < mantissa > ( ch1 );
+        fzip_fmt cmp2 = fzip < mantissa > ( ch2 );
+        fzip_fmt cmp3 = fzip < mantissa > ( ch3 );
+
+        return merge ( cmp0, cmp1, cmp2, cmp3 ); // combine the four compressed streams in index order
+    }
+}
+
+table NCBI:SRA:PacBio:smrt:zmw_metrics #1 // stored as ZMW_METRICS in NCBI:SRA:PacBio:smrt:db
+{   // every F32 column passes 24 to its encoding; for F32_4ch_encoding that is the mantissa parameter
+    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > BASE_FRACTION;
+    column < F32 > fzip_encoding < 24 > BASE_IPD;
+    column < F32 > fzip_encoding < 24 > BASE_RATE;
+    column < F32 > fzip_encoding < 24 > BASE_WIDTH;
+    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_BASE_QV;
+    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_DEL_QV;
+    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_INS_QV;
+    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_SUB_QV;
+    column < F32 > fzip_encoding < 24 > LOCAL_BASE_RATE;
+    column < F32 > fzip_encoding < 24 > DARK_BASE_RATE;
+    column < F32 > fzip_encoding < 24 > HQ_RGN_START_TIME;
+    column < F32 > fzip_encoding < 24 > HQ_RGN_END_TIME;
+    column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > HQ_RGN_SNR;
+    column < I8 > zip_encoding PRODUCTIVITY; // the only non-F32 column in this table
+    column < F32 > fzip_encoding < 24 > READ_SCORE;
+    column < F32 > fzip_encoding < 24 > READ_BASE_QV;
+    column < F32 > fzip_encoding < 24 > READ_DEL_QV;
+    column < F32 > fzip_encoding < 24 > READ_INS_QV;
+    column < F32 > fzip_encoding < 24 > READ_SUB_QV;
+};
+
+table NCBI:SRA:PacBio:smrt:passes #1 // stored as PASSES in NCBI:SRA:PacBio:smrt:db
+{   // NOTE(review): presumably one row per pass - confirm against the loader
+    column < U8 > zip_encoding ADAPTER_HIT_BEFORE;
+    column < U8 > zip_encoding ADAPTER_HIT_AFTER;
+    column < U8 > zip_encoding PASS_DIRECTION;
+    column < I32 > izip_encoding PASS_NUM_BASES; // signed 32-bit, like NUM_PASSES in smrt:cons
+    column < I32 > izip_encoding PASS_START_BASE;
+};
+
+database NCBI:SRA:PacBio:smrt:db #1.0.1 // groups the four PacBio SMRT tables under fixed member names
+{   // members are requested as #1.0; sequence and cons are declared #1.0.2, passes and zmw_metrics #1
+    table NCBI:SRA:PacBio:smrt:sequence #1.0 SEQUENCE;
+    table NCBI:SRA:PacBio:smrt:cons #1.0 CONSENSUS;
+    table NCBI:SRA:PacBio:smrt:passes #1.0 PASSES;
+    table NCBI:SRA:PacBio:smrt:zmw_metrics #1.0 ZMW_METRICS;
+};