diff libs/sratoolkit.2.8.0-centos_linux64/schema/sra/454.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/sra/454.vschema	Mon Nov 27 11:21:07 2017 -0500
@@ -0,0 +1,289 @@
+/*===========================================================================
+*
+*                            PUBLIC DOMAIN NOTICE
+*               National Center for Biotechnology Information
+*
+*  This software/database is a "United States Government Work" under the
+*  terms of the United States Copyright Act.  It was written as part of
+*  the author's official duties as a United States Government employee and
+*  thus cannot be copyrighted.  This software/database is freely available
+*  to the public for use. The National Library of Medicine and the U.S.
+*  Government have not placed any restriction on its use or reproduction.
+*
+*  Although all reasonable efforts have been taken to ensure the accuracy
+*  and reliability of the software and data, the NLM and the U.S.
+*  Government do not and cannot warrant the performance or results that
+*  may be obtained by using this software or data. The NLM and the U.S.
+*  Government disclaim all warranties, express or implied, including
+*  warranties of performance, merchantability or fitness for any particular
+*  purpose.
+*
+*  Please cite the author in any work or product based on this material.
+*
+* ===========================================================================
+*
+*/
+
+/*==========================================================================
+ * NCBI 454 Sequence Read Archive schema
+ */
+version 1;
+
+include 'ncbi/sra.vschema';
+include 'ncbi/spotname.vschema';
+include 'ncbi/clip.vschema';
+
+
+/*--------------------------------------------------------------------------
+ * functions
+ */
+
+/* dynamic_read_desc
+ *  uses inputs to determine read type and segmentation
+ *
+ *  "edit_distance" [ CONST, OPTIONAL ] - a tolerance figure for
+ *  linker matching, where 0 requires exact match, 5 is default.
+ *
+ *  "spot" [ DATA ] - bases for entire spot
+ *
+ *  "key" [ DATA, CONTROL ] - bases for key sequence. for version 1,
+ *  the first base following key is taken as biological start
+ *
+ *  "linker" [ DATA, CONTROL, OPTIONAL ] - if present, is used to separate
+ *  all bases following "key" into mate pair biological reads
+ *
+ *  returns a trio for each identified read, with read type, start and length
+ */
+typeset NCBI:SRA:_454_:drdparam_set { ascii, U8, INSDC:2na:packed }; // the types accepted for the spot, key and linker inputs below
+extern function
+U32 [ 3 ] NCBI:SRA:_454_:dynamic_read_desc #1 < * U32 edit_distance > // returns U32 [ 3 ] trios; edit_distance follows '*' and is the optional constant
+    ( NCBI:SRA:_454_:drdparam_set spot, NCBI:SRA:_454_:drdparam_set key // mandatory inputs: whole-spot bases and key bases
+      * NCBI:SRA:_454_:drdparam_set linker ); // linker follows '*' and is the optional input
+
+const U32 NCBI:SRA:_454_:dyn_read_type  = 0; // NOTE(review): presumably the index of the read type within each returned trio - confirm
+const U32 NCBI:SRA:_454_:dyn_read_start = 1; // NOTE(review): presumably the index of the read start within each returned trio - confirm
+const U32 NCBI:SRA:_454_:dyn_read_len   = 2; // NOTE(review): presumably the index of the read length within each returned trio - confirm
+
+
+/* tokenize_spot_name
+ *  scans name on input
+ *  tokenizes into parts
+ */
+extern function NCBI:SRA:spot_name_token // return type; not declared in this file (presumably comes from an included schema - confirm)
+    NCBI:SRA:_454_:tokenize_spot_name #1 ( ascii name ); // one mandatory ascii input: the spot name to scan; no optional arguments
+
+
+/*--------------------------------------------------------------------------
+ * NCBI:SRA:_454_:common
+ *  Roche 454 SRA Platform
+ *
+ * history:
+ *  1.0.1 - explicitly base upon sra #1.0.1
+ *  1.0.2 - bring in clip processing from external table
+ *  1.0.3 - base explicitly upon sra #1.0.2, clip #1.0.1
+ *  1.0.4 - base explicitly upon sra #1.0.3, clip #1.0.2
+ */
+table NCBI:SRA:_454_:common #1.0.4 = INSDC:SRA:tbl:sra #1.0.3, NCBI:SRA:tbl:clip #1.0.2
+{
+    /* PLATFORM
+     *  platform name is always 454: a constant produced by echo, not read from any column
+     */
+    ascii platform_name
+        = < ascii > echo < "454" > ();
+
+    /* 454 TECHNICAL SEQUENCES - each column below reads from an out_* production that this table does not define;
+     * each in_* production maps 'acgtn.' onto 'ACGTNN' (lower case to upper case, '.' to 'N') */
+    column INSDC:dna:text FLOW_CHARS = out_flow_chars;
+    INSDC:dna:text in_flow_chars
+        = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( FLOW_CHARS );
+    column INSDC:dna:text KEY_SEQUENCE = out_key_sequence;
+    INSDC:dna:text in_key_sequence
+        = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( KEY_SEQUENCE );
+    column INSDC:dna:text LINKER_SEQUENCE = out_linker_sequence;
+    INSDC:dna:text in_linker_sequence
+        = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( LINKER_SEQUENCE );
+
+    // binary technical sequences: INSDC:x2na:bin forms of the three out_* text productions (CHARSET mapped to BINSET)
+    INSDC:x2na:bin out_flow_bin
+        = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_flow_chars );
+    INSDC:x2na:bin out_key_bin
+        = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_key_sequence );
+    INSDC:x2na:bin out_linker_bin
+        = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_linker_sequence );
+
+    /* SIGNAL
+     *  single channel integer
+     */
+    column NCBI:isamp1 SIGNAL = out_signal;
+    NCBI:isamp1 out_signal = .SIGNAL; // read directly from the physical column; .SIGNAL itself is not defined in this table
+
+
+	/* INSDC:tbl:sequence inherited productions
+	 *  cs_native
+	 *  out_cs_key
+	 *  in_dna_text
+	 *  out_2cs_bin
+	 *  out_2na_bin
+	 *  out_4na_bin
+	 *  out_dna_text
+	 *  out_x2cs_bin
+	 *  out_x2na_bin
+	 *  out_2cs_packed
+	 *  out_2na_packed
+	 *  out_4na_packed
+	 *  out_color_text
+	 *  out_qual_phred
+	 *  out_color_matrix
+	 */
+
+	/* INSDC:SRA:tbl:spotname inherited productions
+	 *  out_x_coord
+	 *  out_y_coord
+	 *  out_name_fmt
+	 *  out_spot_name
+	 *  spot_ids_found
+	 */
+
+	/* INSDC:SRA:tbl:spotdesc inherited productions
+	 *  trim_len
+	 *  out_label
+	 *  out_nreads
+	 *  trim_start
+	 *  out_read_len
+	 *  out_label_len
+	 *  out_rd_filter
+	 *  out_read_type
+	 *  out_read_start
+	 *  out_label_start
+	 *  static_fixed_spot_len
+	 */
+
+	/* INSDC:SRA:tbl:stats inherited productions
+	 *  base_count
+	 *  spot_count
+	 *  max_spot_id
+	 *  min_spot_id
+	 *  in_stats_bin
+	 *  bio_base_count
+	 */
+
+	/* NCBI:tbl:n_encoding inherited productions
+	 *  read_unpack
+	 */
+
+	/* NCBI:SRA:_454_:common productions - not defined in this table; NCBI:SRA:_454_:tbl:v2 supplies each one
+	 *  .SIGNAL
+	 *  .CLIP_ADAPTER_LEFT
+	 *  .CLIP_QUALITY_LEFT
+	 *  .CLIP_ADAPTER_RIGHT
+	 *  .CLIP_QUALITY_RIGHT
+	 *  out_flow_chars
+	 *  out_key_sequence
+	 *  out_linker_sequence
+	 */
+};
+
+
+/*--------------------------------------------------------------------------
+ * NCBI:SRA:_454_:tbl:v2
+ *  Roche 454 SRA Platform
+ *
+ * history:
+ *  1.0.1 - explicitly base upon sra #1.0.1 and related changes
+ *  1.0.2 - respond to change to 454:common base table #1.0.2
+ */
+
+// encodings are declared to have their own version
+// so that they may be changed over time independently
+physical INSDC:coord:one NCBI:SRA:_454_:encoding:CLIP #2 // applied to the four .CLIP_* physical columns of NCBI:SRA:_454_:tbl:v2
+{
+    decode { return ( INSDC:coord:one ) iunzip ( @ ); } // read path: iunzip the stored data, then cast to INSDC:coord:one
+    encode { return izip ( @ ); } // write path: the incoming value is passed through izip before storage
+}
+
+physical NCBI:isamp1 NCBI:SRA:_454_:encoding:SIGNAL #2 // applied to the .SIGNAL physical column of NCBI:SRA:_454_:tbl:v2
+{
+    decode { return ( NCBI:isamp1 ) iunzip ( @ ); } // read path: iunzip the stored data, then cast to NCBI:isamp1
+    encode { return izip ( @ ); } // write path: the incoming value is passed through izip before storage
+}
+
+physical INSDC:position:one NCBI:SRA:_454_:encoding:POSITION #2 // applied to the .POSITION physical column of NCBI:SRA:_454_:tbl:v2
+{
+    decode
+    {
+        I32 pos_1st_deriv = iunzip ( @ ); // the stored form is the izip output of encode, i.e. the derivative series
+        return ( INSDC:position:one ) < I32 > integral ( pos_1st_deriv ); // integral undoes the deriv applied in encode, then cast back
+    }
+    encode
+    { 
+        I32 pos_1st_deriv = < I32 > deriv ( @ ); // NOTE(review): presumably differences between successive positions - confirm against deriv's definition
+        return izip ( pos_1st_deriv ); // the derivative series, not the raw positions, is what gets izip'ed and stored
+    }
+}
+
+/* normalized v2 table
+ *
+ * history:
+ *  1.0.6 - base upon updated ancestry
+ *  1.0.7 - base upon updated ancestry
+ */
+table NCBI:SRA:_454_:tbl:v2 #1.0.7
+    = NCBI:SRA:tbl:sra_nopos #2.1.3
+    , NCBI:tbl:base_space #2.0.3
+    , NCBI:tbl:phred_quality #2.0.3
+    , NCBI:SRA:_454_:common #1.0.4
+{
+    /* NAME tokenizing and coordinates
+     *  most work happens in skeyname table
+     *  we still obtain REGION from name (via the 454 tokenizer declared earlier in this file)
+     */
+    readonly column INSDC:coord:val REGION = ( INSDC:coord:val ) // readonly: computed from the name and its tokens on read
+        NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok ); // NOTE(review): name_token:T presumably selects the region field - confirm
+    NCBI:SRA:spot_name_token out_spot_name_tok
+        = NCBI:SRA:_454_:tokenize_spot_name ( _out_name ); // read side: tokens of _out_name, which is defined outside this file
+
+    NCBI:SRA:spot_name_token in_spot_name_tok
+        = NCBI:SRA:_454_:tokenize_spot_name ( NAME ); // write side: tokens of the NAME column
+
+    // special sequences
+    INSDC:dna:text out_flow_chars
+        = .FLOW_CHARS
+        | < INSDC:dna:text > echo < 'TACG' > ( .SIGNAL )
+        | < INSDC:dna:text > echo < 'TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG' > ();
+
+    physical column < INSDC:dna:text > zip_encoding
+        .FLOW_CHARS = in_flow_chars; // stores in_flow_chars, the mapped form of FLOW_CHARS from NCBI:SRA:_454_:common
+
+    INSDC:dna:text out_key_sequence
+        = .KEY_SEQUENCE
+        | < INSDC:dna:text > echo < 'TCAG' > (); // constant 'TCAG' alternative; NOTE(review): presumably used when .KEY_SEQUENCE is absent - confirm
+
+    physical column < INSDC:dna:text > zip_encoding
+        .KEY_SEQUENCE = in_key_sequence; // stores in_key_sequence, the mapped form of KEY_SEQUENCE from NCBI:SRA:_454_:common
+
+    INSDC:dna:text out_linker_sequence = .LINKER_SEQUENCE; // single source: no constant alternative, unlike the key sequence
+    physical column < INSDC:dna:text > zip_encoding
+        .LINKER_SEQUENCE = in_linker_sequence; // stores in_linker_sequence, the mapped form of LINKER_SEQUENCE from NCBI:SRA:_454_:common
+
+// linker needs to be representable by its own table
+// either in metadata or somewhere else
+
+    // position stored as normal 1-based coordinate
+    INSDC:position:one out_position = .POSITION;
+    physical column NCBI:SRA:_454_:encoding:POSITION #2
+        .POSITION = POSITION; // POSITION #2 encoding: deriv + izip on write, iunzip + integral on read
+
+    // clips: all four columns share the CLIP #2 encoding (izip on write, iunzip on read)
+    physical column NCBI:SRA:_454_:encoding:CLIP #2
+        .CLIP_ADAPTER_LEFT = CLIP_ADAPTER_LEFT;
+    physical column NCBI:SRA:_454_:encoding:CLIP #2
+        .CLIP_ADAPTER_RIGHT = CLIP_ADAPTER_RIGHT;
+    physical column NCBI:SRA:_454_:encoding:CLIP #2
+        .CLIP_QUALITY_LEFT = CLIP_QUALITY_LEFT;
+    physical column NCBI:SRA:_454_:encoding:CLIP #2
+        .CLIP_QUALITY_RIGHT = CLIP_QUALITY_RIGHT;
+
+    // signal: stored through the SIGNAL #2 encoding (izip on write, iunzip on read)
+    physical column NCBI:SRA:_454_:encoding:SIGNAL #2
+        .SIGNAL = SIGNAL;
+};