Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/454.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * NCBI 454 Sequence Read Archive schema | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'ncbi/sra.vschema'; | |
| 33 include 'ncbi/spotname.vschema'; | |
| 34 include 'ncbi/clip.vschema'; | |
| 35 | |
| 36 | |
| 37 /*-------------------------------------------------------------------------- | |
| 38 * functions | |
| 39 */ | |
| 40 | |
| 41 /* dynamic_read_desc | |
| 42 * uses inputs to determine read type and segmentation | |
| 43 * | |
| 44 * "edit_distance" [ CONST, OPTIONAL ] - a tolerance figure for | |
| 45 * linker matching, where 0 requires exact match, 5 is default. | |
| 46 * | |
| 47 * "spot" [ DATA ] - bases for entire spot | |
| 48 * | |
| 49 * "key" [ DATA, CONTROL ] - bases for key sequence. for version 1, | |
| 50 * the first base following key is taken as biological start | |
| 51 * | |
| 52 * "linker" [ DATA, CONTROL, OPTIONAL ] - if present, is used to separate | |
| 53 * all bases following "key" into mate pair biological reads | |
| 54 * | |
| 55 * returns a trio for each identified read, with read type, start and length | |
| 56 */ | |
| 57 typeset NCBI:SRA:_454_:drdparam_set { ascii, U8, INSDC:2na:packed }; | |
| 58 extern function | |
| 59 U32 [ 3 ] NCBI:SRA:_454_:dynamic_read_desc #1 < * U32 edit_distance > | |
| 60 ( NCBI:SRA:_454_:drdparam_set spot, NCBI:SRA:_454_:drdparam_set key | |
| 61 * NCBI:SRA:_454_:drdparam_set linker ); | |
| 62 | |
| 63 const U32 NCBI:SRA:_454_:dyn_read_type = 0; | |
| 64 const U32 NCBI:SRA:_454_:dyn_read_start = 1; | |
| 65 const U32 NCBI:SRA:_454_:dyn_read_len = 2; | |
| 66 | |
| 67 | |
| 68 /* tokenize_spot_name | |
| 69 * scans name on input | |
| 70 * tokenizes into parts | |
| 71 */ | |
| 72 extern function NCBI:SRA:spot_name_token | |
| 73 NCBI:SRA:_454_:tokenize_spot_name #1 ( ascii name ); | |
| 74 | |
| 75 | |
| 76 /*-------------------------------------------------------------------------- | |
| 77 * NCBI:SRA:_454_:common | |
| 78 * Roche 454 SRA Platform | |
| 79 * | |
| 80 * history: | |
| 81 * 1.0.1 - explicitly base upon sra #1.0.1 | |
| 82 * 1.0.2 - bring in clip processing from external table | |
| 83 * 1.0.3 - base explicitly upon sra #1.0.2, clip #1.0.1 | |
| 84 * 1.0.4 - base explicitly upon sra #1.0.3, clip #1.0.2 | |
| 85 */ | |
| 86 table NCBI:SRA:_454_:common #1.0.4 = INSDC:SRA:tbl:sra #1.0.3, NCBI:SRA:tbl:clip #1.0.2 | |
| 87 { | |
| 88 /* PLATFORM | |
| 89 * platform name is always 454 | |
| 90 */ | |
| 91 ascii platform_name | |
| 92 = < ascii > echo < "454" > (); | |
| 93 | |
| 94 /* 454 TECHNICAL SEQUENCES | |
| 95 */ | |
| 96 column INSDC:dna:text FLOW_CHARS = out_flow_chars; | |
| 97 INSDC:dna:text in_flow_chars | |
| 98 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( FLOW_CHARS ); | |
| 99 column INSDC:dna:text KEY_SEQUENCE = out_key_sequence; | |
| 100 INSDC:dna:text in_key_sequence | |
| 101 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( KEY_SEQUENCE ); | |
| 102 column INSDC:dna:text LINKER_SEQUENCE = out_linker_sequence; | |
| 103 INSDC:dna:text in_linker_sequence | |
| 104 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( LINKER_SEQUENCE ); | |
| 105 | |
| 106 // binary technical sequences | |
| 107 INSDC:x2na:bin out_flow_bin | |
| 108 = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_flow_chars ); | |
| 109 INSDC:x2na:bin out_key_bin | |
| 110 = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_key_sequence ); | |
| 111 INSDC:x2na:bin out_linker_bin | |
| 112 = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_linker_sequence ); | |
| 113 | |
| 114 /* SIGNAL | |
| 115 * single channel integer | |
| 116 */ | |
| 117 column NCBI:isamp1 SIGNAL = out_signal; | |
| 118 NCBI:isamp1 out_signal = .SIGNAL; | |
| 119 | |
| 120 | |
| 121 /* INSDC:tbl:sequence inherited productions | |
| 122 * cs_native | |
| 123 * out_cs_key | |
| 124 * in_dna_text | |
| 125 * out_2cs_bin | |
| 126 * out_2na_bin | |
| 127 * out_4na_bin | |
| 128 * out_dna_text | |
| 129 * out_x2cs_bin | |
| 130 * out_x2na_bin | |
| 131 * out_2cs_packed | |
| 132 * out_2na_packed | |
| 133 * out_4na_packed | |
| 134 * out_color_text | |
| 135 * out_qual_phred | |
| 136 * out_color_matrix | |
| 137 */ | |
| 138 | |
| 139 /* INSDC:SRA:tbl:spotname inherited productions | |
| 140 * out_x_coord | |
| 141 * out_y_coord | |
| 142 * out_name_fmt | |
| 143 * out_spot_name | |
| 144 * spot_ids_found | |
| 145 */ | |
| 146 | |
| 147 /* INSDC:SRA:tbl:spotdesc inherited productions | |
| 148 * trim_len | |
| 149 * out_label | |
| 150 * out_nreads | |
| 151 * trim_start | |
| 152 * out_read_len | |
| 153 * out_label_len | |
| 154 * out_rd_filter | |
| 155 * out_read_type | |
| 156 * out_read_start | |
| 157 * out_label_start | |
| 158 * static_fixed_spot_len | |
| 159 */ | |
| 160 | |
| 161 /* INSDC:SRA:tbl:stats inherited productions | |
| 162 * base_count | |
| 163 * spot_count | |
| 164 * max_spot_id | |
| 165 * min_spot_id | |
| 166 * in_stats_bin | |
| 167 * bio_base_count | |
| 168 */ | |
| 169 | |
| 170 /* NCBI:tbl:n_encoding inherited productions | |
| 171 * read_unpack | |
| 172 */ | |
| 173 | |
| 174 /* NCBI:SRA:_454_:common productions | |
| 175 * .SIGNAL | |
| 176 * .CLIP_ADAPTER_LEFT | |
| 177 * .CLIP_QUALITY_LEFT | |
| 178 * .CLIP_ADAPTER_RIGHT | |
| 179 * .CLIP_QUALITY_RIGHT | |
| 180 * out_flow_chars | |
| 181 * out_key_sequence | |
| 182 * out_linker_sequence | |
| 183 */ | |
| 184 }; | |
| 185 | |
| 186 | |
| 187 /*-------------------------------------------------------------------------- | |
| 188 * NCBI:SRA:_454_:tbl:v2 | |
| 189 * Roche 454 SRA Platform | |
| 190 * | |
| 191 * history: | |
| 192 * 1.0.1 - explicitly base upon sra #1.0.1 and related changes | |
| 193 * 1.0.2 - respond to change to 454:common base table #1.0.2 | |
| 194 */ | |
| 195 | |
| 196 // encodings are declared to have their own version | |
| 197 // so that they may be changed over time independently | |
| 198 physical INSDC:coord:one NCBI:SRA:_454_:encoding:CLIP #2 | |
| 199 { | |
| 200 decode { return ( INSDC:coord:one ) iunzip ( @ ); } | |
| 201 encode { return izip ( @ ); } | |
| 202 } | |
| 203 | |
| 204 physical NCBI:isamp1 NCBI:SRA:_454_:encoding:SIGNAL #2 | |
| 205 { | |
| 206 decode { return ( NCBI:isamp1 ) iunzip ( @ ); } | |
| 207 encode { return izip ( @ ); } | |
| 208 } | |
| 209 | |
| 210 physical INSDC:position:one NCBI:SRA:_454_:encoding:POSITION #2 | |
| 211 { | |
| 212 decode | |
| 213 { | |
| 214 I32 pos_1st_deriv = iunzip ( @ ); | |
| 215 return ( INSDC:position:one ) < I32 > integral ( pos_1st_deriv ); | |
| 216 } | |
| 217 encode | |
| 218 { | |
| 219 I32 pos_1st_deriv = < I32 > deriv ( @ ); | |
| 220 return izip ( pos_1st_deriv ); | |
| 221 } | |
| 222 } | |
| 223 | |
| 224 /* normalized v2 table | |
| 225 * | |
| 226 * history: | |
| 227 * 1.0.6 - base upon updated ancestry | |
| 228 * 1.0.7 - base upon updated ancestry | |
| 229 */ | |
| 230 table NCBI:SRA:_454_:tbl:v2 #1.0.7 | |
| 231 = NCBI:SRA:tbl:sra_nopos #2.1.3 | |
| 232 , NCBI:tbl:base_space #2.0.3 | |
| 233 , NCBI:tbl:phred_quality #2.0.3 | |
| 234 , NCBI:SRA:_454_:common #1.0.4 | |
| 235 { | |
| 236 /* NAME tokenizing and coordinates | |
| 237 * most work happens in skeyname table | |
| 238 * we still obtain REGION from name | |
| 239 */ | |
| 240 readonly column INSDC:coord:val REGION = ( INSDC:coord:val ) | |
| 241 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok ); | |
| 242 NCBI:SRA:spot_name_token out_spot_name_tok | |
| 243 = NCBI:SRA:_454_:tokenize_spot_name ( _out_name ); | |
| 244 | |
| 245 NCBI:SRA:spot_name_token in_spot_name_tok | |
| 246 = NCBI:SRA:_454_:tokenize_spot_name ( NAME ); | |
| 247 | |
| 248 // special sequences | |
| 249 INSDC:dna:text out_flow_chars | |
| 250 = .FLOW_CHARS | |
| 251 | < INSDC:dna:text > echo < 'TACG' > ( .SIGNAL ) | |
| 252 | < INSDC:dna:text > echo < 'TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG' > (); | |
| 253 | |
| 254 physical column < INSDC:dna:text > zip_encoding | |
| 255 .FLOW_CHARS = in_flow_chars; | |
| 256 | |
| 257 INSDC:dna:text out_key_sequence | |
| 258 = .KEY_SEQUENCE | |
| 259 | < INSDC:dna:text > echo < 'TCAG' > (); | |
| 260 | |
| 261 physical column < INSDC:dna:text > zip_encoding | |
| 262 .KEY_SEQUENCE = in_key_sequence; | |
| 263 | |
| 264 INSDC:dna:text out_linker_sequence = .LINKER_SEQUENCE; | |
| 265 physical column < INSDC:dna:text > zip_encoding | |
| 266 .LINKER_SEQUENCE = in_linker_sequence; | |
| 267 | |
| 268 // linker needs to be representable by its own table | |
| 269 // either in metadata or somewhere else | |
| 270 | |
| 271 // position stored as normal 1-based coordinate | |
| 272 INSDC:position:one out_position = .POSITION; | |
| 273 physical column NCBI:SRA:_454_:encoding:POSITION #2 | |
| 274 .POSITION = POSITION; | |
| 275 | |
| 276 // clips | |
| 277 physical column NCBI:SRA:_454_:encoding:CLIP #2 | |
| 278 .CLIP_ADAPTER_LEFT = CLIP_ADAPTER_LEFT; | |
| 279 physical column NCBI:SRA:_454_:encoding:CLIP #2 | |
| 280 .CLIP_ADAPTER_RIGHT = CLIP_ADAPTER_RIGHT; | |
| 281 physical column NCBI:SRA:_454_:encoding:CLIP #2 | |
| 282 .CLIP_QUALITY_LEFT = CLIP_QUALITY_LEFT; | |
| 283 physical column NCBI:SRA:_454_:encoding:CLIP #2 | |
| 284 .CLIP_QUALITY_RIGHT = CLIP_QUALITY_RIGHT; | |
| 285 | |
| 286 // signal | |
| 287 physical column NCBI:SRA:_454_:encoding:SIGNAL #2 | |
| 288 .SIGNAL = SIGNAL; | |
| 289 }; |
