comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/454.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * NCBI 454 Sequence Read Archive schema
29 */
30 version 1;
31
32 include 'ncbi/sra.vschema';
33 include 'ncbi/spotname.vschema';
34 include 'ncbi/clip.vschema';
35
36
37 /*--------------------------------------------------------------------------
38 * functions
39 */
40
41 /* dynamic_read_desc
42 * uses inputs to determine read type and segmentation
43 *
44 * "edit_distance" [ CONST, OPTIONAL ] - a tolerance figure for
45 * linker matching, where 0 requires exact match, 5 is default.
46 *
47 * "spot" [ DATA ] - bases for entire spot
48 *
49 * "key" [ DATA, CONTROL ] - bases for key sequence. for version 1,
50 * the first base following key is taken as biological start
51 *
52 * "linker" [ DATA, CONTROL, OPTIONAL ] - if present, is used to separate
53 * all bases following "key" into mate pair biological reads
54 *
55 * returns a trio ( U32 [ 3 ] ) for each identified read, with read type, start and length
56 */
57 typeset NCBI:SRA:_454_:drdparam_set { ascii, U8, INSDC:2na:packed }; // the types accepted for "spot", "key" and "linker" below
58 extern function
59 U32 [ 3 ] NCBI:SRA:_454_:dynamic_read_desc #1 < * U32 edit_distance > // '*' precedes the parameter documented above as OPTIONAL
60 ( NCBI:SRA:_454_:drdparam_set spot, NCBI:SRA:_454_:drdparam_set key
61 * NCBI:SRA:_454_:drdparam_set linker ); // "linker" likewise follows a '*' and is documented as OPTIONAL
62
63 const U32 NCBI:SRA:_454_:dyn_read_type = 0; // NOTE(review): presumably the position of "read type" in the dynamic_read_desc trio - confirm
64 const U32 NCBI:SRA:_454_:dyn_read_start = 1; // NOTE(review): presumably the position of "start" in that trio - confirm
65 const U32 NCBI:SRA:_454_:dyn_read_len = 2; // NOTE(review): presumably the position of "length" in that trio - confirm
66
67
68 /* tokenize_spot_name
69 * scans name on input ( "name" - ascii spot name text )
70 * tokenizes into parts, returned as NCBI:SRA:spot_name_token
71 */
72 extern function NCBI:SRA:spot_name_token
73 NCBI:SRA:_454_:tokenize_spot_name #1 ( ascii name );
74
75
76 /*--------------------------------------------------------------------------
77 * NCBI:SRA:_454_:common
78 * Roche 454 SRA Platform
79 *
80 * history:
81 * 1.0.1 - explicitly base upon sra #1.0.1
82 * 1.0.2 - bring in clip processing from external table
83 * 1.0.3 - base explicitly upon sra #1.0.2, clip #1.0.1
84 * 1.0.4 - base explicitly upon sra #1.0.3, clip #1.0.2
85 */
86 table NCBI:SRA:_454_:common #1.0.4 = INSDC:SRA:tbl:sra #1.0.3, NCBI:SRA:tbl:clip #1.0.2 // 454 table built on the sra and clip tables named here
87 {
88 /* PLATFORM
89 * platform name is always 454
90 */
91 ascii platform_name
92 = < ascii > echo < "454" > (); // constant "454"; echo is given no input
93
94 /* 454 TECHNICAL SEQUENCES
95 */
96 column INSDC:dna:text FLOW_CHARS = out_flow_chars; // out_flow_chars is not defined in this table (see the productions list at the end)
97 INSDC:dna:text in_flow_chars
98 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( FLOW_CHARS ); // a/c/g/t/n -> upper case, '.' -> 'N'
99 column INSDC:dna:text KEY_SEQUENCE = out_key_sequence; // out_key_sequence is not defined in this table
100 INSDC:dna:text in_key_sequence
101 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( KEY_SEQUENCE ); // same character mapping as in_flow_chars
102 column INSDC:dna:text LINKER_SEQUENCE = out_linker_sequence; // out_linker_sequence is not defined in this table
103 INSDC:dna:text in_linker_sequence
104 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn.', 'ACGTNN' > ( LINKER_SEQUENCE ); // same character mapping as in_flow_chars
105
106 // binary technical sequences: each out_* text mapped from INSDC:x2na:map:CHARSET to INSDC:x2na:map:BINSET
107 INSDC:x2na:bin out_flow_bin
108 = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_flow_chars );
109 INSDC:x2na:bin out_key_bin
110 = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_key_sequence );
111 INSDC:x2na:bin out_linker_bin
112 = < INSDC:dna:text, INSDC:x2na:bin > map < INSDC:x2na:map:CHARSET, INSDC:x2na:map:BINSET > ( out_linker_sequence );
113
114 /* SIGNAL
115 * single channel integer
116 */
117 column NCBI:isamp1 SIGNAL = out_signal;
118 NCBI:isamp1 out_signal = .SIGNAL; // taken directly from .SIGNAL, which this table does not declare
119
120
121 /* INSDC:tbl:sequence inherited productions
122 * cs_native
123 * out_cs_key
124 * in_dna_text
125 * out_2cs_bin
126 * out_2na_bin
127 * out_4na_bin
128 * out_dna_text
129 * out_x2cs_bin
130 * out_x2na_bin
131 * out_2cs_packed
132 * out_2na_packed
133 * out_4na_packed
134 * out_color_text
135 * out_qual_phred
136 * out_color_matrix
137 */
138
139 /* INSDC:SRA:tbl:spotname inherited productions
140 * out_x_coord
141 * out_y_coord
142 * out_name_fmt
143 * out_spot_name
144 * spot_ids_found
145 */
146
147 /* INSDC:SRA:tbl:spotdesc inherited productions
148 * trim_len
149 * out_label
150 * out_nreads
151 * trim_start
152 * out_read_len
153 * out_label_len
154 * out_rd_filter
155 * out_read_type
156 * out_read_start
157 * out_label_start
158 * static_fixed_spot_len
159 */
160
161 /* INSDC:SRA:tbl:stats inherited productions
162 * base_count
163 * spot_count
164 * max_spot_id
165 * min_spot_id
166 * in_stats_bin
167 * bio_base_count
168 */
169
170 /* NCBI:tbl:n_encoding inherited productions
171 * read_unpack
172 */
173
174 /* NCBI:SRA:_454_:common productions
175 * .SIGNAL
176 * .CLIP_ADAPTER_LEFT
177 * .CLIP_QUALITY_LEFT
178 * .CLIP_ADAPTER_RIGHT
179 * .CLIP_QUALITY_RIGHT
180 * out_flow_chars
181 * out_key_sequence
182 * out_linker_sequence
183 */
184 };
185
186
187 /*--------------------------------------------------------------------------
188 * NCBI:SRA:_454_:tbl:v2
189 * Roche 454 SRA Platform
190 *
191 * history:
192 * 1.0.1 - explicitly base upon sra #1.0.1 and related changes
193 * 1.0.2 - respond to change to 454:common base table #1.0.2
194 */
195
196 // encodings are declared to have their own version
197 // so that they may be changed over time independently
198 physical INSDC:coord:one NCBI:SRA:_454_:encoding:CLIP #2 // encoding #2 for INSDC:coord:one values
199 {
200 decode { return ( INSDC:coord:one ) iunzip ( @ ); } // iunzip the data, then cast the result to INSDC:coord:one
201 encode { return izip ( @ ); } // izip the data
202 }
203
204 physical NCBI:isamp1 NCBI:SRA:_454_:encoding:SIGNAL #2 // encoding #2 for NCBI:isamp1 values
205 {
206 decode { return ( NCBI:isamp1 ) iunzip ( @ ); } // iunzip the data, then cast the result to NCBI:isamp1
207 encode { return izip ( @ ); } // izip the data
208 }
209
210 physical INSDC:position:one NCBI:SRA:_454_:encoding:POSITION #2 // encoding #2: values are izip'ed after taking deriv, and rebuilt with integral
211 {
212 decode
213 {
214 I32 pos_1st_deriv = iunzip ( @ ); // undo izip; result is the I32 derivative produced by encode
215 return ( INSDC:position:one ) < I32 > integral ( pos_1st_deriv ); // integral reverses deriv; cast to INSDC:position:one
216 }
217 encode
218 {
219 I32 pos_1st_deriv = < I32 > deriv ( @ ); // take the I32 derivative of the incoming values first
220 return izip ( pos_1st_deriv ); // then izip the derivative
221 }
222 }
223
224 /* normalized v2 table
225 *
226 * history:
227 * 1.0.6 - base upon updated ancestry
228 * 1.0.7 - base upon updated ancestry
229 */
230 table NCBI:SRA:_454_:tbl:v2 #1.0.7
231 = NCBI:SRA:tbl:sra_nopos #2.1.3
232 , NCBI:tbl:base_space #2.0.3
233 , NCBI:tbl:phred_quality #2.0.3
234 , NCBI:SRA:_454_:common #1.0.4
235 {
236 /* NAME tokenizing and coordinates
237 * most work happens in skeyname table
238 * we still obtain REGION from name
239 */
240 readonly column INSDC:coord:val REGION = ( INSDC:coord:val )
241 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok );
242 NCBI:SRA:spot_name_token out_spot_name_tok
243 = NCBI:SRA:_454_:tokenize_spot_name ( _out_name );
244
245 NCBI:SRA:spot_name_token in_spot_name_tok
246 = NCBI:SRA:_454_:tokenize_spot_name ( NAME );
247
248 // special sequences
249 INSDC:dna:text out_flow_chars
250 = .FLOW_CHARS
251 | < INSDC:dna:text > echo < 'TACG' > ( .SIGNAL )
252 | < INSDC:dna:text > echo < 'TACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACG' > ();
253
254 physical column < INSDC:dna:text > zip_encoding
255 .FLOW_CHARS = in_flow_chars;
256
257 INSDC:dna:text out_key_sequence
258 = .KEY_SEQUENCE
259 | < INSDC:dna:text > echo < 'TCAG' > ();
260
261 physical column < INSDC:dna:text > zip_encoding
262 .KEY_SEQUENCE = in_key_sequence;
263
264 INSDC:dna:text out_linker_sequence = .LINKER_SEQUENCE;
265 physical column < INSDC:dna:text > zip_encoding
266 .LINKER_SEQUENCE = in_linker_sequence;
267
268 // linker needs to be representable by its own table
269 // either in metadata or somewhere else
270
271 // position stored as normal 1-based coordinate
272 INSDC:position:one out_position = .POSITION;
273 physical column NCBI:SRA:_454_:encoding:POSITION #2
274 .POSITION = POSITION;
275
276 // clips
277 physical column NCBI:SRA:_454_:encoding:CLIP #2
278 .CLIP_ADAPTER_LEFT = CLIP_ADAPTER_LEFT;
279 physical column NCBI:SRA:_454_:encoding:CLIP #2
280 .CLIP_ADAPTER_RIGHT = CLIP_ADAPTER_RIGHT;
281 physical column NCBI:SRA:_454_:encoding:CLIP #2
282 .CLIP_QUALITY_LEFT = CLIP_QUALITY_LEFT;
283 physical column NCBI:SRA:_454_:encoding:CLIP #2
284 .CLIP_QUALITY_RIGHT = CLIP_QUALITY_RIGHT;
285
286 // signal
287 physical column NCBI:SRA:_454_:encoding:SIGNAL #2
288 .SIGNAL = SIGNAL;
289 };