Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/illumina.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * NCBI Illumina Sequence Read Archive schema | |
29 */ | |
30 version 1; | |
31 | |
32 include 'ncbi/sra.vschema'; | |
33 include 'ncbi/spotname.vschema'; | |
34 | |
35 | |
36 /*-------------------------------------------------------------------------- | |
37 * types | |
38 */ | |
39 | |
40 typedef INSDC:quality:log_odds NCBI:qual4 [ 4 ]; /* 4-element vector of log-odds quality, one element per channel */ | |
41 typedef NCBI:qual4 NCBI:SRA:rotated_qual4, NCBI:SRA:swapped_qual4; /* two aliases of NCBI:qual4; names suggest rotated vs. swapped channel order - confirm */ | |
42 | |
43 | |
44 /*-------------------------------------------------------------------------- | |
45 * functions | |
46 */ | |
47 | |
48 /* NCBI:SRA:Illumina:tokenize_spot_name #1 | |
49 * scans the spot name given as ascii input | |
50 * tokenizes into parts, returned as NCBI:SRA:spot_name_token ( declaration only - no body appears in this file ) | |
51 */ | |
52 extern function NCBI:SRA:spot_name_token | |
53 NCBI:SRA:Illumina:tokenize_spot_name #1 ( ascii name ); | |
54 | |
55 | |
56 /*-------------------------------------------------------------------------- | |
57 * NCBI:SRA:Illumina:qual4 | |
58 * 4-channel log-odds-ish quality | |
59 */ | |
60 | |
61 /* qual4_nocol: readonly QUALITY column plus single-channel productions; no physical column is declared here. history: | |
62 * 1.0.1 - base explicitly upon updated ancestry | |
63 */ | |
64 table NCBI:SRA:Illumina:qual4_nocol #1.0.1 | |
65 = INSDC:tbl:sequence #1.0.1 | |
66 , NCBI:tbl:log_odds_quality_nocol #1.0.1 | |
67 { | |
68 /* QUALITY | |
69 * 4-channel quality column ( readonly ), read from out_qual4 | |
70 */ | |
71 readonly column NCBI:qual4 QUALITY = out_qual4; | |
72 | |
73 NCBI:qual4 out_qual4 /* alternatives: swap of out_qual4_swapped, else rotate of out_qual4_rotated */ | |
74 = < NCBI:qual4 > NCBI:SRA:swap ( out_qual4_swapped, read_unpack ) | |
75 | < NCBI:qual4 > NCBI:SRA:rotate < false > ( out_qual4_rotated, read_unpack ); | |
76 | |
77 | |
78 /* single-channel output | |
79 * convert 4-channel log-odds to single channel | |
80 * must retain n-encoding, which was intended to be the 4-channel pattern | |
81 * ( -5, -5, -5, -5 ) and a base of 'A' | |
82 */ | |
83 | |
84 // first, extract quality for called base ( element 0 of the swapped or rotated vector ) | |
85 INSDC:quality:log_odds out_qual1_ch0 | |
86 = < INSDC:quality:log_odds> cut < 0 > ( out_qual4_swapped ) | |
87 | < INSDC:quality:log_odds> cut < 0 > ( out_qual4_rotated ); | |
88 | |
89 // clip it to -5 and above ( upper bound 127 ) | |
90 INSDC:quality:log_odds out_qual1_clip | |
91 = < INSDC:quality:log_odds > clip < -5, 127 > ( out_qual1_ch0 ); | |
92 | |
93 // convert 4 channel to single 32-bit value so the whole pattern can be matched at once | |
94 U32 out_qual4_32 | |
95 = redimension ( out_qual4_swapped ) | |
96 | redimension ( out_qual4_rotated ); | |
97 | |
98 // detect ( -5, -5, -5, -5 ), i.e. 0xFBFBFBFB ( four 0xFB bytes ) in the U32 view, and introduce a -6 value into log-odds | |
99 // this is treated as an 'N', but still not ready | |
100 INSDC:quality:log_odds out_qual1_fives | |
101 = < U32, INSDC:quality:log_odds > map < 0xFBFBFBFB, -6 > ( out_qual4_32, out_qual1_clip ); | |
102 | |
103 // now slam zeros into anything that doesn't correspond to an A ( read_unpack of 1, 2 or 3 ) | |
104 // essentially this leaves all of the A qualities. any having -6 are really N. | |
105 INSDC:quality:log_odds out_qual1_n | |
106 = < U8, INSDC:quality:log_odds > map < [ 1, 2, 3 ], [ 0, 0, 0 ] > ( read_unpack, out_qual1_fives ); | |
107 | |
108 // finally, produce log-odds with n-encoded as -6; all other positions take the clipped called-base quality | |
109 INSDC:quality:log_odds out_qual_log_odds | |
110 = < INSDC:quality:log_odds, INSDC:quality:log_odds > map < -6, -6 > ( out_qual1_n, out_qual1_clip ); | |
111 | |
112 | |
113 /* NCBI:tbl:n_encoding inherited productions | |
114 * read_unpack | |
115 */ | |
116 | |
117 /* NCBI:SRA:Illumina:qual4_nocol productions used above but not defined in this table | |
118 * out_qual4_rotated | |
119 * out_qual4_swapped | |
120 */ | |
121 }; | |
122 | |
123 | |
124 /* 4-channel log-odds compression | |
125 */ | |
126 | |
127 // encoded type - a single byte ( B8 ) code for 4-channel pattern | |
128 typedef B8 NCBI:SRA:encoded_qual4; | |
129 | |
130 // decoding function: single byte code -> swapped 4-channel quality | |
131 extern function | |
132 NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_decode #1 ( NCBI:SRA:encoded_qual4 in ); | |
133 | |
134 // encoding function: swapped 4-channel quality -> single byte code | |
135 extern function | |
136 NCBI:SRA:encoded_qual4 NCBI:SRA:qual4_encode #1 ( NCBI:SRA:swapped_qual4 in ); | |
137 | |
138 // compression rules for physical storage of NCBI:SRA:swapped_qual4 | |
139 physical NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_encoding #1 | |
140 { | |
141 encode | |
142 { | |
143 // produce single byte codes from the swapped 4-channel input | |
144 NCBI:SRA:encoded_qual4 encoded = NCBI:SRA:qual4_encode ( @ ); | |
145 | |
146 // zip the codes ( Z_RLE strategy, Z_BEST_SPEED level ) | |
147 return zip < Z_RLE, Z_BEST_SPEED > ( encoded ); | |
148 } | |
149 | |
150 decode | |
151 { | |
152 // unzip back to single byte codes | |
153 NCBI:SRA:encoded_qual4 unzipped = unzip ( @ ); | |
154 | |
155 // expand codes to swapped 4-channel quality | |
156 return NCBI:SRA:qual4_decode ( unzipped ); | |
157 } | |
158 } | |
159 | |
160 /* NCBI:SRA:Illumina:qual4 #1 - history: | |
161 * 1.0.1 - base upon updated qual4_nocol | |
162 */ | |
163 table NCBI:SRA:Illumina:qual4 #1.0.1 = NCBI:SRA:Illumina:qual4_nocol #1.0.1 | |
164 { | |
165 // read directly as swapped, n-encoded log_odds; supplies out_qual4_swapped, which qual4_nocol uses but does not define | |
166 NCBI:SRA:swapped_qual4 out_qual4_swapped = .QUALITY; | |
167 | |
168 /* NCBI:tbl:n_encoding inherited virtual productions | |
169 * read_unpack | |
170 */ | |
171 }; | |
172 | |
173 /* NCBI:SRA:Illumina:qual4 #2 - history: | |
174 * 2.0.2 - base upon updated ancestry | |
175 * 2.0.3 - base upon updated ancestry | |
176 * 2.0.4 - base upon updated ancestry | |
177 * 2.1.0 - base upon updated ancestry, added in_qual_log_odds | |
178 */ | |
179 table NCBI:SRA:Illumina:qual4 #2.1.0 | |
180 = NCBI:tbl:base_space #2.0.3 | |
181 , NCBI:tbl:log_odds_quality_nocol #2.1.0 | |
182 { | |
183 /* QUALITY | |
184 * 4-channel log-odds; read from out_qual4, stored in physical .QUALITY as swapped data using NCBI:SRA:qual4_encoding | |
185 */ | |
186 extern column NCBI:qual4 QUALITY = out_qual4; | |
187 | |
188 NCBI:SRA:swapped_qual4 in_qual4 /* alternatives: swap against in_x2na_bin, else against in_2na_bin */ | |
189 = ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_x2na_bin ) | |
190 | ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_2na_bin ); | |
191 | |
192 NCBI:qual4 out_qual4 /* stored swapped data passed through NCBI:SRA:swap again with out_x2na_bin */ | |
193 = < NCBI:SRA:swapped_qual4 > NCBI:SRA:swap ( .QUALITY, out_x2na_bin ); | |
194 | |
195 physical column NCBI:SRA:qual4_encoding .QUALITY = in_qual4; | |
196 | |
197 // feed to compressed statistics | |
198 NCBI:qual4 in_stats_qual = in_qual4; | |
199 | |
200 // single channel: element 0 cut from the swapped data, for both the in_ and out_ productions | |
201 INSDC:quality:log_odds in_qual_log_odds | |
202 = < INSDC:quality:log_odds > cut < 0 > ( in_qual4 ); | |
203 INSDC:quality:log_odds out_qual_log_odds | |
204 = < INSDC:quality:log_odds > cut < 0 > ( .QUALITY ); | |
205 }; | |
206 | |
207 | |
208 /*-------------------------------------------------------------------------- | |
209 * NCBI:SRA:Illumina | |
210 * Illumina SRA Platform | |
211 */ | |
212 | |
213 | |
214 /* NCBI:SRA:Illumina:common #1 | |
215 * basic table interface based upon Illumina's pipelines | |
216 * | |
217 * history: | |
218 * 1.0.1 - explicitly base upon sra #1.0.1 | |
219 * 1.0.2 - base explicitly upon sra #1.0.2 | |
220 * 1.0.3 - base explicitly upon sra #1.0.3 | |
221 */ | |
222 table NCBI:SRA:Illumina:common #1.0.3 = INSDC:SRA:tbl:sra #1.0.3 | |
223 { | |
224 // platform name is always 'ILLUMINA' | |
225 ascii platform_name | |
226 = < ascii > echo < "ILLUMINA" > (); | |
227 | |
228 /* TRIMMED SEQUENCE | |
229 * need to find the 0-based trim_start ( = bio_start ) and trim_len ( diff of spot_len and trim_left, where trim_left is trim_start cast to U32 ) | |
230 */ | |
231 INSDC:coord:zero bio_start = NCBI:SRA:bio_start ( out_read_start, out_read_type ); | |
232 INSDC:coord:zero trim_start = bio_start; | |
233 U32 trim_left = ( U32 ) trim_start; | |
234 INSDC:coord:len trim_len = (INSDC:coord:len) < U32 > diff ( spot_len, trim_left ); | |
235 | |
236 /* COORDINATES | |
237 * in addition to X and Y, | |
238 * Illumina has LANE and TILE ( readonly; out_lane_coord / out_tile_coord are not defined in this table ) | |
239 */ | |
240 readonly column INSDC:coord:val LANE = out_lane_coord; | |
241 readonly column INSDC:coord:val TILE = out_tile_coord; | |
242 }; | |
243 | |
244 | |
245 /*-------------------------------------------------------------------------- | |
246 * NCBI:SRA:Illumina:tbl:v2 #1 | |
247 * normalized v2 table | |
248 * still has variants based upon quality type | |
249 * | |
250 * history: | |
251 * 1.0.1 - explicitly base upon sra #1.0.1 and related tables | |
252 * 1.0.2 - updated ancestry | |
253 * 1.0.3 - updated ancestry | |
254 */ | |
255 | |
256 physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:SIGNAL #2 /* encoding used by the .SIGNAL physical column of Illumina:tbl:v2 */ | |
257 { | |
258 decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); } | |
259 encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } /* NOTE(review): meaning of < 14, 10 > is not shown in this file */ | |
260 } | |
261 | |
262 physical NCBI:fsamp4 NCBI:SRA:Illumina:encoding:NOISE #2 /* encoding used by the .NOISE physical column of Illumina:tbl:v2 */ | |
263 { | |
264 decode | |
265 { | |
266 F32 dcmp = funzip ( @ ); /* presumably the inverse of fzip - confirm; result is typed F32 */ | |
267 return redimension ( dcmp ); /* F32 back to the NCBI:fsamp4 type of this encoding */ | |
268 } | |
269 encode | |
270 { | |
271 F32 ncmp = redimension ( @ ); /* NCBI:fsamp4 input viewed as F32 */ | |
272 return fzip < 10 > ( ncmp ); /* NOTE(review): meaning of < 10 > is not shown in this file */ | |
273 } | |
274 } | |
275 | |
276 physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:INTENSITY #2 /* encoding used by the .INTENSITY physical column of Illumina:tbl:v2; same rules as encoding:SIGNAL */ | |
277 { | |
278 decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); } | |
279 encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } /* NOTE(review): meaning of < 14, 10 > is not shown in this file */ | |
280 } | |
281 | |
282 // v2 base table - NOTE(review): declared #1.0.4 but the history comment above ends at 1.0.3 | |
283 table NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
284 = NCBI:SRA:tbl:sra #2.1.3 | |
285 , NCBI:tbl:base_space #2.0.3 | |
286 , NCBI:SRA:Illumina:common #1.0.3 | |
287 { | |
288 /* NAME tokenizing and coordinates | |
289 * most work happens in skeyname table | |
290 * we still obtain LANE and TILE from name ( out_lane_coord / out_tile_coord, read by the LANE and TILE columns of Illumina:common ) | |
291 */ | |
292 INSDC:coord:val out_lane_coord = ( INSDC:coord:val ) | |
293 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( _out_name, out_spot_name_tok ); | |
294 INSDC:coord:val out_tile_coord = ( INSDC:coord:val ) | |
295 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok ); | |
296 NCBI:SRA:spot_name_token out_spot_name_tok | |
297 = NCBI:SRA:Illumina:tokenize_spot_name ( _out_name ); | |
298 | |
299 NCBI:SRA:spot_name_token in_spot_name_tok | |
300 = NCBI:SRA:Illumina:tokenize_spot_name ( NAME ); | |
301 | |
302 /* SIGNAL | |
303 * optional, no longer archived; stored through NCBI:SRA:swap and passed through NCBI:SRA:swap again ( with out_x2na_bin ) when read | |
304 */ | |
305 extern column NCBI:fsamp4 SIGNAL | |
306 { | |
307 read = out_signal; | |
308 validate = < NCBI:fsamp4 > no_compare #1 ( in_signal, out_signal ); | |
309 } | |
310 NCBI:fsamp4 in_signal = SIGNAL; | |
311 NCBI:fsamp4 out_signal | |
312 = < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .SIGNAL, out_x2na_bin ); | |
313 | |
314 physical column NCBI:SRA:Illumina:encoding:SIGNAL #2 .SIGNAL | |
315 = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_x2na_bin ) | |
316 | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_2na_bin ); | |
317 | |
318 /* NOISE | |
319 * optional, no longer archived; in_noise is stored directly in .NOISE and .NOISE is read back directly | |
320 */ | |
321 extern column NCBI:fsamp4 NOISE | |
322 { | |
323 read = out_noise; | |
324 validate = < NCBI:fsamp4 > no_compare #1 ( in_noise, out_noise ); | |
325 } | |
326 NCBI:fsamp4 in_noise = NOISE; | |
327 NCBI:fsamp4 out_noise = .NOISE; | |
328 | |
329 physical column NCBI:SRA:Illumina:encoding:NOISE #2 .NOISE = in_noise; | |
330 | |
331 /* INTENSITY | |
332 * optional, no longer archived; normalized then swapped for storage, swapped then denormalized when read | |
333 */ | |
334 extern column NCBI:fsamp4 INTENSITY | |
335 { | |
336 read = out_intensity; | |
337 validate = < NCBI:fsamp4 > no_compare #1 ( in_intensity, out_intensity ); | |
338 } | |
339 NCBI:fsamp4 in_intensity = INTENSITY; | |
340 NCBI:fsamp4 out_intensity | |
341 = < NCBI:fsamp4 > NCBI:SRA:denormalize ( out_norm_intensity, out_x2na_bin ); | |
342 NCBI:fsamp4 out_norm_intensity | |
343 = ( NCBI:fsamp4 ) < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .INTENSITY, out_x2na_bin ); | |
344 NCBI:fsamp4 in_norm_intensity | |
345 = < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_x2na_bin ) | |
346 | < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_2na_bin ); | |
347 physical column NCBI:SRA:Illumina:encoding:INTENSITY #2 .INTENSITY | |
348 = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_x2na_bin ) | |
349 | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_2na_bin ); | |
350 | |
351 /* INSDC:tbl:sequence inherited virtual productions | |
352 * out_qual_phred | |
353 */ | |
354 | |
355 /* INSDC:SRA:tbl:spotdesc inherited productions | |
356 * static_fixed_spot_len | |
357 */ | |
358 }; | |
359 | |
360 /* NCBI:SRA:Illumina:tbl:q4:v2 - v2 table with 4-channel log-odds qualities | |
361 * combines Illumina:tbl:v2 with Illumina:qual4 #2; adds no members of its own | |
362 * history: | |
363 * 1.0.2 - updated ancestry | |
364 * 1.0.3 - updated ancestry | |
365 * 1.0.4 - updated ancestry | |
366 * 1.1.0 - updated ancestry | |
367 */ | |
368 table NCBI:SRA:Illumina:tbl:q4:v2 #1.1.0 | |
369 = NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
370 , NCBI:SRA:Illumina:qual4 #2.1.0 | |
371 { | |
372 /* INSDC:SRA:tbl:spotdesc inherited virtual productions | |
373 * static_fixed_spot_len | |
374 */ | |
375 }; | |
376 | |
377 /* NCBI:SRA:Illumina:tbl:q1:v2 - v2 table with 1-channel log-odds qualities | |
378 * combines Illumina:tbl:v2 with NCBI:tbl:log_odds_quality; adds no members of its own | |
379 * history: | |
380 * 1.0.2 - updated ancestry | |
381 * 1.0.3 - updated ancestry | |
382 * 1.0.4 - updated ancestry | |
383 * 1.1.0 - updated ancestry ( NOTE(review): the table below is declared #1.1, presumably the same version - confirm ) | |
384 */ | |
385 table NCBI:SRA:Illumina:tbl:q1:v2 #1.1 | |
386 = NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
387 , NCBI:tbl:log_odds_quality #2.1.0 | |
388 { | |
389 /* INSDC:SRA:tbl:spotdesc inherited productions | |
390 * static_fixed_spot_len | |
391 */ | |
392 }; | |
393 | |
394 /* NCBI:SRA:Illumina:tbl:phred:v2 - v2 table with phred qualities | |
395 * combines Illumina:tbl:v2 with NCBI:tbl:phred_quality; adds no members of its own | |
396 * history: | |
397 * 1.0.2 - updated ancestry | |
398 * 1.0.3 - updated ancestry | |
399 * 1.0.4 - updated ancestry | |
400 */ | |
401 table NCBI:SRA:Illumina:tbl:phred:v2 #1.0.4 | |
402 = NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
403 , NCBI:tbl:phred_quality #2.0.3 | |
404 { | |
405 /* INSDC:SRA:tbl:spotdesc inherited virtual productions | |
406 * static_fixed_spot_len | |
407 */ | |
408 }; | |