comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/illumina.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * NCBI Illumina Sequence Read Archive schema
29 */
30 version 1;
31
32 include 'ncbi/sra.vschema';
33 include 'ncbi/spotname.vschema';
34
35
36 /*--------------------------------------------------------------------------
37 * types
38 */
39
40 typedef INSDC:quality:log_odds NCBI:qual4 [ 4 ]; // NCBI:qual4: vector of 4 INSDC:quality:log_odds values ( one per channel )
41 typedef NCBI:qual4 NCBI:SRA:rotated_qual4, NCBI:SRA:swapped_qual4; // two named variants of NCBI:qual4; swapped_qual4 is the type stored by NCBI:SRA:qual4_encoding below
42
43
44 /*--------------------------------------------------------------------------
45 * functions
46 */
47
48 /* NCBI:SRA:Illumina:tokenize_spot_name
49 * external function: scans the ascii spot name given as input
50 * tokenizes it into parts, returned as NCBI:SRA:spot_name_token
51 */
52 extern function NCBI:SRA:spot_name_token
53 NCBI:SRA:Illumina:tokenize_spot_name #1 ( ascii name );
54
55
56 /*--------------------------------------------------------------------------
57 * NCBI:SRA:Illumina:qual4
58 * 4-channel log-odds-ish quality
59 */
60
61 /* history:
62 * 1.0.1 - base explicitly upon updated ancestry
63 */
64 table NCBI:SRA:Illumina:qual4_nocol #1.0.1 // read-only 4-channel QUALITY plus single-channel log-odds derived from it
65 = INSDC:tbl:sequence #1.0.1
66 , NCBI:tbl:log_odds_quality_nocol #1.0.1
67 {
68 /* QUALITY
69 * 4-channel quality column, read-only, served from out_qual4
70 */
71 readonly column NCBI:qual4 QUALITY = out_qual4;
72
73 NCBI:qual4 out_qual4 // two alternatives separated by '|': swap of the swapped source, or rotate of the rotated source
74 = < NCBI:qual4 > NCBI:SRA:swap ( out_qual4_swapped, read_unpack )
75 | < NCBI:qual4 > NCBI:SRA:rotate < false > ( out_qual4_rotated, read_unpack );
76
77
78 /* single-channel output
79 * convert 4-channel log-odds to single channel
80 * must retain n-encoding, which was intended to be the 4-channel pattern
81 * ( -5, -5, -5, -5 ) and a base of 'A'
82 */
83
84 // first, extract quality for called base ( cut < 0 > of the swapped or rotated data )
85 INSDC:quality:log_odds out_qual1_ch0
86 = < INSDC:quality:log_odds> cut < 0 > ( out_qual4_swapped )
87 | < INSDC:quality:log_odds> cut < 0 > ( out_qual4_rotated );
88
89 // clip it to -5 and above ( upper bound given as 127 )
90 INSDC:quality:log_odds out_qual1_clip
91 = < INSDC:quality:log_odds > clip < -5, 127 > ( out_qual1_ch0 );
92
93 // convert 4 channel to single 32-bit value so the whole pattern can be matched at once
94 U32 out_qual4_32
95 = redimension ( out_qual4_swapped )
96 | redimension ( out_qual4_rotated );
97
98 // detect ( -5, -5, -5, -5 ), matched as 0xFBFBFBFB, and introduce a -6 value into log-odds
99 // this is treated as an 'N', but still not ready
100 INSDC:quality:log_odds out_qual1_fives
101 = < U32, INSDC:quality:log_odds > map < 0xFBFBFBFB, -6 > ( out_qual4_32, out_qual1_clip );
102
103 // now slam zeros into anything that doesn't correspond to an A ( read_unpack values 1, 2, 3 )
104 // essentially this leaves all of the A qualities. any having -6 are really N.
105 INSDC:quality:log_odds out_qual1_n
106 = < U8, INSDC:quality:log_odds > map < [ 1, 2, 3 ], [ 0, 0, 0 ] > ( read_unpack, out_qual1_fives );
107
108 // finally, produce log-odds with n-encoded as -6 ( taken from out_qual1_n ) over the clipped values
109 INSDC:quality:log_odds out_qual_log_odds
110 = < INSDC:quality:log_odds, INSDC:quality:log_odds > map < -6, -6 > ( out_qual1_n, out_qual1_clip );
111
112
113 /* NCBI:tbl:n_encoding inherited productions
114 * read_unpack
115 */
116
117 /* NCBI:SRA:Illumina:qual4_nocol productions ( used above, not defined in this table )
118 * out_qual4_rotated
119 * out_qual4_swapped
120 */
121 };
122
123
124 /* 4-channel log-odds compression
125 */
126
127 // encoded type - a single byte code ( B8 ) for 4-channel pattern
128 typedef B8 NCBI:SRA:encoded_qual4;
129
130 // decoding function: takes an encoded_qual4 code, returns swapped_qual4
131 extern function
132 NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_decode #1 ( NCBI:SRA:encoded_qual4 in );
133
134 // encoding function: takes swapped_qual4, returns an encoded_qual4 code
135 extern function
136 NCBI:SRA:encoded_qual4 NCBI:SRA:qual4_encode #1 ( NCBI:SRA:swapped_qual4 in );
137
138 // compression rules for NCBI:SRA:swapped_qual4 data
139 physical NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_encoding #1
140 {
141 encode
142 {
143 // produce codes
144 NCBI:SRA:encoded_qual4 encoded = NCBI:SRA:qual4_encode ( @ );
145
146 // compress the codes with zip, passing Z_RLE and Z_BEST_SPEED
147 return zip < Z_RLE, Z_BEST_SPEED > ( encoded );
148 }
149
150 decode
151 {
152 // reverse of the encode step: unzip back to codes
153 NCBI:SRA:encoded_qual4 unzipped = unzip ( @ );
154
155 // inflate codes to swapped 4-channel values
156 return NCBI:SRA:qual4_decode ( unzipped );
157 }
158 }
159
160 /* history:
161 * 1.0.1 - base upon updated qual4_nocol
162 */
163 table NCBI:SRA:Illumina:qual4 #1.0.1 = NCBI:SRA:Illumina:qual4_nocol #1.0.1
164 {
165 // read directly as swapped, n-encoded log_odds
166 NCBI:SRA:swapped_qual4 out_qual4_swapped = .QUALITY; // supplies the out_qual4_swapped input that qual4_nocol's out_qual4 uses
167
168 /* NCBI:tbl:n_encoding inherited virtual productions
169 * read_unpack
170 */
171 };
172
173 /* history:
174 * 2.0.2 - base upon updated ancestry
175 * 2.0.3 - base upon updated ancestry
176 * 2.0.4 - base upon updated ancestry
177 * 2.1.0 - base upon updated ancestry, added in_qual_log_odds
178 */
179 table NCBI:SRA:Illumina:qual4 #2.1.0
180 = NCBI:tbl:base_space #2.0.3
181 , NCBI:tbl:log_odds_quality_nocol #2.1.0
182 {
183 /* QUALITY
184 * 4-channel log-odds, read from out_qual4
185 */
186 extern column NCBI:qual4 QUALITY = out_qual4;
187
188 NCBI:SRA:swapped_qual4 in_qual4 // input side: swap QUALITY against in_x2na_bin, or in_2na_bin as the alternative
189 = ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_x2na_bin )
190 | ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_2na_bin );
191
192 NCBI:qual4 out_qual4 // output side: swap the stored .QUALITY against out_x2na_bin
193 = < NCBI:SRA:swapped_qual4 > NCBI:SRA:swap ( .QUALITY, out_x2na_bin );
194
195 physical column NCBI:SRA:qual4_encoding .QUALITY = in_qual4; // stored in swapped form using NCBI:SRA:qual4_encoding
196
197 // feed to compressed statistics
198 NCBI:qual4 in_stats_qual = in_qual4;
199
200 // single channel: cut < 0 > of the swapped data, for input ( in_qual4 ) and output ( .QUALITY )
201 INSDC:quality:log_odds in_qual_log_odds
202 = < INSDC:quality:log_odds > cut < 0 > ( in_qual4 );
203 INSDC:quality:log_odds out_qual_log_odds
204 = < INSDC:quality:log_odds > cut < 0 > ( .QUALITY );
205 };
206
207
208 /*--------------------------------------------------------------------------
209 * NCBI:SRA:Illumina
210 * Illumina SRA Platform
211 */
212
213
214 /* NCBI:SRA:Illumina:common #1
215 * basic table interface based upon Illumina's pipelines
216 *
217 * history:
218 * 1.0.1 - explicitly base upon sra #1.0.1
219 * 1.0.2 - base explicitly upon sra #1.0.2
220 * 1.0.3 - base explicitly upon sra #1.0.3
221 */
222 table NCBI:SRA:Illumina:common #1.0.3 = INSDC:SRA:tbl:sra #1.0.3
223 {
224 // platform name is always 'ILLUMINA'
225 ascii platform_name
226 = < ascii > echo < "ILLUMINA" > ();
227
228 /* TRIMMED SEQUENCE
229 * need to find the 0-based trim_start and trim_len: trim_start = bio_start, trim_len = diff ( spot_len, trim_start )
230 */
231 INSDC:coord:zero bio_start = NCBI:SRA:bio_start ( out_read_start, out_read_type );
232 INSDC:coord:zero trim_start = bio_start;
233 U32 trim_left = ( U32 ) trim_start; // cast to U32 so it can be fed to < U32 > diff
234 INSDC:coord:len trim_len = (INSDC:coord:len) < U32 > diff ( spot_len, trim_left );
235
236 /* COORDINATES
237 * in addition to X and Y,
238 * Illumina has LANE and TILE ( read-only; out_lane_coord and out_tile_coord are not defined in this table )
239 */
240 readonly column INSDC:coord:val LANE = out_lane_coord;
241 readonly column INSDC:coord:val TILE = out_tile_coord;
242 };
243
244
245 /*--------------------------------------------------------------------------
246 * NCBI:SRA:Illumina:tbl:v2 #1
247 * normalized v2 table
248 * still has variants based upon quality type
249 *
250 * history:
251 * 1.0.1 - explicitly base upon sra #1.0.1 and related tables
252 * 1.0.2 - updated ancestry
253 * 1.0.3 - updated ancestry
254 */
255
256 physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:SIGNAL #2 // physical encoding used by the .SIGNAL column of the v2 table
257 {
258 decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); }
259 encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } // NOTE(review): meaning of the < 14, 10 > arguments is not visible here - confirm against fsamp4:encode
260 }
261
262 physical NCBI:fsamp4 NCBI:SRA:Illumina:encoding:NOISE #2 // physical encoding used by the .NOISE column of the v2 table
263 {
264 decode
265 {
266 F32 dcmp = funzip ( @ ); // decompress to F32 values
267 return redimension ( dcmp ); // redimension the F32 values for return ( presumably to NCBI:fsamp4 - confirm )
268 }
269 encode
270 {
271 F32 ncmp = redimension ( @ ); // redimension the incoming data to F32 values
272 return fzip < 10 > ( ncmp ); // NOTE(review): meaning of the < 10 > argument is not visible here - confirm against fzip
273 }
274 }
275
276 physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:INTENSITY #2 // physical encoding used by the .INTENSITY column of the v2 table; same rules as encoding:SIGNAL
277 {
278 decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); }
279 encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } // NOTE(review): meaning of the < 14, 10 > arguments is not visible here - confirm against fsamp4:encode
280 }
281
282 // v2 base table ( declared as #1.0.4; the history comment above stops at 1.0.3 )
283 table NCBI:SRA:Illumina:tbl:v2 #1.0.4
284 = NCBI:SRA:tbl:sra #2.1.3
285 , NCBI:tbl:base_space #2.0.3
286 , NCBI:SRA:Illumina:common #1.0.3
287 {
288 /* NAME tokenizing and coordinates
289 * most work happens in skeyname table
290 * we still obtain LANE and TILE from name ( these feed the LANE and TILE columns of NCBI:SRA:Illumina:common )
291 */
292 INSDC:coord:val out_lane_coord = ( INSDC:coord:val )
293 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( _out_name, out_spot_name_tok );
294 INSDC:coord:val out_tile_coord = ( INSDC:coord:val )
295 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok );
296 NCBI:SRA:spot_name_token out_spot_name_tok // tokens of _out_name
297 = NCBI:SRA:Illumina:tokenize_spot_name ( _out_name );
298
299 NCBI:SRA:spot_name_token in_spot_name_tok // tokens of the NAME column input
300 = NCBI:SRA:Illumina:tokenize_spot_name ( NAME );
301
302 /* SIGNAL
303 * optional, no longer archived; validate is declared with no_compare
304 */
305 extern column NCBI:fsamp4 SIGNAL
306 {
307 read = out_signal;
308 validate = < NCBI:fsamp4 > no_compare #1 ( in_signal, out_signal );
309 }
310 NCBI:fsamp4 in_signal = SIGNAL;
311 NCBI:fsamp4 out_signal // output side: swap the stored .SIGNAL against out_x2na_bin
312 = < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .SIGNAL, out_x2na_bin );
313
314 physical column NCBI:SRA:Illumina:encoding:SIGNAL #2 .SIGNAL // stored swapped against in_x2na_bin, or in_2na_bin as the alternative
315 = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_x2na_bin )
316 | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_2na_bin );
317
318 /* NOISE
319 * optional, no longer archived; stored and read back without swapping
320 */
321 extern column NCBI:fsamp4 NOISE
322 {
323 read = out_noise;
324 validate = < NCBI:fsamp4 > no_compare #1 ( in_noise, out_noise );
325 }
326 NCBI:fsamp4 in_noise = NOISE;
327 NCBI:fsamp4 out_noise = .NOISE;
328
329 physical column NCBI:SRA:Illumina:encoding:NOISE #2 .NOISE = in_noise;
330
331 /* INTENSITY
332 * optional, no longer archived; normalized then swapped for storage, swapped back then denormalized on output
333 */
334 extern column NCBI:fsamp4 INTENSITY
335 {
336 read = out_intensity;
337 validate = < NCBI:fsamp4 > no_compare #1 ( in_intensity, out_intensity );
338 }
339 NCBI:fsamp4 in_intensity = INTENSITY;
340 NCBI:fsamp4 out_intensity // output side, step 2: denormalize against out_x2na_bin
341 = < NCBI:fsamp4 > NCBI:SRA:denormalize ( out_norm_intensity, out_x2na_bin );
342 NCBI:fsamp4 out_norm_intensity // output side, step 1: swap the stored .INTENSITY
343 = ( NCBI:fsamp4 ) < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .INTENSITY, out_x2na_bin );
344 NCBI:fsamp4 in_norm_intensity // input side, step 1: normalize against in_x2na_bin, or in_2na_bin as the alternative
345 = < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_x2na_bin )
346 | < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_2na_bin );
347 physical column NCBI:SRA:Illumina:encoding:INTENSITY #2 .INTENSITY // input side, step 2: swap the normalized data for storage
348 = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_x2na_bin )
349 | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_2na_bin );
350
351 /* INSDC:tbl:sequence inherited virtual productions
352 * out_qual_phred
353 */
354
355 /* INSDC:SRA:tbl:spotdesc inherited productions
356 * static_fixed_spot_len
357 */
358 };
359
360 /* 4-channel log-odds qualities
361 *
362 * history:
363 * 1.0.2 - updated ancestry
364 * 1.0.3 - updated ancestry
365 * 1.0.4 - updated ancestry
366 * 1.1.0 - updated ancestry
367 */
368 table NCBI:SRA:Illumina:tbl:q4:v2 #1.1.0 // v2 base table combined with the 4-channel NCBI:SRA:Illumina:qual4 #2.1.0; adds no members of its own
369 = NCBI:SRA:Illumina:tbl:v2 #1.0.4
370 , NCBI:SRA:Illumina:qual4 #2.1.0
371 {
372 /* INSDC:SRA:tbl:spotdesc inherited virtual productions
373 * static_fixed_spot_len
374 */
375 };
376
377 /* 1-channel log-odds qualities
378 *
379 * history:
380 * 1.0.2 - updated ancestry
381 * 1.0.3 - updated ancestry
382 * 1.0.4 - updated ancestry
383 * 1.1.0 - updated ancestry
384 */
385 table NCBI:SRA:Illumina:tbl:q1:v2 #1.1.0 // v2 base table combined with NCBI:tbl:log_odds_quality #2.1.0; adds no members of its own
386 = NCBI:SRA:Illumina:tbl:v2 #1.0.4
387 , NCBI:tbl:log_odds_quality #2.1.0
388 {
389 /* INSDC:SRA:tbl:spotdesc inherited productions
390 * static_fixed_spot_len
391 */
392 };
393
394 /* phred qualities
395 *
396 * history:
397 * 1.0.2 - updated ancestry
398 * 1.0.3 - updated ancestry
399 * 1.0.4 - updated ancestry
400 */
401 table NCBI:SRA:Illumina:tbl:phred:v2 #1.0.4 // v2 base table combined with NCBI:tbl:phred_quality #2.0.3; adds no members of its own
402 = NCBI:SRA:Illumina:tbl:v2 #1.0.4
403 , NCBI:tbl:phred_quality #2.0.3
404 {
405 /* INSDC:SRA:tbl:spotdesc inherited virtual productions
406 * static_fixed_spot_len
407 */
408 };