Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/seq.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * Sequence schema implementation tables | |
29 */ | |
30 version 1; | |
31 | |
32 include 'vdb/vdb.vschema'; | |
33 include 'ncbi/ncbi.vschema'; | |
34 include 'insdc/sra.vschema'; | |
35 | |
36 | |
37 /*-------------------------------------------------------------------------- | |
38 * n_encoding - implementation | |
39 * introduces common virtual productions | |
40 */ | |
41 table NCBI:tbl:n_encoding #1 | |
42 { | |
43 U8 n_encoding_dummy | |
44 = read_unpack | |
45 | read_ndecode; | |
46 }; | |
47 | |
48 | |
49 /*-------------------------------------------------------------------------- | |
50 * seqloc | |
51 * NCBI sequence locator table | |
52 */ | |
53 table NCBI:tbl:seqloc #1.0 | |
54 { | |
55 /* SEQ_ID | |
56 * a FASTA-style SeqId | |
57 */ | |
58 extern column < ascii > zip_encoding SEQ_ID; | |
59 | |
60 /* SEQ_START | |
61 * provided in both 1 ( default ) and 0-based coordinates | |
62 */ | |
63 extern default column < INSDC:coord:one > izip_encoding SEQ_START; | |
64 readonly column INSDC:coord:zero SEQ_START | |
65 = ( INSDC:coord:zero ) < INSDC:coord:one > diff < 1 > ( .SEQ_START ); | |
66 | |
67 /* SEQ_LEN | |
68 */ | |
69 extern column < INSDC:coord:len > izip_encoding SEQ_LEN; | |
70 }; | |
71 | |
72 | |
73 /*-------------------------------------------------------------------------- | |
74 * base_space - implementation | |
75 * READ column rules | |
76 */ | |
77 | |
78 /* color_from_dna | |
79 * use starting keys and color matrix to convert individual reads | |
80 * to base space. | |
81 */ | |
82 extern function | |
83 INSDC:x2cs:bin NCBI:color_from_dna #1 ( INSDC:x2na:bin bin_x2na, | |
84 INSDC:coord:zero read_start, INSDC:coord:len read_len, | |
85 INSDC:dna:text cs_key, U8 color_matrix ); | |
86 | |
87 | |
88 /* dcmp_base_space | |
89 * table to introduce common virtual productions | |
90 */ | |
91 table NCBI:tbl:dcmp_base_space #1 | |
92 { | |
93 // rules to introduce purely virtual productions | |
94 // never expected to resolve... | |
95 INSDC:dna:text dcmp_virtual_productions | |
96 = out_dcmp_4na_bin | |
97 | out_dcmp_x2na_bin | |
98 | out_dcmp_2na_bin | |
99 | out_dcmp_2na_packed; | |
100 } | |
101 | |
102 /* history: | |
103 * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 | |
104 * 1.0.2 - spotdesc #1.0.2 | |
105 * 1.0.3 - base upon dcmp_base_space for "out_dcmp_2na_bin" | |
106 */ | |
107 table NCBI:tbl:base_space_common #1.0.3 | |
108 = INSDC:tbl:sequence #1.0.1 | |
109 , INSDC:SRA:tbl:spotdesc #1.0.2 | |
110 , INSDC:SRA:tbl:stats #1.1.0 | |
111 , NCBI:tbl:dcmp_base_space #1.0.0 | |
112 { | |
113 /* INSDC:tbl:sequence inherited virtual productions | |
114 */ | |
115 | |
116 // cs_native - tells user color space is not native | |
117 bool cs_native = < bool > echo < false > (); | |
118 | |
119 // in_cs_key is not writable in base_space | |
120 | |
121 // color-space key is completely artificial | |
122 INSDC:dna:text out_cs_key | |
123 = .CS_KEY | |
124 | < INSDC:dna:text > echo < 'T' > ( out_read_type ) | |
125 | < INSDC:dna:text > echo < 'T' > ( out_read_len ) | |
126 | < INSDC:dna:text > echo < 'T' > (); | |
127 | |
128 // unambiguous synthesized 2cs | |
129 INSDC:2cs:bin out_2cs_bin | |
130 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin ); | |
131 | |
132 // unambiguous unpacked 2na | |
133 INSDC:2na:bin out_2na_bin | |
134 = out_dcmp_2na_bin | |
135 | ( INSDC:2na:bin ) unpack ( out_2na_packed ); | |
136 | |
137 // synthesized color sequence | |
138 INSDC:x2cs:bin out_x2cs_bin | |
139 = NCBI:color_from_dna ( out_x2na_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); | |
140 | |
141 // synthesized packed 2cs | |
142 INSDC:2cs:packed out_2cs_packed | |
143 = ( INSDC:2cs:packed ) pack ( out_2cs_bin ); | |
144 | |
145 // synthesized packed 4na | |
146 INSDC:4na:packed out_4na_packed | |
147 = ( INSDC:4na:packed ) pack ( out_4na_bin ); | |
148 | |
149 // synthesized color text | |
150 INSDC:color:text out_color_text | |
151 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); | |
152 | |
153 // published color matrix | |
154 U8 out_color_matrix | |
155 = < U8 > echo < INSDC:color:default_matrix > (); | |
156 | |
157 // spot_len and fixed_spot_len | |
158 INSDC:coord:len base_space_spot_len | |
159 = ( INSDC:coord:len ) row_len ( out_2na_packed ); | |
160 INSDC:coord:len base_space_fixed_spot_len | |
161 = ( INSDC:coord:len ) fixed_row_len ( out_2na_packed ); | |
162 | |
163 | |
164 /* INSDC:tbl:sequence inherited productions | |
165 * out_signal | |
166 * in_dna_text | |
167 * out_4na_bin | |
168 * out_dna_text | |
169 * out_x2na_bin | |
170 * out_2na_packed | |
171 */ | |
172 | |
173 /* INSDC:SRA:tbl:stats inherited productions | |
174 * in_stats_bin | |
175 */ | |
176 | |
177 /* NCBI:tbl:dcmp_base_space inherited productions | |
178 * out_dcmp_2na_bin | |
179 * out_dcmp_4na_bin | |
180 * out_dcmp_x2na_bin | |
181 * out_dcmp_2na_packed | |
182 */ | |
183 }; | |
184 | |
185 | |
186 /* base_space_nocol | |
187 * this table describes viewing rules | |
188 * but omits writing rules and physical column description | |
189 * in order to support older tables | |
190 * | |
191 * history: | |
192 * 1.0.1 - base explicitly upon base_space_common #1.0.1 | |
193 * 1.0.2 - base explicitly upon base_space_common #1.0.2 | |
194 * 1.0.3 - base explicitly upon base_space_common #1.0.3 | |
195 */ | |
196 table NCBI:tbl:base_space_nocol #1.0.3 | |
197 = NCBI:tbl:base_space_common #1.0.3 | |
198 , NCBI:tbl:n_encoding #1 | |
199 { | |
200 // incoming is disabled | |
201 | |
202 // synthesized dna text | |
203 INSDC:dna:text out_dna_text | |
204 = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); | |
205 | |
206 // synthesized 4na | |
207 INSDC:4na:bin out_4na_bin | |
208 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); | |
209 | |
210 // unpacked 2na with ambiguities | |
211 INSDC:x2na:bin out_x2na_bin | |
212 = ( INSDC:x2na:bin ) read_ndecode; | |
213 | |
214 // interface with n-encoded qualities | |
215 U8 read_unpack = out_2na_bin; | |
216 | |
217 /* INSDC:tbl:sequence inherited productions | |
218 * out_signal | |
219 * out_2na_packed | |
220 */ | |
221 | |
222 /* NCBI:tbl:n_encoding inherited productions | |
223 * read_ndecode | |
224 */ | |
225 }; | |
226 | |
227 /* base_space #1 | |
228 * this schema brings in standard .READ column for v1 tables | |
229 * | |
230 * history: | |
231 * 1.0.1 - base explicitly upon base_space_nocol #1.0.1 | |
232 * 1.0.2 - base explicitly upon base_space_nocol #1.0.2 | |
233 * 1.0.3 - base explicitly upon base_space_nocol #1.0.3 | |
234 */ | |
235 table NCBI:tbl:base_space #1.0.3 = NCBI:tbl:base_space_nocol #1.0.3 | |
236 { | |
237 // 2-bit 2na representation (0..3) | |
238 INSDC:2na:packed out_2na_packed = .READ; | |
239 | |
240 // no rules for writing to .READ | |
241 | |
242 /* INSDC:tbl:sequence inherited productions | |
243 * out_signal | |
244 */ | |
245 | |
246 /* NCBI:tbl:n_encoding inherited productions | |
247 * read_ndecode | |
248 */ | |
249 }; | |
250 | |
251 | |
252 /* base_space #2 | |
253 * standard current base-space table | |
254 * | |
255 * history: | |
256 * 2.0.2 - base_space_common #1.0.2 | |
257 * 2.0.3 - base_space_common #1.0.3 now has dcmp_base_space as well | |
258 */ | |
259 table NCBI:tbl:base_space #2.0.3 | |
260 = NCBI:tbl:base_space_common #1.0.3 | |
261 , NCBI:tbl:dcmp_base_space #1 | |
262 { | |
263 /* input rules | |
264 */ | |
265 | |
266 // input text | |
267 INSDC:dna:text in_dna_text | |
268 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ ); | |
269 | |
270 // input 4na bin | |
271 INSDC:4na:bin in_4na_bin | |
272 = < INSDC:4na:bin > range_validate < 0, 15 > ( READ ) | |
273 | ( INSDC:4na:bin ) unpack ( in_4na_packed ) | |
274 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text ) | |
275 | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_x2na_bin ); | |
276 | |
277 // input 4na packed | |
278 INSDC:4na:packed in_4na_packed = READ; | |
279 | |
280 // input x2na bin | |
281 INSDC:x2na:bin in_x2na_bin | |
282 = < INSDC:x2na:bin > range_validate < 0, 4 > ( READ ) | |
283 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_4na_bin ); | |
284 | |
285 // input 2na bin | |
286 INSDC:2na:bin in_2na_bin | |
287 = < INSDC:2na:bin > range_validate < 0, 3 > ( READ ) | |
288 | ( INSDC:2na:bin ) unpack ( in_2na_packed ) | |
289 | INSDC:SEQ:rand_4na_2na ( in_4na_bin ); | |
290 | |
291 // input 2na packed | |
292 INSDC:2na:packed in_2na_packed = READ; | |
293 | |
294 // input 4na alt-read ( ambiguities ) | |
295 INSDC:4na:bin in_alt_4na_bin | |
296 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin ); | |
297 | |
298 // preparing a feed into stats column | |
299 U8 in_stats_bin = in_2na_bin; | |
300 | |
301 | |
302 /* physical columns | |
303 */ | |
304 | |
305 physical column INSDC:2na:packed .READ | |
306 = in_2na_packed | |
307 | ( INSDC:2na:packed ) pack ( in_2na_bin ); | |
308 | |
309 physical column < INSDC:4na:bin > zip_encoding .ALTREAD | |
310 = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin ); | |
311 | |
312 | |
313 /* output rules | |
314 */ | |
315 | |
316 // output 2na packed | |
317 INSDC:2na:packed out_2na_packed | |
318 = .READ | |
319 | out_dcmp_2na_packed; | |
320 | |
321 // output x2na bin | |
322 INSDC:x2na:bin out_x2na_bin | |
323 = out_dcmp_x2na_bin | |
324 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ); | |
325 | |
326 // output 2na->4na bin | |
327 INSDC:4na:bin out_2na_4na_bin | |
328 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin ); | |
329 | |
330 // output 4na bin | |
331 INSDC:4na:bin out_4na_bin | |
332 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD ) | |
333 | out_dcmp_4na_bin | |
334 | out_2na_4na_bin; | |
335 | |
336 // output text | |
337 INSDC:dna:text out_dna_text | |
338 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); | |
339 | |
340 | |
341 /* INSDC:tbl:sequence inherited productions | |
342 * out_signal | |
343 */ | |
344 | |
345 /* NCBI:tbl:dcmp_base_space inherited productions | |
346 * out_dcmp_2na_bin | |
347 * out_dcmp_4na_bin | |
348 * out_dcmp_x2na_bin | |
349 * out_dcmp_2na_packed | |
350 */ | |
351 }; | |
352 | |
353 | |
354 | |
355 | |
356 /*-------------------------------------------------------------------------- | |
357 * color_space - implementation | |
358 * nucleotide sequences in color space | |
359 */ | |
360 | |
361 extern function | |
362 INSDC:x2na:bin NCBI:dna_from_color #1 ( INSDC:x2cs:bin color_bin, | |
363 INSDC:coord:zero read_start, INSDC:coord:len read_len, | |
364 INSDC:dna:text cs_key, U8 color_matrix ); | |
365 | |
366 | |
367 /* dcmp_color_space | |
368 * declares common virtual productions | |
369 */ | |
370 table NCBI:tbl:dcmp_color_space #1 | |
371 { | |
372 // rules to introduce purely virtual productions | |
373 // never expected to resolve... | |
374 INSDC:dna:text dcmp_virtual_productions | |
375 = out_dcmp_x2cs_bin | |
376 | out_dcmp_2cs_bin | |
377 | out_dcmp_2cs_packed; | |
378 } | |
379 | |
380 /* history: | |
381 * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 | |
382 * 1.0.2 - spotdesc #1.0.2 | |
383 * 1.0.3 - base upon dcmp_color_space for "out_dcmp_2cs_bin" | |
384 */ | |
385 table NCBI:tbl:color_space_common #1.0.3 | |
386 = INSDC:tbl:sequence #1.0.1 | |
387 , INSDC:SRA:tbl:spotdesc #1.0.2 | |
388 , INSDC:SRA:tbl:stats #1.1.0 | |
389 , NCBI:tbl:dcmp_color_space #1.0.0 | |
390 { | |
391 // cs_native - tells user color space is native | |
392 bool cs_native = < bool > echo < true > (); | |
393 | |
394 // unambiguous unpacked 2cs | |
395 INSDC:2cs:bin out_2cs_bin | |
396 = out_dcmp_2cs_bin | |
397 | ( INSDC:2cs:bin ) unpack ( out_2cs_packed ); | |
398 | |
399 // unambiguous synthesized 2na | |
400 INSDC:2na:bin out_2na_bin | |
401 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ); | |
402 | |
403 // synthesized unpacked 4na | |
404 INSDC:4na:bin out_4na_bin | |
405 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); | |
406 | |
407 // synthesized dna text | |
408 INSDC:dna:text out_dna_text | |
409 = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); | |
410 | |
411 // synthesized dna sequence | |
412 INSDC:x2na:bin out_x2na_bin | |
413 = NCBI:dna_from_color ( out_x2cs_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); | |
414 | |
415 // synthesized packed 2na | |
416 INSDC:2na:packed out_2na_packed | |
417 = ( INSDC:2na:packed ) pack ( out_2na_bin ); | |
418 | |
419 // synthesized packed 4na | |
420 INSDC:4na:packed out_4na_packed | |
421 = ( INSDC:4na:packed ) pack ( out_4na_bin ); | |
422 | |
423 // synthesized color text | |
424 INSDC:color:text out_color_text | |
425 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); | |
426 | |
427 // spot_len and fixed_spot_len | |
428 INSDC:coord:len color_space_spot_len | |
429 = ( INSDC:coord:len ) row_len ( out_2cs_packed ); | |
430 INSDC:coord:len color_space_fixed_spot_len | |
431 = ( INSDC:coord:len ) fixed_row_len ( out_2cs_packed ); | |
432 | |
433 /* INSDC:tbl:sequence inherited productions | |
434 * in_cs_key | |
435 * out_cs_key | |
436 * out_signal | |
437 * out_x2cs_bin | |
438 * in_color_text | |
439 * out_2cs_packed | |
440 * out_color_matrix | |
441 */ | |
442 | |
443 /* INSDC:SRA:tbl:stats inherited productions | |
444 * in_stats_bin | |
445 */ | |
446 | |
447 /* NCBI:tbl:dcmp_color_space inherited productions | |
448 * out_dcmp_2cs_bin | |
449 * out_dcmp_x2cs_bin | |
450 * out_dcmp_2cs_packed | |
451 */ | |
452 }; | |
453 | |
454 /* color_space_nocol | |
455 * this table describes viewing rules | |
456 * but omits writing rules and physical column description | |
457 * in order to support older tables | |
458 * | |
459 * history: | |
460 * 1.0.1 - base explicitly upon color_space_common #1.0.1 | |
461 * 1.0.2 - color_space_common #1.0.2 | |
462 * 1.0.3 - color_space_common #1.0.3 | |
463 */ | |
464 table NCBI:tbl:color_space_nocol #1.0.3 | |
465 = NCBI:tbl:color_space_common #1.0.3 | |
466 , NCBI:tbl:n_encoding #1 | |
467 { | |
468 // incoming is disabled | |
469 | |
470 // v1 color matrix was stored in metadata | |
471 U8 out_color_matrix | |
472 = < U8 > meta:read < "COLOR_MATRIX" > () | |
473 | < U8 > echo < INSDC:color:default_matrix > (); | |
474 | |
475 // unpacked 2cs with ambiguities | |
476 INSDC:x2cs:bin out_x2cs_bin | |
477 = ( INSDC:x2cs:bin ) read_ndecode; | |
478 | |
479 // interface with n-encoded qualities | |
480 U8 read_unpack = out_2cs_bin; | |
481 | |
482 /* INSDC:tbl:sequence inherited productions | |
483 * out_cs_key | |
484 * out_signal | |
485 * out_2cs_packed | |
486 */ | |
487 | |
488 /* NCBI:tbl:n_encoding inherited productions | |
489 * read_ndecode | |
490 */ | |
491 }; | |
492 | |
493 /* color_space #1 | |
494 * this schema brings in .CSREAD and .CS_KEY columns for v1 tables | |
495 * | |
496 * history: | |
497 * 1.0.1 - base explicitly upon color_space_nocol #1.0.1 | |
498 * 1.0.2 - color_space_nocol #1.0.2 | |
499 * 1.0.3 - color_space_nocol #1.0.3 | |
500 */ | |
501 table NCBI:tbl:color_space #1.0.3 = NCBI:tbl:color_space_nocol #1.0.3 | |
502 { | |
503 // stored as text | |
504 INSDC:dna:text out_cs_key = .CS_KEY; | |
505 | |
506 // stored color sequence | |
507 INSDC:2cs:packed out_2cs_packed = .CSREAD; | |
508 | |
509 /* INSDC:tbl:sequence inherited productions | |
510 * out_signal | |
511 */ | |
512 | |
513 /* NCBI:tbl:n_encoding inherited productions | |
514 * read_ndecode | |
515 */ | |
516 }; | |
517 | |
518 /* color_space #2 | |
519 * standard current color-space table | |
520 * | |
521 * history: | |
522 * 2.0.1 - base explicitly upon color_space_common #1.0.1 | |
523 * 2.0.2 - base explicitly upon color_space_common #1.0.2 | |
524 * 2.1.0 - introduce hooks for compressed color space | |
525 */ | |
526 table NCBI:tbl:color_space #2.1 | |
527 = NCBI:tbl:color_space_common #1.0.3 | |
528 , NCBI:tbl:dcmp_color_space #1.0.0 | |
529 { | |
530 /* input rules | |
531 */ | |
532 | |
533 // input text is not modified | |
534 // illegal values are not detected here | |
535 INSDC:color:text in_color_text = CSREAD; | |
536 | |
537 // input x2cs bin | |
538 // illegal values will be caught here | |
539 INSDC:x2cs:bin in_x2cs_bin | |
540 = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CSREAD ) | |
541 | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_color_text ); | |
542 | |
543 // input 2cs bin | |
544 INSDC:2cs:bin in_2cs_bin | |
545 = < INSDC:2cs:bin > range_validate < 0, 3 > ( CSREAD ) | |
546 | ( INSDC:2cs:bin ) unpack ( in_2cs_packed ) | |
547 | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_x2cs_bin ); | |
548 | |
549 // input 2cs packed | |
550 INSDC:2cs:packed in_2cs_packed = CSREAD; | |
551 | |
552 // input x2cs alt-csread ( ambiguity ) | |
553 INSDC:x2cs:bin in_alt_x2cs_bin | |
554 = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_x2cs_bin ); | |
555 | |
556 // color-space keys ARE modified on input | |
557 INSDC:dna:text in_cs_key | |
558 = < INSDC:dna:text, INSDC:dna:text > map < 'acgt', 'ACGT' > ( CS_KEY ); | |
559 | |
560 // color matrix | |
561 U8 in_color_matrix = < U8 > range_validate < 0, 4 > ( COLOR_MATRIX ); | |
562 | |
563 // preparing a feed into stats column | |
564 U8 in_stats_bin = in_2cs_bin; | |
565 | |
566 | |
567 /* physical columns | |
568 */ | |
569 | |
570 physical column INSDC:2cs:packed .CSREAD | |
571 = in_2cs_packed | |
572 | ( INSDC:2cs:packed ) pack ( in_2cs_bin ); | |
573 | |
574 physical column < INSDC:x2cs:bin > zip_encoding .ALTCSREAD | |
575 = < INSDC:x2cs:bin > trim < 0, 0 > ( in_alt_x2cs_bin ); | |
576 | |
577 physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key; | |
578 | |
579 physical column < U8 > zip_encoding .COLOR_MATRIX = in_color_matrix; | |
580 | |
581 | |
582 /* output rules | |
583 */ | |
584 | |
585 // output 2cs packed | |
586 INSDC:2cs:packed out_2cs_packed | |
587 = .CSREAD | |
588 | out_dcmp_2cs_packed; | |
589 | |
590 // unpacked 2cs with ambiguity | |
591 INSDC:x2cs:bin out_x2cs_bin | |
592 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, .ALTCSREAD ) | |
593 | out_dcmp_x2cs_bin | |
594 | ( INSDC:x2cs:bin ) out_2cs_bin; | |
595 | |
596 // read directly from physical column | |
597 INSDC:dna:text out_cs_key = .CS_KEY; | |
598 | |
599 // color matrix may be synthesized | |
600 U8 out_color_matrix | |
601 = .COLOR_MATRIX | |
602 | < U8 > echo < INSDC:color:default_matrix > (); | |
603 | |
604 | |
605 /* INSDC:tbl:sequence inherited productions | |
606 * out_signal | |
607 */ | |
608 | |
609 /* NCBI:tbl:dcmp_color_space inherited productions | |
610 * out_dcmp_2cs_bin | |
611 * out_dcmp_x2cs_bin | |
612 * out_dcmp_2cs_packed | |
613 */ | |
614 }; | |
615 | |
616 | |
617 /*-------------------------------------------------------------------------- | |
618 * protein | |
619 */ | |
620 table NCBI:tbl:protein #1 = INSDC:tbl:protein | |
621 { | |
622 /* upper-case letters */ | |
623 INSDC:protein:text in_protein_text = < INSDC:protein:text, INSDC:protein:text > | |
624 map < 'abcdefghijklmnopqrstvwxyzu','ABCDEFGHIJKLMNOPQRSTVWXYZU' > ( PROTEIN ); | |
625 | |
626 /* std aa */ | |
627 INSDC:aa:bin in_aa_bin | |
628 = < INSDC:aa:bin > range_validate < 1, 27 > ( PROTEIN ) | |
629 | < INSDC:protein:text, INSDC:aa:bin > map < INSDC:aa:map:CHARSET, INSDC:aa:map:BINSET > ( in_protein_text ); | |
630 | |
631 /* physical column */ | |
632 physical column < INSDC:aa:bin > zip_encoding .PROTEIN = in_aa_bin; | |
633 | |
634 /* output rules */ | |
635 INSDC:aa:bin out_aa_bin = .PROTEIN; | |
636 INSDC:protein:text out_protein_text = < INSDC:aa:bin, INSDC:protein:text > | |
637 map < INSDC:aa:map:BINSET, INSDC:aa:map:CHARSET > ( out_aa_bin ); | |
638 }; | |
639 | |
640 | |
641 /*-------------------------------------------------------------------------- | |
642 * phred | |
643 * standard phred quality representation | |
644 * limits values on input to 1..63 | |
645 * reserves value 0 as ambiguity symbol for reads | |
646 */ | |
647 | |
648 | |
649 /* history: | |
650 * 1.0.1 - base explicitly upon sequence #1.0.1 | |
651 */ | |
652 table NCBI:tbl:phred_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1 | |
653 { | |
654 /* [CS]READ - decoding | |
655 */ | |
656 U8 read_ndecode | |
657 = < INSDC:quality:phred, U8 > map < 0, 4 > ( out_qual_phred, read_unpack ); | |
658 | |
659 /* INSDC:tbl:sequence inherited productions | |
660 * out_qual_phred | |
661 * out_qual_text_phred_33 | |
662 * out_qual_text_phred_64 | |
663 */ | |
664 | |
665 /* NCBI:tbl:n_encoding inherited productions | |
666 * read_unpack | |
667 */ | |
668 }; | |
669 | |
670 /* history: | |
671 * 1.0.1 - base explicitly upon phred_quality_nocol #1.0.1 | |
672 */ | |
673 table NCBI:tbl:phred_quality #1.0.1 = NCBI:tbl:phred_quality_nocol #1.0.1 | |
674 { | |
675 // read directly as n-encoded phred is compatible with phred | |
676 NCBI:quality:n_encoded:phred out_qual_phred = .QUALITY; | |
677 | |
678 /* INSDC:tbl:sequence inherited productions | |
679 * out_qual_text_phred_33 | |
680 * out_qual_text_phred_64 | |
681 */ | |
682 | |
683 /* NCBI:tbl:n_encoding inherited productions | |
684 * read_unpack | |
685 */ | |
686 }; | |
687 | |
688 /* history: | |
689 * 2.0.1 - added feed of in_stats_qual | |
690 * 2.0.2 - added input of text encodings | |
691 * 2.0.3 - base explicitly upon sequence #1.0.1 | |
692 * 2.0.4 - change compression from izip to zip | |
693 * 2.0.5 - ( planned, not yet in effect ) change from zip to delta_average_zip | |
694 */ | |
695 table NCBI:tbl:phred_quality #2.0.4 = INSDC:tbl:sequence #1.0.1 | |
696 { | |
697 // read directly quality as phred | |
698 INSDC:quality:phred out_qual_phred = .QUALITY; | |
699 | |
700 // input rules | |
701 INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY; | |
702 INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY; | |
703 | |
704 INSDC:quality:phred in_qual_phred | |
705 = QUALITY | |
706 | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 ) | |
707 | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 ); | |
708 | |
709 // physical storage | |
710 /*** next line is for future change in production, but we have to wait until supporting code is released to the public ***/ | |
711 // physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY = in_qual_phred; | |
712 /*** NB *** MUST change table version to 2.0.5 and propagate to all derived tables ***/ | |
713 physical column < INSDC:quality:phred > zip_encoding .QUALITY = in_qual_phred; | |
714 | |
715 // feed to compressed statistics | |
716 INSDC:quality:phred in_stats_qual = in_qual_phred; | |
717 | |
718 /* INSDC:tbl:sequence inherited productions | |
719 * out_qual_text_phred_33 | |
720 * out_qual_text_phred_64 | |
721 */ | |
722 }; | |
723 | |
724 | |
725 | |
726 /*-------------------------------------------------------------------------- | |
727 * log_odds | |
728 * log-odds quality score support | |
729 * | |
730 * conversion from log-odds to phred is via formula | |
731 * 10 * log ( 1 + pow ( 10, x / 10 ) ) / log ( 10 ) + 0.499 | |
732 * for x = -5..40 : when x = -6, phred = 0 | |
733 */ | |
734 | |
735 // the map function requires two lookup tables: | |
736 // the first table detects every legal value... | |
737 const INSDC:quality:log_odds NCBI:quality:from:log_odds = | |
738 [ | |
739 -6,-5,-4,-3,-2,-1, 0, | |
740 1, 2, 3, 4, 5, 6, 7, 8, 9,10, | |
741 11,12,13,14,15,16,17,18,19,20, | |
742 21,22,23,24,25,26,27,28,29,30, | |
743 31,32,33,34,35,36,37,38,39,40 | |
744 ]; | |
745 | |
746 // ...the second table gives positional translations | |
747 const INSDC:quality:phred NCBI:quality:to:phred = | |
748 [ | |
749 0, 1, 1, 2, 2, 3, 3, | |
750 4, 4, 5, 5, 6, 7, 8, 9,10,10, | |
751 11,12,13,14,15,16,17,18,19,20, | |
752 21,22,23,24,25,26,27,28,29,30, | |
753 31,32,33,34,35,36,37,38,39,40 | |
754 ]; | |
755 | |
756 function | |
757 INSDC:quality:phred NCBI:log_odds_to_phred #1 ( INSDC:quality:log_odds qual_log_odds ) | |
758 { | |
759 // this range enforcement may not be required | |
760 INSDC:quality:log_odds log_odds_clip | |
761 = < INSDC:quality:log_odds > clip < -6, 40 > ( qual_log_odds ); | |
762 | |
763 // use the tables above to map from log-odds to phred | |
764 return < INSDC:quality:log_odds, INSDC:quality:phred > | |
765 map < NCBI:quality:from:log_odds, NCBI:quality:to:phred > ( log_odds_clip ); | |
766 } | |
767 | |
768 /* history: | |
769 * 1.0.1 - base explicitly upon sequence #1.0.1 | |
770 */ | |
771 table NCBI:tbl:log_odds_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1 | |
772 { | |
773 /* READ - decoding | |
774 */ | |
775 U8 read_ndecode | |
776 = < INSDC:quality:log_odds, U8 > map < -6, 4 > ( out_qual_log_odds, read_unpack ); | |
777 | |
778 /* QUALITY | |
779 * declared in INSDC:tbl:sequence as phred | |
780 * introduce here as log-odds | |
781 */ | |
782 extern column INSDC:quality:log_odds QUALITY = out_qual_log_odds; | |
783 | |
784 // resolve for phred | |
785 INSDC:quality:phred out_qual_phred | |
786 = out_qual2_phred | |
787 | NCBI:log_odds_to_phred ( out_qual_log_odds ); | |
788 | |
789 /* INSDC:tbl:sequence inherited productions | |
790 * out_qual_text_phred_33 | |
791 * out_qual_text_phred_64 | |
792 */ | |
793 | |
794 /* NCBI:tbl:n_encoding inherited productions | |
795 * read_unpack | |
796 */ | |
797 | |
798 /* NCBI:tbl:log_odds_quality_nocol productions | |
799 * out_qual2_phred | |
800 * out_qual_log_odds | |
801 */ | |
802 }; | |
803 | |
804 /* history: | |
805 * 1.0.1 - base explicitly upon log_odds_quality_nocol #1.0.1 | |
806 */ | |
807 table NCBI:tbl:log_odds_quality #1.0.1 = NCBI:tbl:log_odds_quality_nocol #1.0.1 | |
808 { | |
809 // read directly as n-encoded log_odds is compatible with log_odds | |
810 NCBI:quality:n_encoded:log_odds out_qual_log_odds = .QUALITY; | |
811 | |
812 /* INSDC:tbl:sequence inherited productions | |
813 * out_qual_text_phred_33 | |
814 * out_qual_text_phred_64 | |
815 */ | |
816 | |
817 /* NCBI:tbl:n_encoding inherited productions | |
818 * read_unpack | |
819 */ | |
820 | |
821 /* NCBI:tbl:log_odds_quality_nocol inherited productions | |
822 * out_qual2_phred | |
823 */ | |
824 }; | |
825 | |
826 /* history: | |
827 * 2.0.1 - base explicitly upon sequence #1.0.1 | |
828 * 2.1.0 - added production of in_qual_phred | |
829 */ | |
830 table NCBI:tbl:log_odds_quality_nocol #2.1.0 = INSDC:tbl:sequence #1.0.1 | |
831 { | |
832 /* QUALITY | |
833 * declared in INSDC:tbl:sequence as phred | |
834 * introduce here as log-odds | |
835 */ | |
836 extern column INSDC:quality:log_odds QUALITY | |
837 = out_qual_log_odds; | |
838 | |
839 // resolve for phred | |
840 INSDC:quality:phred in_qual_phred | |
841 = NCBI:log_odds_to_phred ( in_qual_log_odds ); | |
842 | |
843 INSDC:quality:phred out_qual_phred | |
844 = NCBI:log_odds_to_phred ( out_qual_log_odds ); | |
845 | |
846 | |
847 /* INSDC:tbl:sequence inherited productions | |
848 * out_qual_text_phred_33 | |
849 * out_qual_text_phred_64 | |
850 */ | |
851 | |
852 /* NCBI:tbl:log_odds_quality_nocol productions | |
853 * out_qual_log_odds | |
854 */ | |
855 }; | |
856 | |
857 /* history: | |
858 * 2.0.1 - added feed of in_stats_qual | |
859 * 2.0.2 - added input of text encodings | |
860 * 2.0.3 - base explicitly upon log_odds_quality_nocol #2.0.1 | |
861 * 2.0.4 - changed compression from izip to zip | |
862 * 2.1.0 - base explicitly upon log_odds_quality_nocol #2.1.0 | |
863 */ | |
864 table NCBI:tbl:log_odds_quality #2.1.0 = NCBI:tbl:log_odds_quality_nocol #2.1.0 | |
865 { | |
866 INSDC:quality:log_odds out_qual_log_odds= .QUALITY; | |
867 | |
868 extern column INSDC:quality:text:log_odds_64 QUALITY | |
869 = out_qual_text_log_odds_64 | |
870 | ( INSDC:quality:text:log_odds_64 ) < B8 > sum < 64 > ( out_qual_log_odds ); | |
871 | |
872 // input rules | |
873 INSDC:quality:text:log_odds_64 in_qual_text_log_odds_64 = QUALITY; | |
874 | |
875 INSDC:quality:log_odds in_qual_log_odds | |
876 = QUALITY | |
877 | ( INSDC:quality:log_odds ) < B8 > diff < 64 > ( in_qual_text_log_odds_64 ); | |
878 | |
879 physical column < INSDC:quality:log_odds > zip_encoding .QUALITY | |
880 = in_qual_log_odds; | |
881 | |
882 // feed to compressed statistics | |
883 INSDC:quality:log_odds in_stats_qual = in_qual_log_odds; | |
884 | |
885 | |
886 /* INSDC:tbl:sequence inherited productions | |
887 * out_qual_text_phred_33 | |
888 * out_qual_text_phred_64 | |
889 */ | |
890 | |
891 /* NCBI:tbl:log_odds_quality productions | |
892 * out_qual_text_log_odds_64 | |
893 */ | |
894 }; |