comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/seq.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * Sequence schema implementation tables
29 */
30 version 1;
31
32 include 'vdb/vdb.vschema';
33 include 'ncbi/ncbi.vschema';
34 include 'insdc/sra.vschema';
35
36
37 /*--------------------------------------------------------------------------
38 * n_encoding - implementation
39 * introduces common virtual productions
40 */
41 table NCBI:tbl:n_encoding #1
42 {
43 U8 n_encoding_dummy // apparently a placeholder: its only visible role is to name the two productions below
44 = read_unpack // given a rule in base_space_nocol and color_space_nocol in this file
45 | read_ndecode; // given a rule in phred_quality_nocol #1 and log_odds_quality_nocol #1 in this file
46 };
47
48
49 /*--------------------------------------------------------------------------
50 * seqloc
51 * NCBI sequence locator table
52 */
53 table NCBI:tbl:seqloc #1.0
54 {
55 /* SEQ_ID
56 * a FASTA-style SeqId
57 */
58 extern column < ascii > zip_encoding SEQ_ID; // ascii column declared with zip_encoding
59
60 /* SEQ_START
61 * provided in both 1 ( default ) and 0-based coordinates
62 */
63 extern default column < INSDC:coord:one > izip_encoding SEQ_START; // default form: 1-based, izip_encoding
64 readonly column INSDC:coord:zero SEQ_START // read-only 0-based form of the same column name
65 = ( INSDC:coord:zero ) < INSDC:coord:one > diff < 1 > ( .SEQ_START ); // derived from the physical .SEQ_START via diff < 1 > (presumably subtracts 1 - confirm)
66
67 /* SEQ_LEN
68 */
69 extern column < INSDC:coord:len > izip_encoding SEQ_LEN; // length column declared with izip_encoding
70 };
71
72
73 /*--------------------------------------------------------------------------
74 * base_space - implementation
75 * READ column rules
76 */
77
78 /* color_from_dna
79 * use starting keys and color matrix to convert individual reads
80 * to base space.
81 */
82 extern function // declaration only: no body is given in this file
83 INSDC:x2cs:bin NCBI:color_from_dna #1 ( INSDC:x2na:bin bin_x2na, // returns x2cs bin; first input is x2na bin
84 INSDC:coord:zero read_start, INSDC:coord:len read_len, // zero-based read starts and read lengths
85 INSDC:dna:text cs_key, U8 color_matrix ); // starting key(s) as dna text, and the color matrix as U8
86
87
88 /* dcmp_base_space
89 * table to introduce common virtual productions
90 */
91 table NCBI:tbl:dcmp_base_space #1
92 {
93 // rules to introduce purely virtual productions
94 // never expected to resolve...
95 INSDC:dna:text dcmp_virtual_productions // names the four out_dcmp_* productions; tables in this file that inherit from this one reference them
96 = out_dcmp_4na_bin
97 | out_dcmp_x2na_bin
98 | out_dcmp_2na_bin
99 | out_dcmp_2na_packed;
100 }
101
102 /* history:
103 * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
104 * 1.0.2 - spotdesc #1.0.2
105 * 1.0.3 - base upon dcmp_base_space for "out_dcmp_2na_bin"
106 */
107 table NCBI:tbl:base_space_common #1.0.3
108 = INSDC:tbl:sequence #1.0.1
109 , INSDC:SRA:tbl:spotdesc #1.0.2
110 , INSDC:SRA:tbl:stats #1.1.0
111 , NCBI:tbl:dcmp_base_space #1.0.0
112 { // output-side rules shared by the base-space tables in this file; no physical columns are declared in this table
113 /* INSDC:tbl:sequence inherited virtual productions
114 */
115
116 // cs_native - tells user color space is not native
117 bool cs_native = < bool > echo < false > ();
118
119 // in_cs_key is not writable in base_space
120
121 // color-space key is completely artificial
122 INSDC:dna:text out_cs_key
123 = .CS_KEY // first alternative: the physical .CS_KEY column
124 | < INSDC:dna:text > echo < 'T' > ( out_read_type ) // every remaining alternative echoes the constant 'T'
125 | < INSDC:dna:text > echo < 'T' > ( out_read_len )
126 | < INSDC:dna:text > echo < 'T' > ();
127
128 // unambiguous synthesized 2cs
129 INSDC:2cs:bin out_2cs_bin
130 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin ); // the fifth BINSET entry is mapped to 0
131
132 // unambiguous unpacked 2na
133 INSDC:2na:bin out_2na_bin
134 = out_dcmp_2na_bin // production named by NCBI:tbl:dcmp_base_space
135 | ( INSDC:2na:bin ) unpack ( out_2na_packed ); // otherwise unpack the packed 2na form
136
137 // synthesized color sequence
138 INSDC:x2cs:bin out_x2cs_bin
139 = NCBI:color_from_dna ( out_x2na_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); // extern function declared earlier in this file
140
141 // synthesized packed 2cs
142 INSDC:2cs:packed out_2cs_packed
143 = ( INSDC:2cs:packed ) pack ( out_2cs_bin );
144
145 // synthesized packed 4na
146 INSDC:4na:packed out_4na_packed
147 = ( INSDC:4na:packed ) pack ( out_4na_bin );
148
149 // synthesized color text
150 INSDC:color:text out_color_text
151 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); // bin-to-character translation of out_x2cs_bin
152
153 // published color matrix
154 U8 out_color_matrix
155 = < U8 > echo < INSDC:color:default_matrix > (); // always the default matrix in this table
156
157 // spot_len and fixed_spot_len
158 INSDC:coord:len base_space_spot_len
159 = ( INSDC:coord:len ) row_len ( out_2na_packed ); // both lengths are taken from out_2na_packed
160 INSDC:coord:len base_space_fixed_spot_len
161 = ( INSDC:coord:len ) fixed_row_len ( out_2na_packed );
162
163
164 /* INSDC:tbl:sequence inherited productions
165 * out_signal
166 * in_dna_text
167 * out_4na_bin
168 * out_dna_text
169 * out_x2na_bin
170 * out_2na_packed
171 */
172
173 /* INSDC:SRA:tbl:stats inherited productions
174 * in_stats_bin
175 */
176
177 /* NCBI:tbl:dcmp_base_space inherited productions
178 * out_dcmp_2na_bin
179 * out_dcmp_4na_bin
180 * out_dcmp_x2na_bin
181 * out_dcmp_2na_packed
182 */
183 };
184
185
186 /* base_space_nocol
187 * this table describes viewing rules
188 * but omits writing rules and physical column description
189 * in order to support older tables
190 *
191 * history:
192 * 1.0.1 - base explicitly upon base_space_common #1.0.1
193 * 1.0.2 - base explicitly upon base_space_common #1.0.2
194 * 1.0.3 - base explicitly upon base_space_common #1.0.3
195 */
196 table NCBI:tbl:base_space_nocol #1.0.3
197 = NCBI:tbl:base_space_common #1.0.3
198 , NCBI:tbl:n_encoding #1
199 {
200 // incoming is disabled
201
202 // synthesized dna text
203 INSDC:dna:text out_dna_text
204 = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); // bin-to-character translation of out_x2na_bin
205
206 // synthesized 4na
207 INSDC:4na:bin out_4na_bin
208 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); // the five BINSET entries become 1, 2, 4, 8, 15
209
210 // unpacked 2na with ambiguities
211 INSDC:x2na:bin out_x2na_bin
212 = ( INSDC:x2na:bin ) read_ndecode; // read_ndecode is given a rule by the *_quality_nocol #1 tables in this file
213
214 // interface with n-encoded qualities
215 U8 read_unpack = out_2na_bin; // supplies the read_unpack production named by NCBI:tbl:n_encoding
216
217 /* INSDC:tbl:sequence inherited productions
218 * out_signal
219 * out_2na_packed
220 */
221
222 /* NCBI:tbl:n_encoding inherited productions
223 * read_ndecode
224 */
225 };
226
227 /* base_space #1
228 * this schema brings in standard .READ column for v1 tables
229 *
230 * history:
231 * 1.0.1 - base explicitly upon base_space_nocol #1.0.1
232 * 1.0.2 - base explicitly upon base_space_nocol #1.0.2
233 * 1.0.3 - base explicitly upon base_space_nocol #1.0.3
234 */
235 table NCBI:tbl:base_space #1.0.3 = NCBI:tbl:base_space_nocol #1.0.3
236 {
237 // 2-bit 2na representation (0..3)
238 INSDC:2na:packed out_2na_packed = .READ; // the only rule this table adds: packed 2na is read from the physical .READ column
239
240 // no rules for writing to .READ
241
242 /* INSDC:tbl:sequence inherited productions
243 * out_signal
244 */
245
246 /* NCBI:tbl:n_encoding inherited productions
247 * read_ndecode
248 */
249 };
250
251
252 /* base_space #2
253 * standard current base-space table
254 *
255 * history:
256 * 2.0.2 - base_space_common #1.0.2
257 * 2.0.3 - base_space_common #1.0.3 now has dcmp_base_space as well
258 */
259 table NCBI:tbl:base_space #2.0.3
260 = NCBI:tbl:base_space_common #1.0.3
261 , NCBI:tbl:dcmp_base_space #1
262 {
263 /* input rules
264 */
265
266 // input text: '.' is translated to 'N' and the listed lower-case letters to upper case
267 INSDC:dna:text in_dna_text
268 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ );
269
270 // input 4na bin
271 INSDC:4na:bin in_4na_bin
272 = < INSDC:4na:bin > range_validate < 0, 15 > ( READ ) // READ supplied directly as 4na bin, validated against 0..15
273 | ( INSDC:4na:bin ) unpack ( in_4na_packed ) // or unpacked from packed 4na input
274 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text ) // or translated from text input
275 | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_x2na_bin ); // or translated from x2na bin input
276
277 // input 4na packed
278 INSDC:4na:packed in_4na_packed = READ;
279
280 // input x2na bin
281 INSDC:x2na:bin in_x2na_bin
282 = < INSDC:x2na:bin > range_validate < 0, 4 > ( READ ) // READ supplied directly as x2na bin, validated against 0..4
283 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_4na_bin ); // assuming BINSET is 0..15 in order (TODO confirm): 1,2,4,8 -> 0,1,2,3 and everything else -> 4
284
285 // input 2na bin
286 INSDC:2na:bin in_2na_bin
287 = < INSDC:2na:bin > range_validate < 0, 3 > ( READ ) // READ supplied directly as 2na bin, validated against 0..3
288 | ( INSDC:2na:bin ) unpack ( in_2na_packed ) // or unpacked from packed 2na input
289 | INSDC:SEQ:rand_4na_2na ( in_4na_bin ); // or produced from 4na by INSDC:SEQ:rand_4na_2na (not defined in this file)
290
291 // input 2na packed
292 INSDC:2na:packed in_2na_packed = READ;
293
294 // input 4na alt-read ( ambiguities )
295 INSDC:4na:bin in_alt_4na_bin
296 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin ); // assuming BINSET is 0..15 in order (TODO confirm): 0 -> 15, 1/2/4/8 -> 0, other values unchanged
297
298 // preparing a feed into stats column
299 U8 in_stats_bin = in_2na_bin;
300
301
302 /* physical columns
303 */
304
305 physical column INSDC:2na:packed .READ
306 = in_2na_packed // packed input is used as given
307 | ( INSDC:2na:packed ) pack ( in_2na_bin ); // otherwise pack the 2na bin input
308
309 physical column < INSDC:4na:bin > zip_encoding .ALTREAD
310 = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin ); // in_alt_4na_bin passed through trim < 0, 0 > before storage
311
312
313 /* output rules
314 */
315
316 // output 2na packed
317 INSDC:2na:packed out_2na_packed
318 = .READ
319 | out_dcmp_2na_packed;
320
321 // output x2na bin
322 INSDC:x2na:bin out_x2na_bin
323 = out_dcmp_x2na_bin
324 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ); // same translation table as in_x2na_bin above
325
326 // output 2na->4na bin
327 INSDC:4na:bin out_2na_4na_bin
328 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin ); // the four 2na BINSET entries become 1, 2, 4, 8
329
330 // output 4na bin
331 INSDC:4na:bin out_4na_bin
332 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD ) // combines the 2na-derived 4na with the stored .ALTREAD column
333 | out_dcmp_4na_bin
334 | out_2na_4na_bin; // last alternative: the 2na-derived 4na alone
335
336 // output text
337 INSDC:dna:text out_dna_text
338 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); // bin-to-character translation of out_4na_bin
339
340
341 /* INSDC:tbl:sequence inherited productions
342 * out_signal
343 */
344
345 /* NCBI:tbl:dcmp_base_space inherited productions
346 * out_dcmp_2na_bin
347 * out_dcmp_4na_bin
348 * out_dcmp_x2na_bin
349 * out_dcmp_2na_packed
350 */
351 };
352
353
354
355
356 /*--------------------------------------------------------------------------
357 * color_space - implementation
358 * nucleotide sequences in color space
359 */
360
361 extern function // declaration only: no body is given in this file
362 INSDC:x2na:bin NCBI:dna_from_color #1 ( INSDC:x2cs:bin color_bin, // returns x2na bin; first input is x2cs bin (mirror of NCBI:color_from_dna)
363 INSDC:coord:zero read_start, INSDC:coord:len read_len, // zero-based read starts and read lengths
364 INSDC:dna:text cs_key, U8 color_matrix ); // color-space key(s) as dna text, and the color matrix as U8
365
366
367 /* dcmp_color_space
368 * declares common virtual productions
369 */
370 table NCBI:tbl:dcmp_color_space #1
371 {
372 // rules to introduce purely virtual productions
373 // never expected to resolve...
374 INSDC:dna:text dcmp_virtual_productions // names the three out_dcmp_* productions; tables in this file that inherit from this one reference them
375 = out_dcmp_x2cs_bin
376 | out_dcmp_2cs_bin
377 | out_dcmp_2cs_packed;
378 }
379
380 /* history:
381 * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1
382 * 1.0.2 - spotdesc #1.0.2
383 * 1.0.3 - base upon dcmp_color_space for "out_dcmp_2cs_bin"
384 */
385 table NCBI:tbl:color_space_common #1.0.3
386 = INSDC:tbl:sequence #1.0.1
387 , INSDC:SRA:tbl:spotdesc #1.0.2
388 , INSDC:SRA:tbl:stats #1.1.0
389 , NCBI:tbl:dcmp_color_space #1.0.0
390 { // output-side rules shared by the color-space tables in this file; no physical columns are declared in this table
391 // cs_native - tells user color space is native
392 bool cs_native = < bool > echo < true > ();
393
394 // unambiguous unpacked 2cs
395 INSDC:2cs:bin out_2cs_bin
396 = out_dcmp_2cs_bin // production named by NCBI:tbl:dcmp_color_space
397 | ( INSDC:2cs:bin ) unpack ( out_2cs_packed ); // otherwise unpack the packed 2cs form
398
399 // unambiguous synthesized 2na
400 INSDC:2na:bin out_2na_bin
401 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ); // the fifth BINSET entry is mapped to 0
402
403 // synthesized unpacked 4na
404 INSDC:4na:bin out_4na_bin
405 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); // the five BINSET entries become 1, 2, 4, 8, 15
406
407 // synthesized dna text
408 INSDC:dna:text out_dna_text
409 = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); // bin-to-character translation of out_x2na_bin
410
411 // synthesized dna sequence
412 INSDC:x2na:bin out_x2na_bin
413 = NCBI:dna_from_color ( out_x2cs_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); // extern function declared earlier in this file
414
415 // synthesized packed 2na
416 INSDC:2na:packed out_2na_packed
417 = ( INSDC:2na:packed ) pack ( out_2na_bin );
418
419 // synthesized packed 4na
420 INSDC:4na:packed out_4na_packed
421 = ( INSDC:4na:packed ) pack ( out_4na_bin );
422
423 // synthesized color text
424 INSDC:color:text out_color_text
425 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); // bin-to-character translation of out_x2cs_bin
426
427 // spot_len and fixed_spot_len
428 INSDC:coord:len color_space_spot_len
429 = ( INSDC:coord:len ) row_len ( out_2cs_packed ); // both lengths are taken from out_2cs_packed
430 INSDC:coord:len color_space_fixed_spot_len
431 = ( INSDC:coord:len ) fixed_row_len ( out_2cs_packed );
432
433 /* INSDC:tbl:sequence inherited productions
434 * in_cs_key
435 * out_cs_key
436 * out_signal
437 * out_x2cs_bin
438 * in_color_text
439 * out_2cs_packed
440 * out_color_matrix
441 */
442
443 /* INSDC:SRA:tbl:stats inherited productions
444 * in_stats_bin
445 */
446
447 /* NCBI:tbl:dcmp_color_space inherited productions
448 * out_dcmp_2cs_bin
449 * out_dcmp_x2cs_bin
450 * out_dcmp_2cs_packed
451 */
452 };
453
454 /* color_space_nocol
455 * this table describes viewing rules
456 * but omits writing rules and physical column description
457 * in order to support older tables
458 *
459 * history:
460 * 1.0.1 - base explicitly upon color_space_common #1.0.1
461 * 1.0.2 - color_space_common #1.0.2
462 * 1.0.3 - color_space_common #1.0.3
463 */
464 table NCBI:tbl:color_space_nocol #1.0.3
465 = NCBI:tbl:color_space_common #1.0.3
466 , NCBI:tbl:n_encoding #1
467 {
468 // incoming is disabled
469
470 // v1 color matrix was stored in metadata
471 U8 out_color_matrix
472 = < U8 > meta:read < "COLOR_MATRIX" > () // read from the metadata node named COLOR_MATRIX
473 | < U8 > echo < INSDC:color:default_matrix > (); // otherwise the default matrix
474
475 // unpacked 2cs with ambiguities
476 INSDC:x2cs:bin out_x2cs_bin
477 = ( INSDC:x2cs:bin ) read_ndecode; // read_ndecode is given a rule by the *_quality_nocol #1 tables in this file
478
479 // interface with n-encoded qualities
480 U8 read_unpack = out_2cs_bin; // supplies the read_unpack production named by NCBI:tbl:n_encoding
481
482 /* INSDC:tbl:sequence inherited productions
483 * out_cs_key
484 * out_signal
485 * out_2cs_packed
486 */
487
488 /* NCBI:tbl:n_encoding inherited productions
489 * read_ndecode
490 */
491 };
492
493 /* color_space #1
494 * this schema brings in .CSREAD and .CS_KEY columns for v1 tables
495 *
496 * history:
497 * 1.0.1 - base explicitly upon color_space_nocol #1.0.1
498 * 1.0.2 - color_space_nocol #1.0.2
499 * 1.0.3 - color_space_nocol #1.0.3
500 */
501 table NCBI:tbl:color_space #1.0.3 = NCBI:tbl:color_space_nocol #1.0.3
502 {
503 // stored as text
504 INSDC:dna:text out_cs_key = .CS_KEY; // key is read from the physical .CS_KEY column
505
506 // stored color sequence
507 INSDC:2cs:packed out_2cs_packed = .CSREAD; // packed 2cs is read from the physical .CSREAD column
508
509 /* INSDC:tbl:sequence inherited productions
510 * out_signal
511 */
512
513 /* NCBI:tbl:n_encoding inherited productions
514 * read_ndecode
515 */
516 };
517
518 /* color_space #2
519 * standard current color-space table
520 *
521 * history:
522 * 2.0.1 - base explicitly upon color_space_common #1.0.1
523 * 2.0.2 - base explicitly upon color_space_common #1.0.2
524 * 2.1.0 - introduce hooks for compressed color space
525 */
526 table NCBI:tbl:color_space #2.1
527 = NCBI:tbl:color_space_common #1.0.3
528 , NCBI:tbl:dcmp_color_space #1.0.0
529 {
530 /* input rules
531 */
532
533 // input text is not modified
534 // illegal values are not detected here
535 INSDC:color:text in_color_text = CSREAD;
536
537 // input x2cs bin
538 // illegal values will be caught here
539 INSDC:x2cs:bin in_x2cs_bin
540 = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CSREAD ) // CSREAD supplied directly as x2cs bin, validated against 0..4
541 | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_color_text ); // or translated from color text
542
543 // input 2cs bin
544 INSDC:2cs:bin in_2cs_bin
545 = < INSDC:2cs:bin > range_validate < 0, 3 > ( CSREAD ) // CSREAD supplied directly as 2cs bin, validated against 0..3
546 | ( INSDC:2cs:bin ) unpack ( in_2cs_packed ) // or unpacked from packed 2cs input
547 | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_x2cs_bin ); // or from x2cs, with the fifth BINSET entry mapped to 0
548
549 // input 2cs packed
550 INSDC:2cs:packed in_2cs_packed = CSREAD;
551
552 // input x2cs alt-csread ( ambiguity )
553 INSDC:x2cs:bin in_alt_x2cs_bin
554 = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_x2cs_bin ); // first four BINSET entries -> 0, fifth -> 4
555
556 // color-space keys ARE modified on input
557 INSDC:dna:text in_cs_key
558 = < INSDC:dna:text, INSDC:dna:text > map < 'acgt', 'ACGT' > ( CS_KEY ); // lower-case a, c, g, t are translated to upper case
559
560 // color matrix
561 U8 in_color_matrix = < U8 > range_validate < 0, 4 > ( COLOR_MATRIX ); // matrix values are validated against 0..4
562
563 // preparing a feed into stats column
564 U8 in_stats_bin = in_2cs_bin;
565
566
567 /* physical columns
568 */
569
570 physical column INSDC:2cs:packed .CSREAD
571 = in_2cs_packed // packed input is used as given
572 | ( INSDC:2cs:packed ) pack ( in_2cs_bin ); // otherwise pack the 2cs bin input
573
574 physical column < INSDC:x2cs:bin > zip_encoding .ALTCSREAD
575 = < INSDC:x2cs:bin > trim < 0, 0 > ( in_alt_x2cs_bin ); // in_alt_x2cs_bin passed through trim < 0, 0 > before storage
576
577 physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key;
578
579 physical column < U8 > zip_encoding .COLOR_MATRIX = in_color_matrix;
580
581
582 /* output rules
583 */
584
585 // output 2cs packed
586 INSDC:2cs:packed out_2cs_packed
587 = .CSREAD
588 | out_dcmp_2cs_packed;
589
590 // unpacked 2cs with ambiguity
591 INSDC:x2cs:bin out_x2cs_bin
592 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, .ALTCSREAD ) // combines the 2cs values with the stored .ALTCSREAD column
593 | out_dcmp_x2cs_bin
594 | ( INSDC:x2cs:bin ) out_2cs_bin; // last alternative: the 2cs values alone, cast to x2cs
595
596 // read directly from physical column
597 INSDC:dna:text out_cs_key = .CS_KEY;
598
599 // color matrix may be synthesized
600 U8 out_color_matrix
601 = .COLOR_MATRIX // stored matrix
602 | < U8 > echo < INSDC:color:default_matrix > (); // otherwise the default matrix
603
604
605 /* INSDC:tbl:sequence inherited productions
606 * out_signal
607 */
608
609 /* NCBI:tbl:dcmp_color_space inherited productions
610 * out_dcmp_2cs_bin
611 * out_dcmp_x2cs_bin
612 * out_dcmp_2cs_packed
613 */
614 };
615
616
617 /*--------------------------------------------------------------------------
618 * protein
619 */
620 table NCBI:tbl:protein #1 = INSDC:tbl:protein
621 {
622 /* upper-case letters: each listed lower-case letter is translated to its upper-case form */
623 INSDC:protein:text in_protein_text = < INSDC:protein:text, INSDC:protein:text >
624 map < 'abcdefghijklmnopqrstvwxyzu','ABCDEFGHIJKLMNOPQRSTVWXYZU' > ( PROTEIN );
625
626 /* std aa */
627 INSDC:aa:bin in_aa_bin
628 = < INSDC:aa:bin > range_validate < 1, 27 > ( PROTEIN ) // PROTEIN supplied directly as aa bin, validated against 1..27
629 | < INSDC:protein:text, INSDC:aa:bin > map < INSDC:aa:map:CHARSET, INSDC:aa:map:BINSET > ( in_protein_text ); // or translated from the upper-cased text
630
631 /* physical column */
632 physical column < INSDC:aa:bin > zip_encoding .PROTEIN = in_aa_bin;
633
634 /* output rules */
635 INSDC:aa:bin out_aa_bin = .PROTEIN; // binary form is read straight from the physical column
636 INSDC:protein:text out_protein_text = < INSDC:aa:bin, INSDC:protein:text >
637 map < INSDC:aa:map:BINSET, INSDC:aa:map:CHARSET > ( out_aa_bin ); // text form is the bin-to-character translation
638 };
639
640
641 /*--------------------------------------------------------------------------
642 * phred
643 * standard phred quality representation
644 * limits values on input to 1..63
645 * reserves value 0 as ambiguity symbol for reads
646 */
647
648
649 /* history:
650 * 1.0.1 - base explicitly upon sequence #1.0.1
651 */
652 table NCBI:tbl:phred_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1
653 {
654 /* [CS]READ - decoding
655 */
656 U8 read_ndecode // supplies the read_ndecode production named by NCBI:tbl:n_encoding
657 = < INSDC:quality:phred, U8 > map < 0, 4 > ( out_qual_phred, read_unpack ); // two-input map; presumably yields 4 where quality is 0 and the read_unpack value otherwise - TODO confirm
658
659 /* INSDC:tbl:sequence inherited productions
660 * out_qual_phred
661 * out_qual_text_phred_33
662 * out_qual_text_phred_64
663 */
664
665 /* NCBI:tbl:n_encoding inherited productions
666 * read_unpack
667 */
668 };
669
670 /* history:
671 * 1.0.1 - base explicitly upon phred_quality_nocol #1.0.1
672 */
673 table NCBI:tbl:phred_quality #1.0.1 = NCBI:tbl:phred_quality_nocol #1.0.1
674 {
675 // read directly as n-encoded phred is compatible with phred
676 NCBI:quality:n_encoded:phred out_qual_phred = .QUALITY; // the only rule this table adds: quality is read from the physical .QUALITY column
677
678 /* INSDC:tbl:sequence inherited productions
679 * out_qual_text_phred_33
680 * out_qual_text_phred_64
681 */
682
683 /* NCBI:tbl:n_encoding inherited productions
684 * read_unpack
685 */
686 };
687
688 /* history:
689 * 2.0.1 - added feed of in_stats_qual
690 * 2.0.2 - added input of text encodings
691 * 2.0.3 - base explicitly upon sequence #1.0.1
692 * 2.0.4 - change compression from izip to zip
693 * 2.0.5 - change from zip to delta_average_zip (planned: the table below is still #2.0.4 and uses zip_encoding)
694 */
695 table NCBI:tbl:phred_quality #2.0.4 = INSDC:tbl:sequence #1.0.1
696 {
697 // read directly quality as phred
698 INSDC:quality:phred out_qual_phred = .QUALITY;
699
700 // input rules
701 INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY; // QUALITY supplied as phred+33 text
702 INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY; // QUALITY supplied as phred+64 text
703
704 INSDC:quality:phred in_qual_phred
705 = QUALITY // QUALITY supplied directly as phred
706 | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 ) // or from phred_33 text via diff < 33 >
707 | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 ); // or from phred_64 text via diff < 64 >
708
709 // physical storage
710 /*** next line is for future change in production, but we have to wait until supporting code is released to the public ***/
711 // physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY = in_qual_phred;
712 /*** NB *** MUST change table version to 2.0.5 and propagate to all derived tables ***/
713 physical column < INSDC:quality:phred > zip_encoding .QUALITY = in_qual_phred; // currently active storage rule (zip_encoding)
714
715 // feed to compressed statistics
716 INSDC:quality:phred in_stats_qual = in_qual_phred;
717
718 /* INSDC:tbl:sequence inherited productions
719 * out_qual_text_phred_33
720 * out_qual_text_phred_64
721 */
722 };
723
724
725
726 /*--------------------------------------------------------------------------
727 * log_odds
728 * log-odds quality score support
729 *
730 * conversion from log-odds to phred is via formula
731 * 10 * log ( 1 + pow ( 10, x / 10 ) ) / log ( 10 ) + 0.499
732 * for x = -4..40 : when x = -5, phred = 0 (NOTE(review): the lookup tables below map -6 to 0 and -5 to 1 - confirm which is intended)
733 */
734
735 // the map function requires two lookup tables:
736 // the first table detects every legal value...
737 const INSDC:quality:log_odds NCBI:quality:from:log_odds = // 47 entries: every integer from -6 through 40
738 [
739 -6,-5,-4,-3,-2,-1, 0,
740 1, 2, 3, 4, 5, 6, 7, 8, 9,10,
741 11,12,13,14,15,16,17,18,19,20,
742 21,22,23,24,25,26,27,28,29,30,
743 31,32,33,34,35,36,37,38,39,40
744 ];
745
746 // ...the second table gives positional translations
747 const INSDC:quality:phred NCBI:quality:to:phred = // 47 entries, position-for-position with NCBI:quality:from:log_odds
748 [
749 0, 1, 1, 2, 2, 3, 3, // phred values for log-odds -6 .. 0
750 4, 4, 5, 5, 6, 7, 8, 9,10,10, // phred values for log-odds 1 .. 10
751 11,12,13,14,15,16,17,18,19,20, // from 11 upward the two tables hold the same numbers
752 21,22,23,24,25,26,27,28,29,30,
753 31,32,33,34,35,36,37,38,39,40
754 ];
755
756 function // schema-level function: converts log-odds quality to phred using the two lookup tables defined above it
757 INSDC:quality:phred NCBI:log_odds_to_phred #1 ( INSDC:quality:log_odds qual_log_odds )
758 {
759 // this range enforcement may not be required
760 INSDC:quality:log_odds log_odds_clip
761 = < INSDC:quality:log_odds > clip < -6, 40 > ( qual_log_odds ); // -6..40 is exactly the range covered by NCBI:quality:from:log_odds
762
763 // use the tables above to map from log-odds to phred
764 return < INSDC:quality:log_odds, INSDC:quality:phred >
765 map < NCBI:quality:from:log_odds, NCBI:quality:to:phred > ( log_odds_clip );
766 }
767
768 /* history:
769 * 1.0.1 - base explicitly upon sequence #1.0.1
770 */
771 table NCBI:tbl:log_odds_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1
772 {
773 /* READ - decoding
774 */
775 U8 read_ndecode // supplies the read_ndecode production named by NCBI:tbl:n_encoding
776 = < INSDC:quality:log_odds, U8 > map < -6, 4 > ( out_qual_log_odds, read_unpack ); // two-input map; presumably yields 4 where quality is -6 and the read_unpack value otherwise - TODO confirm
777
778 /* QUALITY
779 * declared in INSDC:tbl:sequence as phred
780 * introduce here as log-odds
781 */
782 extern column INSDC:quality:log_odds QUALITY = out_qual_log_odds;
783
784 // resolve for phred
785 INSDC:quality:phred out_qual_phred
786 = out_qual2_phred // first alternative: out_qual2_phred, which is not given a rule in this table
787 | NCBI:log_odds_to_phred ( out_qual_log_odds ); // otherwise convert log-odds with the schema function defined in this file
788
789 /* INSDC:tbl:sequence inherited productions
790 * out_qual_text_phred_33
791 * out_qual_text_phred_64
792 */
793
794 /* NCBI:tbl:n_encoding inherited productions
795 * read_unpack
796 */
797
798 /* NCBI:tbl:log_odds_quality_nocol productions
799 * out_qual2_phred
800 * out_qual_log_odds
801 */
802 };
803
804 /* history:
805 * 1.0.1 - base explicitly upon log_odds_quality_nocol #1.0.1
806 */
807 table NCBI:tbl:log_odds_quality #1.0.1 = NCBI:tbl:log_odds_quality_nocol #1.0.1
808 {
809 // read directly as n-encoded log_odds is compatible with log_odds
810 NCBI:quality:n_encoded:log_odds out_qual_log_odds = .QUALITY; // the only rule this table adds: log-odds quality is read from the physical .QUALITY column
811
812 /* INSDC:tbl:sequence inherited productions
813 * out_qual_text_phred_33
814 * out_qual_text_phred_64
815 */
816
817 /* NCBI:tbl:n_encoding inherited productions
818 * read_unpack
819 */
820
821 /* NCBI:tbl:log_odds_quality_nocol inherited productions
822 * out_qual2_phred
823 */
824 };
825
826 /* history:
827 * 2.0.1 - base explicitly upon sequence #1.0.1
828 * 2.1.0 - added production of in_qual_phred
829 */
830 table NCBI:tbl:log_odds_quality_nocol #2.1.0 = INSDC:tbl:sequence #1.0.1
831 {
832 /* QUALITY
833 * declared in INSDC:tbl:sequence as phred
834 * introduce here as log-odds
835 */
836 extern column INSDC:quality:log_odds QUALITY
837 = out_qual_log_odds;
838
839 // resolve for phred
840 INSDC:quality:phred in_qual_phred
841 = NCBI:log_odds_to_phred ( in_qual_log_odds ); // input side: in_qual_log_odds is not given a rule in this table (log_odds_quality #2.1.0 in this file defines it)
842
843 INSDC:quality:phred out_qual_phred
844 = NCBI:log_odds_to_phred ( out_qual_log_odds ); // output side: same conversion applied to out_qual_log_odds
845
846
847 /* INSDC:tbl:sequence inherited productions
848 * out_qual_text_phred_33
849 * out_qual_text_phred_64
850 */
851
852 /* NCBI:tbl:log_odds_quality_nocol productions
853 * out_qual_log_odds
854 */
855 };
856
857 /* history:
858 * 2.0.1 - added feed of in_stats_qual
859 * 2.0.2 - added input of text encodings
860 * 2.0.3 - base explicitly upon log_odds_quality_nocol #2.0.1
861 * 2.0.4 - changed compression from izip to zip
862 * 2.1.0 - base explicitly upon log_odds_quality_nocol #2.1.0
863 */
864 table NCBI:tbl:log_odds_quality #2.1.0 = NCBI:tbl:log_odds_quality_nocol #2.1.0
865 {
866 INSDC:quality:log_odds out_qual_log_odds= .QUALITY; // log-odds quality is read from the physical .QUALITY column
867
868 extern column INSDC:quality:text:log_odds_64 QUALITY // QUALITY also offered as log_odds_64 text
869 = out_qual_text_log_odds_64 // first alternative: out_qual_text_log_odds_64, which is not given a rule in this table
870 | ( INSDC:quality:text:log_odds_64 ) < B8 > sum < 64 > ( out_qual_log_odds ); // otherwise derived from the binary values via sum < 64 >
871
872 // input rules
873 INSDC:quality:text:log_odds_64 in_qual_text_log_odds_64 = QUALITY; // QUALITY supplied as log_odds_64 text
874
875 INSDC:quality:log_odds in_qual_log_odds
876 = QUALITY // QUALITY supplied directly as log-odds
877 | ( INSDC:quality:log_odds ) < B8 > diff < 64 > ( in_qual_text_log_odds_64 ); // or from the text form via diff < 64 >
878
879 physical column < INSDC:quality:log_odds > zip_encoding .QUALITY
880 = in_qual_log_odds;
881
882 // feed to compressed statistics
883 INSDC:quality:log_odds in_stats_qual = in_qual_log_odds;
884
885
886 /* INSDC:tbl:sequence inherited productions
887 * out_qual_text_phred_33
888 * out_qual_text_phred_64
889 */
890
891 /* NCBI:tbl:log_odds_quality productions
892 * out_qual_text_log_odds_64
893 */
894 };