Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/align/align.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * VDB Alignment types, functions and tables | |
29 */ | |
30 version 1; | |
31 | |
32 include 'vdb/vdb.vschema'; | |
33 include 'ncbi/seq.vschema'; | |
34 include 'ncbi/sra.vschema'; | |
35 include 'ncbi/stats.vschema'; | |
36 include 'align/seq.vschema'; | |
37 include 'align/qstat.vschema'; | |
38 include 'sra/abi.vschema'; | |
39 include 'align/mate-cache.vschema'; | |
40 | |
41 | |
42 /*-------------------------------------------------------------------------- | |
43 * data types | |
44 */ | |
45 | |
46 /* ploidy | |
47 * the number of sets of chromosomes in a cell (element type of the PLOIDY column in align_cmn) | |
48 */ | |
49 typedef U32 NCBI:align:ploidy; | |
50 | |
51 /* ro_type | |
52 * the type of event causing ref-offset (element type of the REF_OFFSET_TYPE column in align_cmn) | |
53 */ | |
54 typedef U8 NCBI:align:ro_type; | |
55 | |
56 const NCBI:align:ro_type NCBI:align:ro_normal = 0; // normal ref-offset | |
57 const NCBI:align:ro_type NCBI:align:ro_soft_clip = 1; // soft-clipping | |
58 const NCBI:align:ro_type NCBI:align:ro_intron_plus = 2; // intron on positive strand | |
59 const NCBI:align:ro_type NCBI:align:ro_intron_minus = 3; // intron on negative strand | |
60 const NCBI:align:ro_type NCBI:align:ro_intron_unknown = 4; // intron strand not specified | |
61 const NCBI:align:ro_type NCBI:align:ro_complete_genomics = 5; // presumably an offset specific to Complete Genomics data -- TODO confirm | |
62 | |
63 | |
64 /*-------------------------------------------------------------------------- | |
65 * functions | |
66 */ | |
67 | |
68 | |
69 /* cigar | |
70 * construct "cigar" alignment string or length arrays | |
71 * | |
72 * "ctype" [ CONST ] - select variant of format | |
73 * 0 => both matches and mismatches represented as M | |
74 * 1 => matches represented as '=' mismatches as 'X' | |
75 * | |
76 * "has_mismatch" [ DATA ] - a boolean for each base in aligned sequence | |
77 * where a value of false means the base matches the reference | |
78 * | |
79 * "has_ref_offset" [ DATA ] - a boolean for each base in the aligned sequence | |
80 * where a value of true means there is a corresponding offset to position on reference | |
81 * | |
82 * "ref_offset" [ DATA ] - a packed sequence of signed offsets to aligned position | |
83 * one entry for every true in "has_ref_offset" | |
84 * | |
85 * "read_len" [ DATA ] - v2: elem_count defines PLOIDY and values are an actual length of reads in spot | |
86 * "ref_len" [ DATA, OPTIONAL ] - follows the '*' in both versions; "ref_offset_type" [ DATA, OPTIONAL ] - v2 only */ | |
87 extern function | |
88 ascii NCBI:align:cigar #1 < U8 ctype > ( bool has_mismatch, bool has_ref_offset, | |
89 I32 ref_offset, * INSDC:coord:len ref_len ) = ALIGN:cigar; | |
90 | |
91 /* history: | |
92 * 2.1 - added "ref_offset_type" optional parameter | |
93 * NB - reverting to 2.0 due to linker bug in older code (the declaration below is #2.0 and still lists ref_offset_type after the '*') | |
94 */ | |
95 extern function < type T > | |
96 T NCBI:align:cigar #2.0 < U8 ctype > ( bool has_mismatch, bool has_ref_offset, | |
97 I32 ref_offset, INSDC:coord:len read_len, * INSDC:coord:len ref_len, NCBI:align:ro_type ref_offset_type ) | |
98 = ALIGN:cigar_2; | |
99 | |
100 extern function U32 NCBI:align:edit_distance #1 /* v1: takes only the mismatch/offset flags and the packed offsets */ | |
101 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset ); | |
102 | |
103 extern function U32 NCBI:align:edit_distance #2 /* v2: adds ref_len and, after the '*', an optional read_len */ | |
104 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len, *INSDC:coord:len read_len) | |
105 = NCBI:align:edit_distance_2; | |
106 | |
107 extern function U32 NCBI:align:edit_distance #3 /* v3: takes ref_offset_type instead of ref_len; read_len is no longer optional */ | |
108 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len) | |
109 = NCBI:align:edit_distance_3; | |
110 | |
111 /* rna_orientation | |
112 * reads column REF_OFFSET_TYPE | |
113 * returns '+' if has: | |
114 * at least one NCBI:align:ro_intron_plus | |
115 * none of NCBI:align:ro_intron_minus | |
116 * returns '-' if has: | |
117 * at least one NCBI:align:ro_intron_minus | |
118 * none of NCBI:align:ro_intron_plus | |
119 * returns empty string otherwise (both intron types present, or neither) | |
120 */ | |
121 extern function | |
122 ascii NCBI:align:rna_orientation #1 ( NCBI:align:ro_type ref_offset_type ); | |
123 | |
124 /* project_from_sequence | |
125 * projects column from SEQUENCE | |
126 * | |
127 * "T" [ TYPE ] | |
128 * | |
129 * "col" [ CONST ] - ascii column expression to project; align_cmn passes e.g. 'READ_FILTER' and '( INSDC:quality:phred ) QUALITY' | |
130 * "use_read_len" [ CONST ] whether subset by read_len or by read_id only (NOTE: documented here but absent from the #1 signature below) | |
131 * | |
132 * "seq_spot_id" [ DATA ] | |
133 * | |
134 * "seq_read_id" [ DATA ] | |
135 */ | |
136 extern function < type T > | |
137 T NCBI:align:project_from_sequence #1 < ascii col> ( I64 seq_spot_id, INSDC:coord:one seq_read_id ) | |
138 = ALIGN:project_from_sequence; | |
139 | |
140 | |
141 /* align_restore_read | |
142 * restores read by applying alignment-based difference to ref_read | |
143 * | |
144 * "ref_read" [ DATA ] | |
145 * | |
146 * "has_mismatch" [ DATA ] and "mismatch" [ DATA ] | |
147 * | |
148 * "has_ref_offset" [ DATA ] and "ref_offset" [ DATA ] | |
149 * "read_len" [ DATA, OPTIONAL ] - follows the '*' in the signature */ | |
150 extern function | |
151 INSDC:4na:bin NCBI:align:align_restore_read #1 ( INSDC:4na:bin ref_read, bool has_mismatch, | |
152 INSDC:4na:bin mismatch, bool has_ref_offset, I32 ref_offset * INSDC:coord:len read_len) | |
153 = ALIGN:align_restore_read; | |
154 | |
155 | |
156 /* raw_restore_read | |
157 * restores read by applying alignment-based difference to align_read | |
158 * | |
159 * "align_read" [ DATA ] | |
160 * | |
161 * "ref_orientation" [ DATA ] - bool; ref_block_cmn documents REF_ORIENTATION as false -> same orientation, true -> opposite | |
162 */ | |
163 extern function | |
164 INSDC:4na:bin NCBI:align:raw_restore_read #1 ( INSDC:4na:bin align_read, bool ref_orientation ) | |
165 = ALIGN:raw_restore_read; | |
166 | |
167 | |
168 /* raw_restore_qual | |
169 * restores quality by applying alignment-based difference to align_qual | |
170 * | |
171 * "align_qual" [ DATA ] | |
172 * | |
173 * "ref_orientation" [ DATA ] - bool; align_cmn passes .REF_ORIENTATION; unlike raw_restore_read, no '= ALIGN:...' name is given below | |
174 */ | |
175 extern function | |
176 INSDC:quality:phred NCBI:align:raw_restore_qual #1 ( INSDC:quality:phred align_qual, bool ref_orientation ); | |
177 | |
178 | |
179 /* ref_sub_select | |
180 * projects reference from sequence | |
181 * | |
182 * "id" [ DATA ] - align_cmn passes out_ref_id here | |
183 * | |
184 * "start" [ DATA ] and "len" [ DATA ] | |
185 * | |
186 * "ref_ploidy" [ DATA, OPTIONAL ] - follows the '*' in the signature; align_cmn passes .REF_PLOIDY in its first alternative | |
187 */ | |
188 extern function | |
189 INSDC:4na:bin NCBI:align:ref_sub_select #1 ( I64 id, INSDC:coord:zero start, | |
190 INSDC:coord:len len * U32 ref_ploidy) | |
191 = ALIGN:ref_sub_select; | |
192 | |
193 | |
194 /* ref_restore_read | |
195 * restores read from central storage | |
196 * | |
197 * "cmp_rd" [ DATA ] | |
198 * | |
199 * "seq_id" [ DATA ] | |
200 * | |
201 * "seq_start" [ DATA ] and "seq_len" [ DATA ] - note seq_start is typed INSDC:coord:one, whereas ref_sub_select takes an INSDC:coord:zero start | |
202 */ | |
203 extern function | |
204 INSDC:4na:bin NCBI:align:ref_restore_read #1 ( INSDC:4na:bin cmp_rd, ascii seq_id, | |
205 INSDC:coord:one seq_start, INSDC:coord:len seq_len) | |
206 = ALIGN:ref_restore_read; | |
207 | |
208 | |
209 /* seq_restore_read | |
210 * projects read from align_deflate table to SEQUENCE | |
211 * | |
212 * "cmp_rd" [ DATA ] | |
213 * | |
214 * "align_id" [ DATA ] | |
215 * | |
216 * "read_len" [ DATA ] | |
217 * | |
218 * "rd_type" [ DATA ] - typed INSDC:SRA:xread_type | |
219 */ | |
220 extern function | |
221 INSDC:4na:bin NCBI:align:seq_restore_read #1 ( INSDC:4na:bin cmp_rd, I64 align_id, | |
222 INSDC:coord:len read_len, INSDC:SRA:xread_type rd_type ) | |
223 = ALIGN:seq_restore_read; | |
224 | |
225 | |
226 /* seq_restore_linkage_group | |
227 * projects LINKAGE_GROUP from PRIMARY_ALIGNMENT table to SEQUENCE | |
228 * | |
229 * "cmp_linkage_group" [ DATA ] | |
230 * | |
231 * "align_id" [ DATA ] - I64, same type as the "align_id" of seq_restore_read | |
232 */ | |
233 extern function | |
234 ascii NCBI:align:seq_restore_linkage_group #1 ( ascii cmp_linkage_group, | |
235 I64 align_id ) | |
236 = ALIGN:seq_restore_linkage_group; | |
237 | |
238 | |
239 /* generate_has_mismatch | |
240 * generates has mismatch by doing actual compare of reference and subject, | |
241 * "ref_offset" values move the comparison position along the reference | |
242 * | |
243 * "reference" [ DATA ] | |
244 * | |
245 * "subject" [ DATA ] | |
246 * | |
247 * "has_ref_offset" [ DATA ] | |
248 * | |
249 * "ref_offset" [ DATA ] | |
250 */ | |
251 extern function | |
252 bool NCBI:align:generate_has_mismatch #1 ( INSDC:4na:bin reference, | |
253 INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset) | |
254 = ALIGN:generate_has_mismatch; | |
255 | |
256 | |
257 /* generate_mismatch | |
258 * generates mismatch ( INSDC:4na:bin ) from the same four inputs as generate_has_mismatch | |
259 * "reference" [ DATA ] | |
260 * | |
261 * "subject" [ DATA ] | |
262 * | |
263 * "has_ref_offset" [ DATA ] | |
264 * | |
265 * "ref_offset" [ DATA ] | |
266 */ | |
267 extern function | |
268 INSDC:4na:bin NCBI:align:generate_mismatch #1 ( INSDC:4na:bin reference, | |
269 INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset ) | |
270 = ALIGN:generate_mismatch; | |
271 | |
272 | |
273 /* ref_pos | |
274 * retrieves the alignment's positions on the reference | |
275 * one per PLOIDY | |
276 * | |
277 * "ref_id" [ DATA ] - ref_block_cmn passes out_ref_id | |
278 * | |
279 * "ref_start" [ DATA ] - one per PLOIDY | |
280 */ | |
281 extern function | |
282 INSDC:coord:zero NCBI:align:ref_pos #1 ( I64 ref_id, INSDC:coord:zero ref_start ); | |
283 | |
284 | |
285 /* ref_name | |
286 * retrieve the name from the reference | |
287 * | |
288 * "ref_id" [ DATA ] - ref_block_cmn describes REF_ID as the rowid in the reference table; align_cmn also passes out_mate_ref_id | |
289 */ | |
290 extern function | |
291 ascii NCBI:align:ref_name #1 ( I64 ref_id ); | |
292 | |
293 | |
294 /* ref_seq_id | |
295 * retrieve the seq_id from the reference | |
296 * | |
297 * "ref_id" [ DATA ] - same kind of id as for ref_name | |
298 */ | |
299 extern function | |
300 ascii NCBI:align:ref_seq_id #1 ( I64 ref_id ); | |
301 | |
302 | |
303 /* local_ref_id | |
304 * convert global ref_start into ref_id ( I64 ); used by global_ref_block for out_ref_id | |
305 */ | |
306 extern function | |
307 I64 NCBI:align:local_ref_id #1 ( U64 global_ref_start ); | |
308 | |
309 | |
310 /* local_ref_start | |
311 * convert global ref_start into local ref_start ( INSDC:coord:zero ); used by global_ref_block for out_ref_start | |
312 */ | |
313 extern function | |
314 INSDC:coord:zero NCBI:align:local_ref_start #1 ( U64 global_ref_start ); | |
315 | |
316 /* not_my_row | |
317 * removes current row_id from the list ( I64 list in, I64 list out ) | |
318 */ | |
319 extern function I64 NCBI:align:not_my_row #1 ( I64 list ); | |
320 | |
321 /* template_len | |
322 * compute template length, i.e. the distance from the left-most to the | |
323 * right-most matching reference position; the result is a signed I32 | |
324 */ | |
325 extern function I32 NCBI:align:template_len #1 ( | |
326 INSDC:coord:zero pos, INSDC:coord:zero mate_pos, | |
327 INSDC:coord:len reflen, INSDC:coord:len mate_reflen, | |
328 ascii ref_name, ascii mate_ref_name, INSDC:coord:one read_id); | |
329 | |
330 /* get_sam_flags | |
331 * compute the flags that would be in a SAM file | |
332 * | |
333 * version 1 works with full Alignment databases (first argument: read_len). | |
334 * version 2 works with Alignment databases that have had SEQUENCE removed (first argument: mate_id). | |
335 * "filter" follows the '*' in both versions, i.e. it is optional */ | |
336 extern function U32 NCBI:align:get_sam_flags #1 ( | |
337 INSDC:coord:len read_len, INSDC:coord:one read_id, I32 template_len, | |
338 bool strand, bool mate_strand, bool is_secondary, * INSDC:SRA:read_filter filter); | |
339 | |
340 extern function U32 NCBI:align:get_sam_flags #2 ( | |
341 I64 mate_id, INSDC:coord:one read_id, I32 template_len, | |
342 bool strand, bool mate_strand, bool is_secondary, * INSDC:SRA:read_filter filter) | |
343 = NCBI:align:get_sam_flags_2; | |
344 | |
345 /* get_left_soft_clip | |
346 * compute the length of the soft clip on the left edge of the alignment; #2 additionally takes read_len | |
347 */ | |
348 extern function INSDC:coord:len NCBI:align:get_left_soft_clip #1 | |
349 ( bool has_ref_offset, I32 ref_offset ); | |
350 | |
351 extern function INSDC:coord:len NCBI:align:get_left_soft_clip #2 | |
352 ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len read_len ) | |
353 = NCBI:align:get_left_soft_clip_2; | |
354 | |
355 /* get_right_soft_clip | |
356 * compute the length of the soft clip on the right edge of the alignment; align_cmn lists #5, #4, #3 and #2 as alternatives, in that order | |
357 */ | |
358 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #1 | |
359 ( bool has_mismatch, INSDC:coord:len left_clip * bool has_ref_offset ); | |
360 | |
361 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #2 | |
362 ( bool has_mismatch, INSDC:coord:len left_clip, bool has_ref_offset, I32 ref_offset ) | |
363 = NCBI:align:get_right_soft_clip_2; | |
364 | |
365 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #3 | |
366 ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len ) | |
367 = NCBI:align:get_right_soft_clip_3; | |
368 | |
369 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #4 | |
370 ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len read_len, INSDC:coord:len ref_len ) | |
371 = NCBI:align:get_right_soft_clip_4; | |
372 | |
373 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #5 | |
374 ( bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len ) | |
375 = NCBI:align:get_right_soft_clip_5; | |
376 | |
377 /* get_clipped_cigar | |
378 * compute the CIGAR string with the soft clipping removed; #2 is templated on T, and align_cmn instantiates it with ascii and with INSDC:coord:len | |
379 */ | |
380 extern function ascii NCBI:align:get_clipped_cigar #1 ( ascii cigar ); | |
381 | |
382 extern function < type T > | |
383 T NCBI:align:get_clipped_cigar #2 ( ascii cigar, INSDC:coord:len cigar_len ) = NCBI:align:get_clipped_cigar_2; | |
384 | |
385 /* get_clipped_ref_offset | |
386 * compute the reference offsets with the soft clipping removed; align_cmn uses it for CLIPPED_REF_OFFSET | |
387 */ | |
388 extern function I32 NCBI:align:get_clipped_ref_offset #1 | |
389 ( bool has_ref_offset, I32 ref_offset ); | |
390 | |
391 /* clip | |
392 * remove the soft clipped bases (or qualities, or has_mismatch, et cetera) | |
393 * works with things whose lengths are the same as SEQUENCE.READ; #2 additionally takes read_len | |
394 */ | |
395 extern function < type T > T NCBI:align:clip #1 | |
396 ( T object, INSDC:coord:len left_clip, INSDC:coord:len right_clip); | |
397 | |
398 extern function < type T > T NCBI:align:clip #2 | |
399 ( T object, INSDC:coord:len read_len, INSDC:coord:len left_clip, INSDC:coord:len right_clip) | |
400 = NCBI:align:clip_2; | |
401 | |
402 /* get_ref_len | |
403 * compute reference length from alignment information; right_clip follows the '*' in #1, i.e. it is optional | |
404 */ | |
405 extern function INSDC:coord:len NCBI:align:get_ref_len #1 | |
406 ( bool has_ref_offset, I32 ref_offset, * INSDC:coord:len right_clip ); | |
407 | |
408 extern function INSDC:coord:len NCBI:align:get_ref_len_2 #2 /* NB: the declared name itself ends in _2; ref_block_cmn calls it as NCBI:align:get_ref_len_2 */ | |
409 ( bool has_ref_offset, I32 ref_offset) | |
410 = NCBI:align:get_ref_len_2; | |
411 | |
412 | |
413 /* get_mismatch_read | |
414 * generate the READ with matching bases replaced with '='; align_cmn uses it for MISMATCH_READ | |
415 */ | |
416 extern function ascii NCBI:align:get_mismatch_read #1 | |
417 ( bool has_mismatch, INSDC:dna:text mismatch ); | |
418 | |
419 /* get_ref_mismatch | |
420 * shows mismatch positions in reference space (NB: declared without "extern", like get_ref_insert and get_ref_delete) | |
421 */ | |
422 function bool NCBI:align:get_ref_mismatch #1 | |
423 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, | |
424 INSDC:coord:len ref_len ); | |
425 | |
426 /* get_ref_insert | |
427 * shows positions of inserts in reference space | |
428 * i.e. an insert occurs between each pair of true's (same parameter list as get_ref_mismatch) | |
429 */ | |
430 function bool NCBI:align:get_ref_insert #1 | |
431 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, | |
432 INSDC:coord:len ref_len ); | |
433 | |
434 /* get_ref_delete | |
435 * shows positions of deleted bases in reference space (same parameter list as get_ref_mismatch) | |
436 */ | |
437 function bool NCBI:align:get_ref_delete #1 | |
438 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, | |
439 INSDC:coord:len ref_len ); | |
440 | |
441 extern function INSDC:quality:phred NCBI:align:compress_quality #1 /* inputs: quality and a bool "preserved" */ | |
442 ( INSDC:quality:phred quality, bool preserved ); | |
443 | |
444 extern function INSDC:quality:phred NCBI:align:decompress_quality #1 /* restored_qual_value is an angle-bracket (constant) parameter, like "ctype" of cigar */ | |
445 < INSDC:quality:phred restored_qual_value > | |
446 ( INSDC:quality:phred cmp_quality, bool preserved ); | |
447 | |
448 /* make_read_start | |
449 * takes read_len ( INSDC:coord:len ) and yields INSDC:coord:zero values | |
450 */ | |
451 extern function INSDC:coord:zero NCBI:align:make_read_start #1 | |
452 (INSDC:coord:len read_len); | |
453 | |
454 /* make_cmp_read_desc | |
455 * determines whether an element of "operand" is aligned | |
456 * by looking at the corresponding element of "align_id" | |
457 * | |
458 * zeros out unaligned elements of operand, unless "invert" is true, | |
459 * in which case it zeros out aligned elements. | |
460 * | |
461 * "T" [ TYPE ] - type of operand (also the result type) | |
462 * | |
463 * "invert" [ CONST ] - if true, invert the logic of which elements | |
464 * to zero out. | |
465 * | |
466 * "operand" [ DATA ] - uncompressed data | |
467 * | |
468 * "align_id" [ DATA ] - indication of alignment (I64; presumably 0 means unaligned -- TODO confirm) | |
469 */ | |
470 extern function < type T > | |
471 T NCBI:align:make_cmp_read_desc #1 <bool invert>(T operand, I64 align_id); | |
472 | |
473 /* seq_construct_read | |
474 * assembles read from aligned and unaligned parts; T is the shared type of both parts and of the result | |
475 */ | |
476 extern function < type T > | |
477 T NCBI:align:seq_construct_read #1 ( | |
478 T aligned, INSDC:coord:len aligned_read_len, | |
479 T unaligned, INSDC:coord:len unaligned_read_len ); | |
480 | |
481 extern function I64 NCBI:align:get_mate_align_id #1 ( I64 spot_id ); /* judging by the name, maps a spot id to the mate's alignment id -- TODO confirm */ | |
482 | |
483 /*-------------------------------------------------------------------------- | |
484 * tables | |
485 */ | |
486 | |
487 | |
488 /* ref_block_cmn | |
489 * common implementation ancestor for reference block; out_ref_id, out_ref_start, out_global_ref_start, out_has_ref_offset and out_ref_offset are used here but defined only by inheriting tables | |
490 */ | |
491 table NCBI:align:tbl:ref_block_cmn #1.0.0 | |
492 { | |
493 readonly column ascii REF_TABLE /* reference table name: metadata node "CONFIG/REF_TABLE", with the literal 'REFERENCE' as alternative */ | |
494 = < ascii > meta:read < "CONFIG/REF_TABLE" > () | |
495 | < ascii > echo < 'REFERENCE' > (); | |
496 | |
497 // REF_ID is rowid in Reference Table REF_TABLE | |
498 extern column I64 REF_ID | |
499 = out_ref_id; | |
500 | |
501 // this is a redefinition of REF_START | |
502 // REF_START is the offset within REFERENCE.READ | |
503 extern column INSDC:coord:zero REF_START | |
504 = out_ref_start; | |
505 | |
506 // global REF_START | |
507 extern column U64 GLOBAL_REF_START | |
508 = out_global_ref_start; | |
509 | |
510 // REF_LEN the length of a read projection on reference; out_ref_len_internal derives it from the ref-offset data, out_ref_len lists the stored .REF_LEN first | |
511 INSDC:coord:len out_ref_len_internal | |
512 = NCBI:align:get_ref_len_2 ( out_has_ref_offset, out_ref_offset ) | |
513 | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset ); | |
514 | |
515 INSDC:coord:len out_ref_len | |
516 = .REF_LEN | |
517 /* | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset, out_right_clip ) */ | |
518 | out_ref_len_internal; | |
519 | |
520 physical column < INSDC:coord:len > izip_encoding .REF_LEN = REF_LEN; | |
521 extern column INSDC:coord:len REF_LEN = out_ref_len; | |
522 | |
523 // REF_ORIENTATION - relative orientation of original raw read to the reference | |
524 // false -> same orientation, true -> opposite orientation | |
525 // alignment and reference are always in the same orientation | |
526 extern column bool_encoding REF_ORIENTATION; | |
527 | |
528 // REF_PLOIDY - U32 values with izip_encoding | |
529 extern column < U32 > izip_encoding REF_PLOIDY; | |
530 | |
531 /* REF_POS | |
532 * per PLOIDY | |
533 */ | |
534 readonly column INSDC:coord:zero REF_POS | |
535 = NCBI:align:ref_pos ( out_ref_id, out_ref_start ); | |
536 | |
537 /* REF_NAME | |
538 * the name of the reference | |
539 */ | |
540 readonly column ascii REF_NAME | |
541 = NCBI:align:ref_name ( out_ref_id ); | |
542 | |
543 /* REF_SEQ_ID - seq_id from the reference, with an empty-string echo listed as alternative | |
544 */ | |
545 readonly column ascii REF_SEQ_ID | |
546 = NCBI:align:ref_seq_id ( out_ref_id ) | |
547 | < ascii > echo < '' > (); | |
548 }; | |
549 | |
550 | |
551 /* global_ref_block | |
552 * reference block favoring global ref-start: stores GLOBAL_REF_START physically and derives out_ref_id / out_ref_start from it via local_ref_id / local_ref_start | |
553 */ | |
554 table NCBI:align:tbl:global_ref_block #1.0.0 | |
555 = NCBI:align:tbl:ref_block_cmn #1.0.0 | |
556 { | |
557 U64 out_global_ref_start = .GLOBAL_REF_START; | |
558 physical < U64 > izip_encoding .GLOBAL_REF_START = GLOBAL_REF_START; | |
559 | |
560 I64 out_ref_id = NCBI:align:local_ref_id ( .GLOBAL_REF_START ); | |
561 INSDC:coord:zero out_ref_start = NCBI:align:local_ref_start ( .GLOBAL_REF_START ); | |
562 }; | |
563 | |
564 | |
565 /* local_ref_block | |
566 * reference block favoring local ref-start: stores REF_ID and REF_START physically; it does not define out_global_ref_start, which the GLOBAL_REF_START column of ref_block_cmn reads | |
567 */ | |
568 table NCBI:align:tbl:local_ref_block #1.0.0 | |
569 = NCBI:align:tbl:ref_block_cmn #1.0.0 | |
570 { | |
571 I64 out_ref_id = .REF_ID; | |
572 physical < I64 > izip_encoding .REF_ID = REF_ID; | |
573 | |
574 INSDC:coord:zero out_ref_start = .REF_START; | |
575 physical < INSDC:coord:zero > izip_encoding .REF_START = REF_START; | |
576 }; | |
577 | |
578 | |
579 /* align_cmn | |
580 * common interface and implementation for alignment object | |
581 * | |
582 * History: | |
583 * 2.1 - added REF_OFFSET_TYPE and RNA_ORIENTATION columns | |
584 * updated all cigar calculations | |
585 */ | |
586 table NCBI:align:tbl:align_cmn #2.1 | |
587 = NCBI:tbl:base_space_common #1.0.3 | |
588 , NCBI:SRA:tbl:stats #1.2.0 | |
589 , NCBI:align:tbl:ref_block_cmn #1.0.0 | |
590 { | |
591 bool is_secondary = out_is_secondary; | |
592 // temporary key | |
593 extern column < U32 > izip_encoding TMP_KEY_ID; | |
594 | |
595 extern column <ascii> zip_encoding LINKAGE_GROUP; | |
596 | |
597 | |
598 /* Raw Sequence Block */ | |
599 // Points to sequence table, which may contain more information about the raw sequence. | |
600 // row id in SEQUENCE table; 0 if not linked | |
601 extern column < I64 > izip_encoding SEQ_SPOT_ID; | |
602 | |
603 // read number in SEQUENCE table; { SEQ_SPOT_ID, SEQ_READ_ID } is the unique link to the sequence | |
604 extern column < INSDC:coord:one > izip_encoding SEQ_READ_ID; | |
605 | |
606 | |
607 /* Soft-Clipped data block */ | |
608 | |
609 readonly column INSDC:coord:len LEFT_SOFT_CLIP | |
610 = NCBI:align:get_left_soft_clip ( HAS_REF_OFFSET, REF_OFFSET, out_read_len ); | |
611 | |
612 INSDC:coord:len out_right_clip | |
613 = NCBI:align:get_right_soft_clip #5 ( out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len ) | |
614 | NCBI:align:get_right_soft_clip #4 ( out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len ) | |
615 | NCBI:align:get_right_soft_clip #3 ( out_has_ref_offset, out_ref_offset, out_ref_len ) | |
616 | NCBI:align:get_right_soft_clip #2 ( out_has_mismatch, LEFT_SOFT_CLIP, out_has_ref_offset, out_ref_offset ); | |
617 readonly column INSDC:coord:len RIGHT_SOFT_CLIP = out_right_clip; | |
618 | |
619 readonly column ascii CLIPPED_CIGAR_LONG | |
620 = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN ); | |
621 | |
622 readonly column INSDC:coord:len CLIPPED_CIGAR_LONG_LEN | |
623 = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN ); | |
624 | |
625 readonly column ascii CLIPPED_CIGAR_SHORT | |
626 = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN ); | |
627 | |
628 readonly column INSDC:coord:len CLIPPED_CIGAR_SHORT_LEN | |
629 = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN ); | |
630 | |
631 bool out_clipped_has_mismatch | |
632 = < bool > NCBI:align:clip (out_has_mismatch, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
633 | |
634 readonly column ascii CLIPPED_HAS_MISMATCH | |
635 = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_mismatch ); | |
636 | |
637 readonly column bool CLIPPED_HAS_MISMATCH = out_clipped_has_mismatch; | |
638 | |
639 bool out_clipped_has_ref_offset | |
640 = < bool > NCBI:align:clip (HAS_REF_OFFSET, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
641 | |
642 readonly column ascii CLIPPED_HAS_REF_OFFSET | |
643 = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_ref_offset ); | |
644 | |
645 readonly column bool CLIPPED_HAS_REF_OFFSET = out_clipped_has_ref_offset; | |
646 | |
647 // TBD cannot be computed right unless HAS_MISMATCH and! READ_LEN is used | |
648 readonly column INSDC:dna:text CLIPPED_MISMATCH | |
649 = < INSDC:dna:text > NCBI:align:clip #1 ( out_mismatch_dna_text, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
650 | |
651 readonly column I32 CLIPPED_REF_OFFSET | |
652 = NCBI:align:get_clipped_ref_offset ( HAS_REF_OFFSET, REF_OFFSET ); | |
653 | |
654 readonly column INSDC:quality:phred CLIPPED_QUALITY | |
655 = < INSDC:quality:phred > NCBI:align:clip (out_qual_phred, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
656 | |
657 readonly column INSDC:dna:text CLIPPED_READ | |
658 = < INSDC:dna:text > NCBI:align:clip (READ, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
659 | |
660 /* Sequence Block */ | |
661 | |
662 extern column < NCBI:align:ploidy > izip_encoding PLOIDY; | |
663 | |
664 // Number of reads per spot; corresponds to the number of alternative alignments | |
665 // all alternative alignments are computed against the same reference region | |
666 U32 out_nreads | |
667 = .PLOIDY | |
668 | < U32 > echo < 1 > (); | |
669 | |
670 // READ_START and READ_LEN are position and length of the sequence | |
671 physical < INSDC:coord:zero > izip_encoding .READ_START = READ_START; | |
672 INSDC:coord:zero out_read_start | |
673 = .READ_START | |
674 | < INSDC:coord:zero > echo < 0 > (); | |
675 | |
676 physical < INSDC:coord:len > izip_encoding .READ_LEN = READ_LEN; | |
677 | |
678 INSDC:coord:len align_spot_len = ( INSDC:coord:len ) row_len ( out_has_ref_offset ); | |
679 INSDC:coord:len out_read_len | |
680 = .READ_LEN | |
681 | align_spot_len; | |
682 | |
683 // associated qualities | |
684 extern column INSDC:quality:phred CMP_QUALITY | |
685 = .CMP_QUALITY | |
686 | out_cmp_quality; | |
687 physical column < INSDC:quality:phred > zip_encoding .CMP_QUALITY = CMP_QUALITY; | |
688 | |
689 INSDC:quality:phred out_raw_qual = < INSDC:quality:phred > | |
690 NCBI:align:project_from_sequence < '( INSDC:quality:phred ) QUALITY'> ( .SEQ_SPOT_ID, .SEQ_READ_ID ); | |
691 INSDC:quality:phred out_qual_phred | |
692 = NCBI:align:raw_restore_qual ( out_raw_qual, .REF_ORIENTATION ) | |
693 | < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); | |
694 readonly column INSDC:quality:text:phred_33 SAM_QUALITY = QUALITY ; | |
695 | |
696 // project read group and name | |
697 ascii out_spot_group = < ascii > simple_sub_select < 'SEQUENCE','SPOT_GROUP'> (.SEQ_SPOT_ID); | |
698 | |
699 | |
700 INSDC:SRA:spotid_t tmp_seq_spot_id | |
701 = cast ( .SEQ_SPOT_ID ) | |
702 ; | |
703 physical <ascii> zip_encoding .SEQ_NAME = SEQ_NAME; | |
704 extern column ascii SEQ_NAME | |
705 = .SEQ_NAME | |
706 | < ascii > simple_sub_select < 'SEQUENCE','NAME'> (.SEQ_SPOT_ID) | |
707 | sprintf < "%u" > ( tmp_seq_spot_id ); | |
708 | |
709 // compute sam flags | |
710 /* blows up parser: starts at schema-tbl.c:2138 | |
711 readonly column U32 SAM_FLAGS = NCBI:align:get_sam_flags(MATE_ALIGN_ID, | |
712 .SEQ_READ_ID, out_template_len, REF_ORIENTATION, | |
713 out_mate_ref_orientation, is_secondary); | |
714 */ | |
715 INSDC:coord:len projected_read_len | |
716 = < INSDC:coord:len > simple_sub_select < 'SEQUENCE', 'READ_LEN' > ( .SEQ_SPOT_ID ); | |
717 | |
718 readonly column U32 SAM_FLAGS | |
719 = NCBI:align:get_sam_flags #1 (projected_read_len, | |
720 .SEQ_READ_ID, out_template_len, REF_ORIENTATION, | |
721 out_mate_ref_orientation, is_secondary, out_rd_filter) | |
722 | NCBI:align:get_sam_flags #2 (out_mate_align_id, | |
723 .SEQ_READ_ID, out_template_len, REF_ORIENTATION, | |
724 out_mate_ref_orientation, is_secondary, out_rd_filter); | |
725 | |
726 ascii out_name_fmt = < ascii > echo < '$R' > (); | |
727 | |
728 INSDC:coord:zero trim_start | |
729 = < INSDC:coord:zero > echo < 0 > (); | |
730 INSDC:coord:len trim_len | |
731 = align_spot_len; | |
732 | |
733 ascii out_label | |
734 = .LABEL | |
735 | < ascii > echo < "ploidy1" > (); | |
736 INSDC:coord:zero out_label_start | |
737 = .LABEL_START | |
738 | < INSDC:coord:zero > echo < 0 > (); | |
739 INSDC:coord:len out_label_len | |
740 = .LABEL_LEN | |
741 | < INSDC:coord:len > echo < 7 > (); | |
742 | |
743 physical < INSDC:SRA:read_filter > zip_encoding .RD_FILTER = READ_FILTER; | |
744 INSDC:SRA:read_filter out_rd_filter | |
745 = .RD_FILTER | |
746 | < INSDC:SRA:read_filter > NCBI:align:project_from_sequence < 'READ_FILTER' > ( .SEQ_SPOT_ID, .SEQ_READ_ID ) | |
747 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_len ); | |
748 | |
749 INSDC:SRA:platform_id out_platform | |
750 = .PLATFORM | |
751 | < INSDC:SRA:platform_id > simple_sub_select < 'SEQUENCE','PLATFORM'> (.SEQ_SPOT_ID) | |
752 | < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > (); | |
753 | |
754 U8 out_alignment_count = <U8> NCBI:align:project_from_sequence < 'ALIGNMENT_COUNT' > ( .SEQ_SPOT_ID, .SEQ_READ_ID ); | |
755 | |
756 /* out_read_type | |
757 * set to SRA_READ_TYPE_FORWARD + SRA_READ_TYPE_BIOLOGICAL | |
758 * which has a constant value of 3 | |
759 */ | |
760 INSDC:SRA:xread_type out_read_type | |
761 = < INSDC:SRA:xread_type > echo < 3 > ( out_read_len ); | |
762 | |
763 // stats inputs | |
764 bool in_stats_bin = HAS_REF_OFFSET; | |
765 | |
766 INSDC:coord:len _alt_in_read_len | |
767 = READ_LEN | |
768 | ( INSDC:coord:len ) row_len #1 ( HAS_REF_OFFSET ); | |
769 | |
770 INSDC:SRA:xread_type _alt_in_read_type | |
771 = READ_TYPE | |
772 | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (_alt_in_read_len); | |
773 | |
774 readonly column ascii MISMATCH_READ | |
775 = NCBI:align:get_mismatch_read ( out_has_mismatch, out_mismatch_dna_text ); | |
776 | |
777 /* Alignment block */ | |
778 | |
779 // MAPQ - single value quality of the mapping; the scale is submitter specific | |
780 extern column < I32 > izip_encoding MAPQ; | |
781 | |
782 extern column INSDC:coord:zero MATE_REF_POS = out_mate_ref_pos; | |
783 extern column INSDC:coord:len MATE_REF_LEN = out_mate_ref_len; | |
784 extern column I64 MATE_REF_ID = out_mate_ref_id; | |
785 extern column I32 TEMPLATE_LEN = out_template_len; | |
786 extern column bool MATE_REF_ORIENTATION = out_mate_ref_orientation; | |
787 readonly column ascii MATE_REF_NAME = NCBI:align:ref_name ( out_mate_ref_id ); | |
788 readonly column ascii MATE_REF_SEQ_ID = NCBI:align:ref_seq_id( out_mate_ref_id ); | |
789 readonly column U8 ALIGNMENT_COUNT = out_alignment_count; | |
790 | |
791 | |
792 /******************************** | |
793 * Columns representing CIGARs | |
794 ********************************/ | |
795 | |
796 | |
797 // one value per base i.e. length is same as sum of READ_LEN | |
798 // partitioned by READ_START and READ_LEN into alternative alignments | |
799 // flags the shifts in reference position preceding the base | |
800 // if sequence of a partitioned read starts with a ref_offset and one or more mismatches | |
801 // then it represents a left soft clip | |
802 // any run of mismatches at the end represents a right soft clip | |
803 | |
804 readonly column ascii HAS_REF_OFFSET = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_ref_offset ); | |
805 extern column bool_encoding HAS_REF_OFFSET; | |
806 bool out_has_ref_offset = .HAS_REF_OFFSET; | |
807 | |
808 // has number of elements equal to number of true elements in HAS_REF_OFFSET | |
809 extern column < I32 > izip_encoding REF_OFFSET; | |
810 I32 out_ref_offset = .REF_OFFSET; | |
811 | |
812 // the type of offset recorded in REF_OFFSET | |
813 extern column < NCBI:align:ro_type > izip_encoding REF_OFFSET_TYPE; | |
814 NCBI:align:ro_type out_ro_type = .REF_OFFSET_TYPE; | |
815 | |
816 // DISPLAY Columns | |
817 | |
818 readonly column I64 ALIGN_ID = row_id (); | |
819 | |
820 // get projection of the reference | |
821 readonly column INSDC:dna:text REF_READ | |
822 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( REF_READ ); | |
823 | |
824 readonly column INSDC:4na:bin REF_READ | |
825 = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len, .REF_PLOIDY) | |
826 | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len ); | |
827 | |
828 INSDC:4na:bin ref_read_internal | |
829 = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal, .REF_PLOIDY) | |
830 | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal); | |
831 | |
832 // text forms of reads | |
833 INSDC:dna:text out_dna_text | |
834 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); | |
835 readonly column INSDC:dna:text RAW_READ | |
836 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_raw_read ); | |
837 readonly column INSDC:4na:bin RAW_READ | |
838 = out_raw_read; | |
839 | |
840 // CIGARs | |
841 readonly column ascii CIGAR_LONG | |
842 = < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
843 | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
844 | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
845 ; | |
846 readonly column INSDC:coord:len CIGAR_LONG_LEN | |
847 = < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
848 | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
849 | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
850 ; | |
851 readonly column ascii CIGAR_SHORT | |
852 = < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
853 | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
854 | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
855 ; | |
856 readonly column INSDC:coord:len CIGAR_SHORT_LEN | |
857 = < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
858 | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
859 | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
860 ; | |
861 | |
862 readonly column ascii RNA_ORIENTATION | |
863 = NCBI:align:rna_orientation ( out_ro_type ) | |
864 ; | |
865 | |
866 readonly column U32 EDIT_DISTANCE | |
867 = NCBI:align:edit_distance #3 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len) | |
868 | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len, out_read_len) | |
869 | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len) | |
870 | NCBI:align:edit_distance #1 (out_has_mismatch, out_has_ref_offset, out_ref_offset); | |
871 | |
872 readonly column ascii HAS_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_mismatch ); | |
873 | |
874 // needed for backward compatibility | |
875 readonly column ascii SEQ_SPOT_GROUP = out_spot_group; | |
876 | |
877 | |
878 /* These columns are purely informational. */ | |
879 bool out_ref_mismatch = NCBI:align:get_ref_mismatch ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); | |
880 readonly column ascii REF_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_mismatch ); | |
881 readonly column bool REF_MISMATCH = out_ref_mismatch; | |
882 | |
883 bool out_ref_insert = NCBI:align:get_ref_insert ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); | |
884 readonly column ascii REF_INSERT = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_insert ); | |
885 readonly column bool REF_INSERT = out_ref_insert; | |
886 | |
887 bool out_ref_delete = NCBI:align:get_ref_delete ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); | |
888 readonly column ascii REF_DELETE = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_delete ); | |
889 readonly column bool REF_DELETE = out_ref_delete; | |
890 | |
891 }; | |
892 | |
893 | |
894 /* align_full | |
895 * aligns externally stored sequence against reference | |
896 * alignment transcript is calculated | |
897 * | |
898 * History: | |
899 * 1.1 - respond to changes in base table | |
900 */ | |
901 table NCBI:align:tbl:align_full #1.1 | |
902 = NCBI:align:tbl:align_cmn #2.1 | |
903 { | |
904 bool out_is_secondary = <bool> echo < true > (); | |
905 // restore reads to its raw form (orientation is restored) | |
906 | |
907 INSDC:4na:bin out_raw_read | |
908 = < INSDC:4na:bin > simple_sub_select < 'PRIMARY_ALIGNMENT', '( INSDC:4na:bin ) RAW_READ' > (.PRIMARY_ALIGNMENT_ID) | |
909 | < INSDC:4na:bin > NCBI:align:project_from_sequence < '( INSDC:4na:bin ) READ'> ( .SEQ_SPOT_ID, .SEQ_READ_ID ); | |
910 | |
911 INSDC:4na:bin out_4na_bin | |
912 = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset, .READ_LEN ) | |
913 | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset ) | |
914 | NCBI:align:raw_restore_read ( out_raw_read, .REF_ORIENTATION ); | |
915 | |
916 | |
917 // flags mismatches with the reference | |
918 // produced by actual comparison of REF_READ and READ | |
919 // TMP_HAS_MISMATCH is a hack to speed up retrieval during coverage recalculation | |
920 column bool_encoding TMP_HAS_MISMATCH; | |
921 bool out_has_mismatch | |
922 = .TMP_HAS_MISMATCH | |
923 | NCBI:align:generate_has_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset ); | |
924 readonly column bool HAS_MISMATCH = out_has_mismatch; | |
925 | |
926 INSDC:4na:bin out_mismatch_4na_bin | |
927 = NCBI:align:generate_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset ); | |
928 | |
929 INSDC:4na:bin tmp_out_mismatch_4na_bin = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( .TMP_MISMATCH ); | |
930 | |
931 // temporary column for reference coverage calculation | |
932 column < INSDC:dna:text> zip_encoding TMP_MISMATCH; | |
933 | |
934 INSDC:dna:text out_mismatch_dna_text | |
935 = .TMP_MISMATCH | |
936 | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin ); | |
937 | |
938 readonly column INSDC:dna:text MISMATCH = out_mismatch_dna_text; | |
939 readonly column INSDC:4na:bin MISMATCH = out_mismatch_4na_bin; | |
940 | |
941 physical column < INSDC:coord:zero > izip_encoding .MATE_REF_POS = MATE_REF_POS; | |
942 INSDC:coord:zero out_mate_ref_pos = .MATE_REF_POS | |
943 | < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID); | |
944 | |
945 physical column < I64 > izip_encoding .MATE_REF_ID = MATE_REF_ID; | |
946 I64 out_mate_ref_id = .MATE_REF_ID | |
947 | < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID); | |
948 | |
949 INSDC:coord:len out_mate_ref_len = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID); | |
950 physical column < I32 > izip_encoding .TEMPLATE_LEN = TEMPLATE_LEN; | |
951 I32 out_template_len = .TEMPLATE_LEN | |
952 | NCBI:align:template_len(REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID); | |
953 | |
954 physical column < bool > izip_encoding .MATE_REF_ORIENTATION = MATE_REF_ORIENTATION; | |
955 bool out_mate_ref_orientation = .MATE_REF_ORIENTATION | |
956 | < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID); | |
957 | |
958 I64 out_mate_align_id = .MATE_ALIGN_ID; | |
959 physical column <I64> izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID; | |
960 extern column I64 MATE_ALIGN_ID = out_mate_align_id; | |
961 | |
962 physical column < I64 > izip_encoding .PRIMARY_ALIGNMENT_ID = PRIMARY_ALIGNMENT_ID; | |
963 | |
964 I32 read_idx = <I32> cast (.SEQ_READ_ID); | |
965 extern column I64 PRIMARY_ALIGNMENT_ID | |
966 = .PRIMARY_ALIGNMENT_ID | |
967 | <I64> simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID' > (.SEQ_SPOT_ID,.SEQ_READ_ID); | |
968 | |
969 }; | |
970 | |
971 | |
972 /* compressed_by_reference | |
973 * aligns internally represented sequence against reference | |
974 * alignment transcript is stored | |
975 * original sequence is reconstructed | |
976 * | |
977 * History: | |
978 * 1.2 - respond to changes in base table | |
979 */ | |
980 table NCBI:align:tbl:compressed_by_reference #1.2 | |
981 = NCBI:align:tbl:align_cmn #2.1 | |
982 { | |
983 bool out_is_secondary = <bool> echo < false > (); | |
984 | |
985 // one value per base i.e. length is same as sum of READ_LEN | |
986 // partitioned by READ_START and READ_LEN into alternative alignments | |
987 // flags mismatches with the reference | |
988 extern default column bool_encoding HAS_MISMATCH; | |
989 bool out_has_mismatch = .HAS_MISMATCH; | |
990 | |
991 // has number of elements equal to number of true elements in HAS_MISMATCH | |
992 extern column INSDC:dna:text MISMATCH | |
993 { | |
994 read = out_mismatch_dna_text; | |
995 validate = < INSDC:dna:text > compare ( in_mismatch_dna_text, out_mismatch_dna_text ); | |
996 } | |
997 | |
998 INSDC:dna:text in_mismatch_dna_text | |
999 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( MISMATCH ); | |
1000 | |
1001 INSDC:4na:bin in_mismatch_4na_bin | |
1002 = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_mismatch_dna_text ); | |
1003 | |
1004 extern column < ascii > zip_encoding ALIGN_GROUP; | |
1005 | |
1006 physical column < INSDC:4na:bin > zip_encoding .MISMATCH = in_mismatch_4na_bin; | |
1007 | |
1008 INSDC:4na:bin out_mismatch_4na_bin = .MISMATCH; | |
1009 INSDC:dna:text out_mismatch_dna_text | |
1010 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin ); | |
1011 | |
1012 I64 out_mate_align_id | |
1013 = .MATE_ALIGN_ID | |
1014 | NCBI:align:get_mate_align_id (.SEQ_SPOT_ID); | |
1015 | |
1016 physical column <I64> izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID; | |
1017 extern column I64 MATE_ALIGN_ID = out_mate_align_id; | |
1018 | |
1019 // restore reads from alignment columns and the reference | |
1020 // optional .READ_LEN size defines PLOIDY | |
1021 INSDC:4na:bin out_4na_bin | |
1022 = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset, .READ_LEN ) | |
1023 | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset ); | |
1024 | |
1025 // restore reads to its raw form (orientation is restored) | |
1026 INSDC:4na:bin out_raw_read = NCBI:align:raw_restore_read (out_4na_bin,.REF_ORIENTATION); | |
1027 | |
1028 I64 primary_align_pair = < I64 > simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID'> (.SEQ_SPOT_ID); | |
1029 I64 out_mate_ref_id = < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID); | |
1030 bool out_mate_ref_orientation = < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID); | |
1031 INSDC:coord:zero out_mate_ref_pos = < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID); | |
1032 INSDC:coord:len out_mate_ref_len = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID); | |
1033 readonly column U32 MATE_EDIT_DISTANCE = < U32 > simple_sub_select < '','EDIT_DISTANCE'> (MATE_ALIGN_ID); | |
1034 readonly column ascii MATE_CIGAR_LONG = < ascii > simple_sub_select < '','CIGAR_LONG'> (MATE_ALIGN_ID); | |
1035 readonly column ascii MATE_CIGAR_SHORT = < ascii > simple_sub_select < '','CIGAR_SHORT'> (MATE_ALIGN_ID); | |
1036 readonly column INSDC:coord:len MATE_CIGAR_LONG_LEN = < INSDC:coord:len > simple_sub_select < '','CIGAR_LONG_LEN'> (MATE_ALIGN_ID); | |
1037 readonly column INSDC:coord:len MATE_CIGAR_SHORT_LEN = < INSDC:coord:len > simple_sub_select < '','CIGAR_SHORT_LEN'> (MATE_ALIGN_ID); | |
1038 | |
1039 I32 out_template_len = NCBI:align:template_len (REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID); | |
1040 }; | |
1041 | |
1042 | |
1043 /* align_sorted | |
1044 * deflated alignment data sorted against reference | |
1045 * inherits from compressed_by_reference #1.2 and global_ref_block #1.0.0 | |
1046 * History: | |
1047 * 1.2 - respond to changes in base table | |
1048 */ | |
1049 table NCBI:align:tbl:align_sorted #1.2 | |
1050 = NCBI:align:tbl:compressed_by_reference #1.2 | |
1051 , NCBI:align:tbl:global_ref_block #1.0.0 | |
1052 { | |
1053 // default column limit: 131072 (128 * 1024, i.e. 128K) | |
1054 column default limit = 131072; | |
1055 }; | |
1056 | |
1057 | |
1058 /* align_unsorted | |
1059 * deflated alignment unsorted data | |
1060 * inherits from compressed_by_reference #1.2 and local_ref_block #1.0.0 | |
1061 * History: | |
1062 * 1.2 - respond to changes in base table | |
1063 */ | |
1064 table NCBI:align:tbl:align_unsorted #1.2 | |
1065 = NCBI:align:tbl:compressed_by_reference #1.2 | |
1066 , NCBI:align:tbl:local_ref_block #1.0.0 | |
1067 { | |
1068 // default column limit: 131072 (128 * 1024, i.e. 128K) | |
1069 column default limit = 131072; | |
1070 }; | |
1071 | |
1072 | |
1073 /* align_mate_sorted | |
1074 * inherits from align_full #1.1 and global_ref_block #1.0.0 | |
1075 * History: | |
1076 * 1.1 - respond to changes in base table | |
1077 */ | |
1078 table NCBI:align:tbl:align_mate_sorted #1.1 | |
1079 = NCBI:align:tbl:align_full #1.1 | |
1080 , NCBI:align:tbl:global_ref_block #1.0.0 | |
1081 { | |
1082 // default column limit: 131072 (128 * 1024, i.e. 128K) | |
1083 column default limit = 131072; | |
1084 }; | |
1085 | |
1086 | |
1087 /* align_mate_unsorted | |
1088 * inherits from align_full #1.1 and local_ref_block #1.0.0 | |
1089 * History: | |
1090 * 1.1 - respond to changes in base table | |
1091 */ | |
1092 table NCBI:align:tbl:align_mate_unsorted #1.1 | |
1093 = NCBI:align:tbl:align_full #1.1 | |
1094 , NCBI:align:tbl:local_ref_block #1.0.0 | |
1095 { | |
1096 // default column limit: 131072 (128 * 1024, i.e. 128K) | |
1097 column default limit = 131072; | |
1098 }; | |
1099 | |
1100 /* align_allele | |
1101 * alleles coverage extension | |
1102 * inherits from align_unsorted #1.2 and adds the EVIDENCE_ALIGNMENT_IDS column | |
1103 * History: | |
1104 * 1.2 - respond to changes in base table | |
1105 */ | |
1106 table NCBI:align:tbl:align_allele #1.2 | |
1107 = NCBI:align:tbl:align_unsorted #1.2 | |
1108 { | |
1109 extern column < I64 > izip_encoding EVIDENCE_ALIGNMENT_IDS; | |
1110 | |
1111 /* commented out: constant phred-30 quality production, kept for reference | |
1112 INSDC:quality:phred out_qual_phred | |
1113 = < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); | |
1114 */ | |
1115 }; | |
1116 | |
1117 /*-------------------------------------------------------------------------- | |
1118 * seq | |
1119 * alignment sequence table; declared first is the I64 alignment_id_encoding: encode = outlier_encode < 0 > then izip, decode = iunzip then outlier_decode < 0 > | |
1120 */ | |
1121 physical | |
1122 I64 NCBI:align:sorted:alignment_id_encoding #1.0 | |
1123 { | |
1124 decode | |
1125 { | |
1126 I64 outliers_removed = iunzip ( @ ); | |
1127 return < I64 > outlier_decode < 0 > ( outliers_removed ); | |
1128 } | |
1129 | |
1130 encode | |
1131 { | |
1132 I64 outliers_removed = < I64 > outlier_encode < 0 > ( @ ); | |
1133 return izip ( outliers_removed ); | |
1134 } | |
1135 } | |
1136 | |
1137 | |
1138 table NCBI:align:tbl:seq #1.1 = | |
1139 NCBI:tbl:base_space #2.0.3, | |
1140 NCBI:tbl:phred_quality #2.0.4, | |
1141 NCBI:align:tbl:cmp_base_space #1, | |
1142 NCBI:SRA:tbl:spotdesc #1.0.2, | |
1143 NCBI:SRA:tbl:stats #1.2.0 | |
1144 { | |
1145 // default column limit: 131072 (128 * 1024, i.e. 128K) | |
1146 column default limit = 131072; | |
1147 | |
1148 // gets primary record in alignment table (size of column is NREADS) | |
1149 // if sorted - should use special encoding | |
1150 extern column <I64> izip_encoding PRIMARY_ALIGNMENT_ID; | |
1151 | |
1152 INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > (); | |
1153 INSDC:coord:len trim_len = _spot_len; | |
1154 | |
1155 // size is NREADS | |
1156 extern column < U8 > zip_encoding ALIGNMENT_COUNT; | |
1157 | |
1158 // auto-generate name from row-id | |
1159 ascii out_name_fmt = < ascii > echo < '$R' > (); | |
1160 | |
1161 // temporary column | |
1162 extern column < U64 > izip_encoding TMP_KEY_ID; | |
1163 | |
1164 // restored READ: rebuilt by seq_restore_read from out_cmp_4na_bin and the stored PRIMARY_ALIGNMENT_ID, READ_LEN and READ_TYPE | |
1165 INSDC:4na:bin out_dcmp_4na_bin | |
1166 = NCBI:align:seq_restore_read (out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE); | |
1167 | |
1168 extern column < U64 > izip_encoding TI; | |
1169 | |
1170 extern column <ascii> zip_encoding CMP_LINKAGE_GROUP; | |
1171 | |
1172 // restored LINKAGE_GROUP, with the stored .CMP_LINKAGE_GROUP as the alternative production | |
1173 readonly column ascii LINKAGE_GROUP = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID) | |
1174 | .CMP_LINKAGE_GROUP; | |
1175 }; | |
1176 | |
1177 | |
1178 table NCBI:align:tbl:cs_seq #1.2 | |
1179 { | |
1180 /* writable columns */ | |
1181 extern column INSDC:color:text CMP_CSREAD | |
1182 = out_cmp_color_text | |
1183 ; | |
1184 | |
1185 extern column < INSDC:dna:text > zip_encoding CS_KEY; | |
1186 | |
1187 extern default column < INSDC:quality:phred > zip_encoding QUALITY; | |
1188 | |
1189 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID; | |
1190 | |
1191 extern column < U8 > zip_encoding ALIGNMENT_COUNT; | |
1192 | |
1193 extern column < INSDC:SRA:platform_id > zip_encoding PLATFORM; | |
1194 | |
1195 extern column < ascii > zip_encoding LABEL; | |
1196 extern column < INSDC:coord:zero > izip_encoding LABEL_START; | |
1197 extern column < INSDC:coord:len > izip_encoding LABEL_LEN; | |
1198 | |
1199 extern column < INSDC:SRA:xread_type > zip_encoding READ_TYPE; | |
1200 extern column < INSDC:coord:zero > izip_encoding READ_START; | |
1201 extern column < INSDC:coord:len > izip_encoding READ_LEN; | |
1202 extern column < INSDC:SRA:read_filter > zip_encoding READ_FILTER; | |
1203 | |
1204 extern column < U64 > izip_encoding TMP_KEY_ID; | |
1205 | |
1206 extern column < ascii > zip_encoding SPOT_GROUP; | |
1207 | |
1208 extern column < U64 > izip_encoding TI; | |
1209 | |
1210 /* writing rules */ | |
1211 INSDC:x2cs:bin in_cmp_x2cs_bin | |
1212 = < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( CMP_CSREAD ) | |
1213 ; | |
1214 INSDC:2cs:bin in_cmp_2cs_bin | |
1215 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin ) | |
1216 ; | |
1217 INSDC:x2cs:bin in_cmp_alt_x2cs_bin | |
1218 = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin ) | |
1219 ; | |
1220 physical column INSDC:2cs:packed .CMP_CSREAD | |
1221 = ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin ) | |
1222 ; | |
1223 physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD | |
1224 = < INSDC:x2cs:bin > trim < ALIGN_LEFT, 0 > ( in_cmp_alt_x2cs_bin ) | |
1225 ; | |
1226 | |
1227 /* reading rules */ | |
1228 INSDC:2cs:packed phys_cmp_2cs_packed | |
1229 = .CMP_CSREAD | |
1230 ; | |
1231 INSDC:x2cs:bin phys_cmp_alt_x2cs_bin | |
1232 = .CMP_ALTCSREAD | |
1233 ; | |
1234 INSDC:2cs:packed phys_2cs_packed | |
1235 = .CSREAD | |
1236 ; | |
1237 INSDC:x2cs:bin phys_alt_x2cs_bin | |
1238 = .ALTCSREAD | |
1239 ; | |
1240 INSDC:2cs:bin out_cmp_2cs_bin | |
1241 = ( INSDC:2cs:bin ) unpack ( phys_cmp_2cs_packed ) | |
1242 ; | |
1243 INSDC:2cs:bin out_2cs_bin | |
1244 = ( INSDC:2cs:bin ) unpack ( phys_2cs_packed ) | |
1245 ; | |
1246 INSDC:x2cs:bin out_cmp_x2cs_bin | |
1247 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, phys_cmp_alt_x2cs_bin ) | |
1248 | ( INSDC:x2cs:bin ) out_cmp_2cs_bin | |
1249 ; | |
1250 INSDC:x2cs:bin out_x2cs_bin | |
1251 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, phys_alt_x2cs_bin ) | |
1252 | ( INSDC:x2cs:bin ) out_2cs_bin | |
1253 ; | |
1254 INSDC:color:text out_cmp_color_text | |
1255 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin ) | |
1256 ; | |
1257 INSDC:color:text out_color_text | |
1258 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ) | |
1259 ; | |
1260 | |
1261 /* triggers from stats */ | |
1262 INSDC:quality:phred in_qual_phred | |
1263 = QUALITY | |
1264 ; | |
1265 INSDC:coord:len in_read_len | |
1266 = READ_LEN | |
1267 ; | |
1268 INSDC:SRA:xread_type in_read_type | |
1269 = READ_TYPE | |
1270 ; | |
1271 ascii in_spot_group | |
1272 = SPOT_GROUP | |
1273 ; | |
1274 trigger meta_stats | |
1275 = NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type, in_spot_group ) | |
1276 | NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type ) | |
1277 ; | |
1278 trigger qual_stats | |
1279 = NCBI:SRA:phred_stats_trigger #1 ( in_qual_phred ) | |
1280 ; | |
1281 | |
1282 extern column <ascii> zip_encoding CMP_LINKAGE_GROUP; | |
1283 | |
1284 // restored LINKAGE_GROUP | |
1285 readonly column ascii LINKAGE_GROUP = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID) | |
1286 | .CMP_LINKAGE_GROUP; | |
1287 }; | |
1288 | |
1289 table NCBI:align:view:cs_seq #1.1 = NCBI:align:tbl:cs_seq #1.2 | |
1290 { | |
1291 // various READ columns | |
1292 default readonly column INSDC:dna:text READ | |
1293 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_dcmp_4na_bin ) | |
1294 | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) | |
1295 ; | |
1296 readonly column INSDC:4na:bin READ = out_dcmp_4na_bin | out_4na_bin; | |
1297 readonly column INSDC:4na:packed READ = pack ( out_dcmp_4na_bin ) | pack ( out_4na_bin ); | |
1298 readonly column INSDC:x2na:bin READ = out_dcmp_x2na_bin | out_x2na_bin; | |
1299 readonly column INSDC:2na:bin READ = out_dcmp_2na_bin | out_2na_bin; | |
1300 INSDC:2na:bin out_dcmp_2na_bin | |
1301 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2na_bin ) | |
1302 ; | |
1303 INSDC:2na:bin out_2na_bin | |
1304 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ) | |
1305 ; | |
1306 readonly column INSDC:2na:packed READ = pack ( out_dcmp_2na_bin ) | pack ( out_2na_bin ); | |
1307 | |
1308 // decompression in base space | |
1309 INSDC:coord:len cmp_read_len | |
1310 = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < true > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) | |
1311 ; | |
1312 INSDC:coord:zero cmp_read_start | |
1313 = NCBI:align:make_read_start #1 ( cmp_read_len ) | |
1314 ; | |
1315 INSDC:x2na:bin out_cmp_x2na_bin | |
1316 = NCBI:dna_from_color #1 ( out_cmp_x2cs_bin, cmp_read_start, cmp_read_len, .CS_KEY, color_matrix ) | |
1317 ; | |
1318 INSDC:x2na:bin out_x2na_bin | |
1319 = NCBI:dna_from_color #1 ( out_x2cs_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) | |
1320 ; | |
1321 INSDC:4na:bin out_cmp_4na_bin | |
1322 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_cmp_x2na_bin ) | |
1323 ; | |
1324 INSDC:4na:bin out_4na_bin | |
1325 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ) | |
1326 ; | |
1327 INSDC:4na:bin out_dcmp_4na_bin | |
1328 = NCBI:align:seq_restore_read ( out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE ) | |
1329 ; | |
1330 | |
1331 | |
1332 // various CSREAD columns: each overload prefers the decompressed (out_dcmp_*) production and falls back to the stored color-space production of the same type | |
1333 default readonly column INSDC:color:text CSREAD | |
1334 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_dcmp_x2cs_bin ) | |
1335 | out_color_text; | |
1336 readonly column INSDC:x2cs:bin CSREAD = out_dcmp_x2cs_bin | out_x2cs_bin; | |
1337 readonly column INSDC:2cs:bin CSREAD = out_dcmp_2cs_bin | out_2cs_bin; | |
1338 INSDC:2cs:bin out_dcmp_2cs_bin | |
1339 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2cs_bin ) | |
1340 ; | |
1341 readonly column INSDC:2cs:packed CSREAD = pack ( out_dcmp_2cs_bin ) | pack ( out_2cs_bin ); | |
1342 | |
1343 | |
1344 // decompression in color space | |
1345 INSDC:x2na:bin out_dcmp_x2na_bin | |
1346 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin ) | |
1347 ; | |
1348 INSDC:x2cs:bin out_dcmp_x2na_x2cs_bin | |
1349 = NCBI:color_from_dna #1 ( out_dcmp_x2na_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) | |
1350 ; | |
1351 INSDC:coord:len aligned_read_len | |
1352 = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < false > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) | |
1353 ; | |
1354 INSDC:x2cs:bin out_dcmp_x2cs_bin | |
1355 = < INSDC:x2cs:bin > NCBI:align:seq_construct_read #1 ( out_dcmp_x2na_x2cs_bin, .READ_LEN, out_cmp_x2cs_bin, cmp_read_len ) | |
1356 ; | |
1357 | |
1358 // CS_NATIVE - dynamic: row length of the stored CMP_CSREAD (phys_cmp_2cs_packed) is clipped to 0..1 and mapped to false/true | |
1359 U32 cmp_csread_row_len | |
1360 = row_len #1 ( phys_cmp_2cs_packed ) | |
1361 ; | |
1362 U32 cmp_csread_not_zero | |
1363 = < U32 > clip < 0, 1 > ( cmp_csread_row_len ) | |
1364 ; | |
1365 readonly column bool CS_NATIVE | |
1366 = < U32, bool > map < [ 0, 1 ], [ false, true ] > ( cmp_csread_not_zero ) | |
1367 ; | |
1368 | |
1369 // COLOR_MATRIX | |
1370 readonly column U8 COLOR_MATRIX | |
1371 = color_matrix | |
1372 ; | |
1373 U8 color_matrix | |
1374 = < U8 > echo < INSDC:color:default_matrix > () | |
1375 ; | |
1376 | |
1377 // various QUALITY types | |
1378 readonly column INSDC:quality:text:phred_33 QUALITY | |
1379 = out_qual_text_phred_33 | |
1380 | ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( .QUALITY ); | |
1381 readonly column INSDC:quality:text:phred_64 QUALITY | |
1382 = out_qual_text_phred_64 | |
1383 | ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( .QUALITY ); | |
1384 | |
1385 // SPOT_LEN | |
1386 INSDC:coord:len spot_len | |
1387 = ( INSDC:coord:len ) row_len ( out_dcmp_4na_bin ) | |
1388 | ( INSDC:coord:len ) row_len ( out_4na_bin ) | |
1389 ; | |
1390 readonly column INSDC:coord:len SPOT_LEN = spot_len; | |
1391 | |
1392 // TRIM_START | |
1393 readonly column INSDC:coord:zero TRIM_START | |
1394 = < INSDC:coord:zero > echo < 0 > () | |
1395 ; | |
1396 readonly column INSDC:coord:one TRIM_START | |
1397 = < INSDC:coord:one > echo < 1 > () | |
1398 ; | |
1399 // TRIM_LEN | |
1400 readonly column INSDC:coord:len TRIM_LEN = spot_len; | |
1401 | |
1402 // MIN_SPOT_ID | |
1403 readonly column INSDC:SRA:spotid_t MIN_SPOT_ID | |
1404 = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MIN" > () | |
1405 ; | |
1406 // MAX_SPOT_ID | |
1407 readonly column INSDC:SRA:spotid_t MAX_SPOT_ID | |
1408 = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MAX" > () | |
1409 ; | |
1410 // SPOT_COUNT | |
1411 readonly column U64 SPOT_COUNT | |
1412 = < U64 > meta:value < "STATS/TABLE/SPOT_COUNT" > () | |
1413 ; | |
1414 // BASE_COUNT | |
1415 U64 base_count | |
1416 = < U64 > meta:value < "STATS/TABLE/BASE_COUNT" > () | |
1417 ; | |
1418 readonly column U64 BASE_COUNT = base_count; | |
1419 // BIO_BASE_COUNT | |
1420 readonly column U64 BIO_BASE_COUNT | |
1421 = < U64 > meta:value < "STATS/TABLE/BIO_BASE_COUNT" > () | |
1422 ; | |
1423 // CMP_BASE_COUNT | |
1424 readonly column U64 CMP_BASE_COUNT | |
1425 = < U64 > meta:value < "STATS/TABLE/CMP_BASE_COUNT" > () | |
1426 | base_count | |
1427 ; | |
1428 | |
1429 // various PLATFORM | |
1430 // TBD | |
1431 | |
1432 // SPOT_ID | |
1433 I64 rowid_64 = row_id (); | |
1434 readonly column INSDC:SRA:spotid_t SPOT_ID | |
1435 = cast ( rowid_64 ) | |
1436 ; | |
1437 | |
1438 readonly column ascii NAME | |
1439 = sprintf < "%u" > ( SPOT_ID ) | |
1440 ; | |
1441 | |
1442 }; | |
1443 | |
1444 | |
1445 /*********************************** | |
1446 * Reference table - to store reference sequences | |
1447 * Sequences are divided in chunks. Two sequences never share a chunk. | |
1448 * SEQ_LEN - real size of a chunk should never exceed MAX_SEQ_LEN when it is set | |
1449 * READ - inherited from NCBI:tbl:base_space | |
1450 * CMP_READ,CMP_ALTREAD - are inherited from NCBI:align:tbl:cmp_base_space | |
1451 * SEQ_ID,SEQ_START,SEQ_LEN are inherited from NCBI:tbl:seqloc | |
1452 * .skey contains NAME of the chunk - it corresponds to actual name used in BAM (chr1,chr2, etc....) | |
1453 * | |
1454 * SEQ_START,SEQ_LEN,MAX_SEQ_LEN,SEQ_ID and rowlen(READ) operate in the following way | |
1455 * - SEQ_LEN < MAX_SEQ_LEN - should only happen on the last chunk of the sequence | |
1456 * - .READ is absent - there should be a retrieval from external services by SEQ_ID,SEQ_START,SEQ_LEN | |
1457 * - rowlen(.READ) = 0 && SEQ_START==0 (used as flag) - the sequence is SEQ_LEN repetition of 'N' | |
1458 * - rowlen(.READ) = 0 && SEQ_START >= 1 - the sequence has to be fetched from external sources | |
1459 * - 0 < rowlen(.READ) < SEQ_LEN -- the sequence has to be filled with 'N's | |
1460 * | |
1461 ***********************************/ | |
1462 table NCBI:align:tbl:reference #2 = | |
1463 NCBI:align:tbl:cmp_base_space #1, | |
1464 NCBI:tbl:base_space #2.0.3, | |
1465 NCBI:tbl:seqloc #1, | |
1466 NCBI:SRA:tbl:stats #1.2.0 | |
1467 { | |
1468 INSDC:quality:phred out_qual_phred | |
1469 = < INSDC:quality:phred > echo < 30 > ( out_dcmp_4na_bin ); | |
1470 | |
1471 // MAX_SEQ_LEN - should be a constant == static column | |
1472 extern column < U32 > izip_encoding MAX_SEQ_LEN; | |
1473 | |
1474 // indicates if sequence has circular structure | |
1475 // copied from refSeq | |
1476 extern column bool_encoding CIRCULAR; | |
1477 | |
1478 // make CS_KEY writable | |
1479 INSDC:dna:text in_cs_key | |
1480 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn', 'ACGTN' > ( CS_KEY ); | |
1481 physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key; | |
1482 | |
1483 U32 in_spot_len = SEQ_LEN; | |
1484 | |
1485 INSDC:coord:len _alt_in_read_len | |
1486 = READ_LEN | |
1487 | SEQ_LEN; | |
1488 | |
1489 INSDC:SRA:xread_type _alt_in_read_type | |
1490 = READ_TYPE | |
1491 | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (); | |
1492 | |
1493 // extra columns needed for CS conversion | |
1494 INSDC:coord:zero out_read_start = < INSDC:coord:zero > echo < 0 > (); | |
1495 INSDC:coord:len out_read_len = .SEQ_LEN; | |
1496 | |
1497 extern column utf8 NAME = out_spot_name_utf8; | |
1498 physical utf8 .NAME = idx:text:insert #1.0 < 'i_name' > ( NAME ); | |
1499 | |
1500 utf8 out_spot_name_utf8 = idx:text:project #1.0 < 'i_name' > (.NAME ); | |
1501 | |
1502 ascii out_spot_name = cast ( out_spot_name_utf8 ); | |
1503 | |
1504 INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > (); | |
1505 INSDC:coord:len trim_len = base_space_spot_len; | |
1506 | |
1507 ascii out_label | |
1508 = < ascii > echo < "reference" > (); | |
1509 INSDC:coord:zero out_label_start | |
1510 = < INSDC:coord:zero > echo < 0 > (); | |
1511 INSDC:coord:len out_label_len | |
1512 = < INSDC:coord:len > echo < 9 > (); | |
1513 | |
1514 U32 out_nreads | |
1515 = < U32 > echo < 1 > (); | |
1516 INSDC:SRA:xread_type out_read_type | |
1517 = < INSDC:SRA:xread_type > echo < 3 > (); | |
1518 INSDC:SRA:read_filter out_rd_filter | |
1519 = < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > (); | |
1520 | |
1521 | |
1522 // Columns of computed coverages by alignment | |
1523 | |
1524 // TBD: use percentiles instead of min/max? | |
1525 // maximum value clipped at 255 of the coverage density | |
1526 // for a chunk | |
1527 extern column < U8 > izip_encoding CGRAPH_HIGH; | |
1528 | |
1529 // minimum value clipped at 255 of the coverage density | |
1530 // for a chunk | |
1531 extern column < U8 > izip_encoding CGRAPH_LOW; | |
1532 | |
1533 // count of the number of mismatches in the chunk | |
1534 extern column < U32 > izip_encoding CGRAPH_MISMATCHES; | |
1535 | |
1536 // count of the number of inserts and deletes in the chunk | |
1537 extern column < U32 > izip_encoding CGRAPH_INDELS; | |
1538 | |
1539 // List of row ids from alignment tables | |
1540 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS; | |
1541 extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS; | |
1542 extern column < I64 > izip_encoding EVIDENCE_INTERVAL_IDS; | |
1543 | |
1544 // both OVERLAP_REF_* columns are arrays of three elements, matching the number of *_IDS columns above. | |
1545 // points back to an offset where the alignments to this chunk start | |
1546 extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS; | |
1547 // indicates the length of the longest tail of the alignments to this chunk which start in previous chunks | |
1548 // if value of an element in this col is zero corresponding value of OVERLAP_REF_POS is meaningless | |
1549 extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN; | |
1550 | |
1551 // Mechanism to search for NAME | |
1552 readonly column vdb:row_id_range NAME_RANGE | |
1553 = idx:text:lookup #1.0 < 'i_name', 'QUERY_SEQ_NAME' > (); | |
1554 | |
1555 // Fully instantiates READ | |
1556 INSDC:4na:bin out_dcmp_4na_bin | |
1557 = NCBI:align:ref_restore_read (out_cmp_4na_bin, .SEQ_ID, .SEQ_START, .SEQ_LEN); | |
1558 } | |
1559 | |
1560 // THE DATABASES | |
// Database NCBI:align:db:alignment_sorted, schema version 1.3.
// Declares six member tables: REFERENCE, PRIMARY_ALIGNMENT,
// SECONDARY_ALIGNMENT, SEQUENCE, CS_SEQUENCE and QUAL_STAT.
// Both alignment members use the "_sorted" table types
// (align_sorted for primary, align_mate_sorted for secondary).
1561 database NCBI:align:db:alignment_sorted #1.3 | |
1562 { | |
1563 table NCBI:align:tbl:reference #2 REFERENCE; | |
1564 table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT; | |
1565 table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT; | |
1566 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
1567 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
1568 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
1569 }; | |
1570 | |
// Database NCBI:align:db:alignment_unsorted, schema version 1.3.
// Declares the member names REFERENCE, PRIMARY_ALIGNMENT,
// SECONDARY_ALIGNMENT, SEQUENCE, CS_SEQUENCE and QUAL_STAT.
// Both alignment members use the "_unsorted" table types
// (align_unsorted for primary, align_mate_unsorted for secondary).
1571 database NCBI:align:db:alignment_unsorted #1.3 | |
1572 { | |
1573 table NCBI:align:tbl:reference #2 REFERENCE; | |
1574 table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT; | |
1575 table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT; | |
1576 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
1577 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
1578 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
1579 }; | |
1580 | |
// Database NCBI:align:db:alignment_evidence, schema version 1.3.
// Uses the unsorted primary/secondary alignment table types and adds
// two evidence members:
//   EVIDENCE_INTERVAL  - typed NCBI:align:tbl:align_allele #1.2
//   EVIDENCE_ALIGNMENT - typed NCBI:align:tbl:align_mate_unsorted #1.1
// The remaining members are REFERENCE, SEQUENCE, CS_SEQUENCE and QUAL_STAT.
1581 database NCBI:align:db:alignment_evidence #1.3 | |
1582 { | |
1583 table NCBI:align:tbl:reference #2 REFERENCE; | |
1584 table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT; | |
1585 table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT; | |
1586 table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL; | |
1587 table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT; | |
1588 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
1589 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
1590 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
1591 }; | |
1592 | |
// Database NCBI:align:db:alignment_evidence_sorted, schema version 1.2.
// Uses the sorted primary/secondary alignment table types (align_sorted,
// align_mate_sorted) together with the EVIDENCE_INTERVAL and
// EVIDENCE_ALIGNMENT members. EVIDENCE_ALIGNMENT is typed
// align_mate_unsorted here, even though the primary and secondary
// members use the sorted types.
// NOTE(review): this database is declared at #1.2 while the other three
// alignment databases in this file are at #1.3 - confirm this is intended.
1593 database NCBI:align:db:alignment_evidence_sorted #1.2 | |
1594 { | |
1595 table NCBI:align:tbl:reference #2 REFERENCE; | |
1596 table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT; | |
1597 table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT; | |
1598 table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL; | |
1599 table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT; | |
1600 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
1601 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
1602 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
1603 }; | |
1604 | |
// Database NCBI:align:db:unaligned, schema version 1.
// Declares only SEQUENCE, CS_SEQUENCE and QUAL_STAT; it has no REFERENCE
// or alignment members. CS_SEQUENCE is typed NCBI:SRA:ABI:tbl:v2 #1.0.4
// here, whereas the alignment databases in this file type it as
// NCBI:align:view:cs_seq #1.1.
1605 database NCBI:align:db:unaligned #1 | |
1606 { | |
1607 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
1608 table NCBI:SRA:ABI:tbl:v2 #1.0.4 CS_SEQUENCE; | |
1609 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
1610 }; | |