Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/align/align.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * VDB Alignment types, functions and tables | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'vdb/vdb.vschema'; | |
| 33 include 'ncbi/seq.vschema'; | |
| 34 include 'ncbi/sra.vschema'; | |
| 35 include 'ncbi/stats.vschema'; | |
| 36 include 'align/seq.vschema'; | |
| 37 include 'align/qstat.vschema'; | |
| 38 include 'sra/abi.vschema'; | |
| 39 include 'align/mate-cache.vschema'; | |
| 40 | |
| 41 | |
| 42 /*-------------------------------------------------------------------------- | |
| 43 * data types | |
| 44 */ | |
| 45 | |
| 46 /* ploidy | |
| 47 * the number of sets of chromosomes in a cell | |
| 48 */ | |
| 49 typedef U32 NCBI:align:ploidy; | |
| 50 | |
| 51 /* ro_type | |
| 52 * the type of event causing ref-offset | |
| 53 */ | |
| 54 typedef U8 NCBI:align:ro_type; | |
| 55 | |
| 56 const NCBI:align:ro_type NCBI:align:ro_normal = 0; // normal ref-offset | |
| 57 const NCBI:align:ro_type NCBI:align:ro_soft_clip = 1; // soft-clipping | |
| 58 const NCBI:align:ro_type NCBI:align:ro_intron_plus = 2; // intron on positive strand | |
| 59 const NCBI:align:ro_type NCBI:align:ro_intron_minus = 3; // intron on negative strand | |
| 60 const NCBI:align:ro_type NCBI:align:ro_intron_unknown = 4; // intron strand not specified | |
| 61 const NCBI:align:ro_type NCBI:align:ro_complete_genomics = 5; // | |
| 62 | |
| 63 | |
| 64 /*-------------------------------------------------------------------------- | |
| 65 * functions | |
| 66 */ | |
| 67 | |
| 68 | |
| 69 /* cigar | |
| 70 * construct "cigar" alignment string or length arrays | |
| 71 * | |
| 72 * "ctype" [ CONST ] - select variant of format | |
| 73 * 0 => both matches and mismatches represented as M | |
| 74 * 1 => matches represented as '=' mismatches as 'X' | |
| 75 * | |
| 76 * "has_mismatch" [ DATA ] - a boolean for each base in aligned sequence | |
| 77 * where a value of false means the base aligned to the reference | |
| 78 * | |
| 79 * "has_ref_offset" [ DATA ] - a boolean for each base in the aligned sequence | |
| 80 * where a value of true means there is a corresponding offset to position on reference | |
| 81 * | |
| 82 * "ref_offset" [ DATA ] - a packed sequence of signed offsets to aligned position | |
| 83 * one entry for every true in "has_ref_offset" | |
| 84 * | |
| 85 * "read_len" [ DATA ] - v2: elem_count defines PLOIDY and values are an actual length of reads in spot | |
| 86 */ | |
| 87 extern function | |
| 88 ascii NCBI:align:cigar #1 < U8 ctype > ( bool has_mismatch, bool has_ref_offset, | |
| 89 I32 ref_offset, * INSDC:coord:len ref_len ) = ALIGN:cigar; | |
| 90 | |
| 91 /* history: | |
| 92 * 2.1 - added "ref_offset_type" optional parameter | |
| 93 * NB - reverting to 2.0 due to linker bug in older code | |
| 94 */ | |
| 95 extern function < type T > | |
| 96 T NCBI:align:cigar #2.0 < U8 ctype > ( bool has_mismatch, bool has_ref_offset, | |
| 97 I32 ref_offset, INSDC:coord:len read_len, * INSDC:coord:len ref_len, NCBI:align:ro_type ref_offset_type ) | |
| 98 = ALIGN:cigar_2; | |
| 99 | |
| 100 extern function U32 NCBI:align:edit_distance #1 | |
| 101 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset ); | |
| 102 | |
| 103 extern function U32 NCBI:align:edit_distance #2 | |
| 104 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len, *INSDC:coord:len read_len) | |
| 105 = NCBI:align:edit_distance_2; | |
| 106 | |
| 107 extern function U32 NCBI:align:edit_distance #3 | |
| 108 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len) | |
| 109 = NCBI:align:edit_distance_3; | |
| 110 | |
| 111 /* rna_orientation | |
| 112 * reads column REF_OFFSET_TYPE | |
| 113 * returns '+' if has: | |
| 114 * at least one NCBI:align:ro_intron_plus | |
| 115 * none of NCBI:align:ro_intron_minus | |
| 116 * returns '-' if has: | |
| 117 * at least one NCBI:align:ro_intron_minus | |
| 118 * none of NCBI:align:ro_intron_plus | |
| 119 * returns empty string otherwise | |
| 120 */ | |
| 121 extern function | |
| 122 ascii NCBI:align:rna_orientation #1 ( NCBI:align:ro_type ref_offset_type ); | |
| 123 | |
| 124 /* project_from_sequence | |
| 125 * projects column from SEQUENCE | |
| 126 * | |
| 127 * "T" [ TYPE ] | |
| 128 * | |
| 129 * "col" [ CONST ] | |
| 130 * "use_read_len" [ CONST ] whether subset by read_len or by read_id only | |
| 131 * | |
| 132 * "seq_spot_id" [ DATA ] | |
| 133 * | |
| 134 * "seq_read_id" [ DATA ] | |
| 135 */ | |
| 136 extern function < type T > | |
| 137 T NCBI:align:project_from_sequence #1 < ascii col> ( I64 seq_spot_id, INSDC:coord:one seq_read_id ) | |
| 138 = ALIGN:project_from_sequence; | |
| 139 | |
| 140 | |
| 141 /* align_restore_read | |
| 142 * restores read by applying alignment-based difference to ref_read | |
| 143 * | |
| 144 * "ref_read" [ DATA ] | |
| 145 * | |
| 146 * "has_mismatch" [ DATA ] and "mismatch" [ DATA ] | |
| 147 * | |
| 148 * "has_ref_offset" [ DATA ] and "ref_offset" [ DATA ] | |
| 149 */ | |
| 150 extern function | |
| 151 INSDC:4na:bin NCBI:align:align_restore_read #1 ( INSDC:4na:bin ref_read, bool has_mismatch, | |
| 152 INSDC:4na:bin mismatch, bool has_ref_offset, I32 ref_offset * INSDC:coord:len read_len) | |
| 153 = ALIGN:align_restore_read; | |
| 154 | |
| 155 | |
| 156 /* raw_restore_read | |
| 157 * restores read by applying alignment-based difference to align_read | |
| 158 * | |
| 159 * "align_read" [ DATA ] | |
| 160 * | |
| 161 * "ref_orientation" [ DATA ] | |
| 162 */ | |
| 163 extern function | |
| 164 INSDC:4na:bin NCBI:align:raw_restore_read #1 ( INSDC:4na:bin align_read, bool ref_orientation ) | |
| 165 = ALIGN:raw_restore_read; | |
| 166 | |
| 167 | |
| 168 /* raw_restore_qual | |
| 169 * restores quality by applying alignment-based difference to align_qual | |
| 170 * | |
| 171 * "align_qual" [ DATA ] | |
| 172 * | |
| 173 * "ref_orientation" [ DATA ] | |
| 174 */ | |
| 175 extern function | |
| 176 INSDC:quality:phred NCBI:align:raw_restore_qual #1 ( INSDC:quality:phred align_qual, bool ref_orientation ); | |
| 177 | |
| 178 | |
| 179 /* ref_sub_select | |
| 180 * projects reference from sequence | |
| 181 * | |
| 182 * "id" [ DATA ] | |
| 183 * | |
| 184 * "start" [ DATA ] and "len" [ DATA ] | |
| 185 * | |
| 186 * "ref_ploidy" [ DATA, OPTIONAL ] | |
| 187 */ | |
| 188 extern function | |
| 189 INSDC:4na:bin NCBI:align:ref_sub_select #1 ( I64 id, INSDC:coord:zero start, | |
| 190 INSDC:coord:len len * U32 ref_ploidy) | |
| 191 = ALIGN:ref_sub_select; | |
| 192 | |
| 193 | |
| 194 /* ref_restore_read | |
| 195 * restores read from central storage | |
| 196 * | |
| 197 * "cmp_rd" [ DATA ] | |
| 198 * | |
| 199 * "seq_id" [ DATA ] | |
| 200 * | |
| 201 * "seq_start" [ DATA ] and "seq_len" [ DATA ] | |
| 202 */ | |
| 203 extern function | |
| 204 INSDC:4na:bin NCBI:align:ref_restore_read #1 ( INSDC:4na:bin cmp_rd, ascii seq_id, | |
| 205 INSDC:coord:one seq_start, INSDC:coord:len seq_len) | |
| 206 = ALIGN:ref_restore_read; | |
| 207 | |
| 208 | |
| 209 /* seq_restore_read | |
| 210 * projects read from align_deflate table to SEQUENCE | |
| 211 * | |
| 212 * "cmp_rd" [ DATA ] | |
| 213 * | |
| 214 * "align_id" [ DATA ] | |
| 215 * | |
| 216 * "read_len" [ DATA ] | |
| 217 * | |
| 218 * "rd_type" [ DATA ] | |
| 219 */ | |
| 220 extern function | |
| 221 INSDC:4na:bin NCBI:align:seq_restore_read #1 ( INSDC:4na:bin cmp_rd, I64 align_id, | |
| 222 INSDC:coord:len read_len, INSDC:SRA:xread_type rd_type ) | |
| 223 = ALIGN:seq_restore_read; | |
| 224 | |
| 225 | |
| 226 /* seq_restore_linkage_group | |
| 227 * projects LINKAGE_GROUP from PRIMARY_ALIGNMENT table to SEQUENCE | |
| 228 * | |
| 229 * "cmp_linkage_group" [ DATA ] | |
| 230 * | |
| 231 * "align_id" [ DATA ] | |
| 232 */ | |
| 233 extern function | |
| 234 ascii NCBI:align:seq_restore_linkage_group #1 ( ascii cmp_linkage_group, | |
| 235 I64 align_id ) | |
| 236 = ALIGN:seq_restore_linkage_group; | |
| 237 | |
| 238 | |
| 239 /* generate_has_mismatch | |
| 240 * generates has mismatch by doing actual compare of reference and subject, | |
| 241 * *ref_offsets move comparisons reference-wise | |
| 242 * | |
| 243 * "reference" [ DATA ] | |
| 244 * | |
| 245 * "subject" [ DATA ] | |
| 246 * | |
| 247 * "has_ref_offset" [ DATA ] | |
| 248 * | |
| 249 * "ref_offset" [ DATA ] | |
| 250 */ | |
| 251 extern function | |
| 252 bool NCBI:align:generate_has_mismatch #1 ( INSDC:4na:bin reference, | |
| 253 INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset) | |
| 254 = ALIGN:generate_has_mismatch; | |
| 255 | |
| 256 | |
| 257 /* generate_mismatch | |
| 258 * | |
| 259 * "reference" [ DATA ] | |
| 260 * | |
| 261 * "subject" [ DATA ] | |
| 262 * | |
| 263 * "has_ref_offset" [ DATA ] | |
| 264 * | |
| 265 * "ref_offset" [ DATA ] | |
| 266 */ | |
| 267 extern function | |
| 268 INSDC:4na:bin NCBI:align:generate_mismatch #1 ( INSDC:4na:bin reference, | |
| 269 INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset ) | |
| 270 = ALIGN:generate_mismatch; | |
| 271 | |
| 272 | |
| 273 /* ref_pos | |
| 274 * retrieves the alignment's positions on the reference | |
| 275 * one per PLOIDY | |
| 276 * | |
| 277 * "ref_id" [ DATA ] | |
| 278 * | |
| 279 * "ref_start" [ DATA ] - one per PLOIDY | |
| 280 */ | |
| 281 extern function | |
| 282 INSDC:coord:zero NCBI:align:ref_pos #1 ( I64 ref_id, INSDC:coord:zero ref_start ); | |
| 283 | |
| 284 | |
| 285 /* ref_name | |
| 286 * retrieve the name from the reference | |
| 287 * | |
| 288 * "ref_id" [ DATA ] | |
| 289 */ | |
| 290 extern function | |
| 291 ascii NCBI:align:ref_name #1 ( I64 ref_id ); | |
| 292 | |
| 293 | |
| 294 /* ref_seq_id | |
| 295 * retrieve the seq_id from the reference | |
| 296 * | |
| 297 * "ref_id" [ DATA ] | |
| 298 */ | |
| 299 extern function | |
| 300 ascii NCBI:align:ref_seq_id #1 ( I64 ref_id ); | |
| 301 | |
| 302 | |
| 303 /* local_ref_id | |
| 304 * convert global ref_start into ref_id | |
| 305 */ | |
| 306 extern function | |
| 307 I64 NCBI:align:local_ref_id #1 ( U64 global_ref_start ); | |
| 308 | |
| 309 | |
| 310 /* local_ref_start | |
| 311 * convert global ref_start into local ref_start (offset within the reference row) | |
| 312 */ | |
| 313 extern function | |
| 314 INSDC:coord:zero NCBI:align:local_ref_start #1 ( U64 global_ref_start ); | |
| 315 | |
| 316 /* not_my_row | |
| 317 * removes current row_id from the list | |
| 318 */ | |
| 319 extern function I64 NCBI:align:not_my_row #1 ( I64 list ); | |
| 320 | |
| 321 /* template_len | |
| 322 * compute template length, i.e. the distance from the left-most to the | |
| 323 * right-most matching reference position | |
| 324 */ | |
| 325 extern function I32 NCBI:align:template_len #1 ( | |
| 326 INSDC:coord:zero pos, INSDC:coord:zero mate_pos, | |
| 327 INSDC:coord:len reflen, INSDC:coord:len mate_reflen, | |
| 328 ascii ref_name, ascii mate_ref_name, INSDC:coord:one read_id); | |
| 329 | |
| 330 /* get_sam_flags | |
| 331 * compute the flags that would be in a SAM file | |
| 332 * | |
| 333 * version 1 works with full Alignment databases. | |
| 334 * version 2 works with Alignment databases that have had SEQUENCE removed. | |
| 335 */ | |
| 336 extern function U32 NCBI:align:get_sam_flags #1 ( | |
| 337 INSDC:coord:len read_len, INSDC:coord:one read_id, I32 template_len, | |
| 338 bool strand, bool mate_strand, bool is_secondary, * INSDC:SRA:read_filter filter); | |
| 339 | |
| 340 extern function U32 NCBI:align:get_sam_flags #2 ( | |
| 341 I64 mate_id, INSDC:coord:one read_id, I32 template_len, | |
| 342 bool strand, bool mate_strand, bool is_secondary, * INSDC:SRA:read_filter filter) | |
| 343 = NCBI:align:get_sam_flags_2; | |
| 344 | |
| 345 /* get_left_soft_clip | |
| 346 * compute the length of the soft clip on the left edge of the alignment | |
| 347 */ | |
| 348 extern function INSDC:coord:len NCBI:align:get_left_soft_clip #1 | |
| 349 ( bool has_ref_offset, I32 ref_offset ); | |
| 350 | |
| 351 extern function INSDC:coord:len NCBI:align:get_left_soft_clip #2 | |
| 352 ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len read_len ) | |
| 353 = NCBI:align:get_left_soft_clip_2; | |
| 354 | |
| 355 /* get_right_soft_clip | |
| 356 * compute the length of the soft clip on the right edge of the alignment | |
| 357 */ | |
| 358 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #1 | |
| 359 ( bool has_mismatch, INSDC:coord:len left_clip * bool has_ref_offset ); | |
| 360 | |
| 361 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #2 | |
| 362 ( bool has_mismatch, INSDC:coord:len left_clip, bool has_ref_offset, I32 ref_offset ) | |
| 363 = NCBI:align:get_right_soft_clip_2; | |
| 364 | |
| 365 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #3 | |
| 366 ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len ) | |
| 367 = NCBI:align:get_right_soft_clip_3; | |
| 368 | |
| 369 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #4 | |
| 370 ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len read_len, INSDC:coord:len ref_len ) | |
| 371 = NCBI:align:get_right_soft_clip_4; | |
| 372 | |
| 373 extern function INSDC:coord:len NCBI:align:get_right_soft_clip #5 | |
| 374 ( bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len ) | |
| 375 = NCBI:align:get_right_soft_clip_5; | |
| 376 | |
| 377 /* get_clipped_cigar | |
| 378 * compute the CIGAR string with the soft clipping removed | |
| 379 */ | |
| 380 extern function ascii NCBI:align:get_clipped_cigar #1 ( ascii cigar ); | |
| 381 | |
| 382 extern function < type T > | |
| 383 T NCBI:align:get_clipped_cigar #2 ( ascii cigar, INSDC:coord:len cigar_len ) = NCBI:align:get_clipped_cigar_2; | |
| 384 | |
| 385 /* get_clipped_ref_offset | |
| 386 * compute the reference offsets with the soft clipping removed | |
| 387 */ | |
| 388 extern function I32 NCBI:align:get_clipped_ref_offset #1 | |
| 389 ( bool has_ref_offset, I32 ref_offset ); | |
| 390 | |
| 391 /* clip | |
| 392 * remove the soft clipped bases (or qualities, or has_mismatch, et cetera) | |
| 393 * works with things whose lengths are the same as SEQUENCE.READ | |
| 394 */ | |
| 395 extern function < type T > T NCBI:align:clip #1 | |
| 396 ( T object, INSDC:coord:len left_clip, INSDC:coord:len right_clip); | |
| 397 | |
| 398 extern function < type T > T NCBI:align:clip #2 | |
| 399 ( T object, INSDC:coord:len read_len, INSDC:coord:len left_clip, INSDC:coord:len right_clip) | |
| 400 = NCBI:align:clip_2; | |
| 401 | |
| 402 /* get_ref_len | |
| 403 * compute reference length from alignment information | |
| 404 */ | |
| 405 extern function INSDC:coord:len NCBI:align:get_ref_len #1 | |
| 406 ( bool has_ref_offset, I32 ref_offset, * INSDC:coord:len right_clip ); | |
| 407 | |
| 408 extern function INSDC:coord:len NCBI:align:get_ref_len_2 #2 | |
| 409 ( bool has_ref_offset, I32 ref_offset) | |
| 410 = NCBI:align:get_ref_len_2; | |
| 411 | |
| 412 | |
| 413 /* get_mismatch_read | |
| 414 * generate the READ with matching bases replaced with '=' | |
| 415 */ | |
| 416 extern function ascii NCBI:align:get_mismatch_read #1 | |
| 417 ( bool has_mismatch, INSDC:dna:text mismatch ); | |
| 418 | |
| 419 /* get_ref_mismatch | |
| 420 * shows mismatch positions in reference space | |
| 421 */ | |
| 422 function bool NCBI:align:get_ref_mismatch #1 | |
| 423 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, | |
| 424 INSDC:coord:len ref_len ); | |
| 425 | |
| 426 /* get_ref_insert | |
| 427 * shows positions of inserts in reference space | |
| 428 * i.e. an insert occurs between each pair of true's | |
| 429 */ | |
| 430 function bool NCBI:align:get_ref_insert #1 | |
| 431 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, | |
| 432 INSDC:coord:len ref_len ); | |
| 433 | |
| 434 /* get_ref_delete | |
| 435 * shows positions of deleted bases in reference space | |
| 436 */ | |
| 437 function bool NCBI:align:get_ref_delete #1 | |
| 438 ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, | |
| 439 INSDC:coord:len ref_len ); | |
| 440 | |
| 441 extern function INSDC:quality:phred NCBI:align:compress_quality #1 | |
| 442 ( INSDC:quality:phred quality, bool preserved ); | |
| 443 | |
| 444 extern function INSDC:quality:phred NCBI:align:decompress_quality #1 | |
| 445 < INSDC:quality:phred restored_qual_value > | |
| 446 ( INSDC:quality:phred cmp_quality, bool preserved ); | |
| 447 | |
| 448 /* make_read_start | |
| 449 * | |
| 450 */ | |
| 451 extern function INSDC:coord:zero NCBI:align:make_read_start #1 | |
| 452 (INSDC:coord:len read_len); | |
| 453 | |
| 454 /* make_cmp_read_desc | |
| 455 * determines whether an element of "operand" is aligned | |
| 456 * by looking at the corresponding element of "align_id" | |
| 457 * | |
| 458 * zeros out unaligned elements of operand, unless "invert" is true, | |
| 459 * in which case it zeros out aligned elements. | |
| 460 * | |
| 461 * "T" [ TYPE ] - type of operand | |
| 462 * | |
| 463 * "invert" [ CONST ] - if true, invert the logic of which elements | |
| 464 * to zero out. | |
| 465 * | |
| 466 * "operand" [ DATA ] - uncompressed data | |
| 467 * | |
| 468 * "align_id" [ DATA ] - indication of alignment | |
| 469 */ | |
| 470 extern function < type T > | |
| 471 T NCBI:align:make_cmp_read_desc #1 <bool invert>(T operand, I64 align_id); | |
| 472 | |
| 473 /* seq_construct_read | |
| 474 * assembles read from aligned and unaligned parts | |
| 475 */ | |
| 476 extern function < type T > | |
| 477 T NCBI:align:seq_construct_read #1 ( | |
| 478 T aligned, INSDC:coord:len aligned_read_len, | |
| 479 T unaligned, INSDC:coord:len unaligned_read_len ); | |
| 480 | |
| 481 extern function I64 NCBI:align:get_mate_align_id #1 ( I64 spot_id ); | |
| 482 | |
| 483 /*-------------------------------------------------------------------------- | |
| 484 * tables | |
| 485 */ | |
| 486 | |
| 487 | |
| 488 /* ref_block_cmn | |
| 489 * common implementation ancestor for reference block | |
| 490 */ | |
| 491 table NCBI:align:tbl:ref_block_cmn #1.0.0 | |
| 492 { | |
| 493 readonly column ascii REF_TABLE | |
| 494 = < ascii > meta:read < "CONFIG/REF_TABLE" > () | |
| 495 | < ascii > echo < 'REFERENCE' > (); | |
| 496 | |
| 497 // REF_ID is rowid in Reference Table REF_TABLE | |
| 498 extern column I64 REF_ID | |
| 499 = out_ref_id; | |
| 500 | |
| 501 // this is a redefinition of REF_START | |
| 502 // REF_START is the offset within REFERENCE.READ | |
| 503 extern column INSDC:coord:zero REF_START | |
| 504 = out_ref_start; | |
| 505 | |
| 506 // global REF_START | |
| 507 extern column U64 GLOBAL_REF_START | |
| 508 = out_global_ref_start; | |
| 509 | |
| 510 // REF_LEN the length of a read projection on reference | |
| 511 INSDC:coord:len out_ref_len_internal | |
| 512 = NCBI:align:get_ref_len_2 ( out_has_ref_offset, out_ref_offset ) | |
| 513 | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset ); | |
| 514 | |
| 515 INSDC:coord:len out_ref_len | |
| 516 = .REF_LEN | |
| 517 /* | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset, out_right_clip ) */ | |
| 518 | out_ref_len_internal; | |
| 519 | |
| 520 physical column < INSDC:coord:len > izip_encoding .REF_LEN = REF_LEN; | |
| 521 extern column INSDC:coord:len REF_LEN = out_ref_len; | |
| 522 | |
| 523 // REF_ORIENTATION - relative orientation of original raw read to the reference | |
| 524 // false -> same orientation, true -> opposite orientation | |
| 525 // alignment and reference are always in the same orientation | |
| 526 extern column bool_encoding REF_ORIENTATION; | |
| 527 | |
| 528 // REF_PLOIDY | |
| 529 extern column < U32 > izip_encoding REF_PLOIDY; | |
| 530 | |
| 531 /* REF_POS | |
| 532 * per PLOIDY | |
| 533 */ | |
| 534 readonly column INSDC:coord:zero REF_POS | |
| 535 = NCBI:align:ref_pos ( out_ref_id, out_ref_start ); | |
| 536 | |
| 537 /* REF_NAME | |
| 538 * the name of the reference | |
| 539 */ | |
| 540 readonly column ascii REF_NAME | |
| 541 = NCBI:align:ref_name ( out_ref_id ); | |
| 542 | |
| 543 /* REF_SEQ_ID | |
| 544 */ | |
| 545 readonly column ascii REF_SEQ_ID | |
| 546 = NCBI:align:ref_seq_id ( out_ref_id ) | |
| 547 | < ascii > echo < '' > (); | |
| 548 }; | |
| 549 | |
| 550 | |
| 551 /* global_ref_block | |
| 552 * reference block favoring global ref-start | |
| 553 */ | |
| 554 table NCBI:align:tbl:global_ref_block #1.0.0 | |
| 555 = NCBI:align:tbl:ref_block_cmn #1.0.0 | |
| 556 { | |
| 557 U64 out_global_ref_start = .GLOBAL_REF_START; | |
| 558 physical < U64 > izip_encoding .GLOBAL_REF_START = GLOBAL_REF_START; | |
| 559 | |
| 560 I64 out_ref_id = NCBI:align:local_ref_id ( .GLOBAL_REF_START ); | |
| 561 INSDC:coord:zero out_ref_start = NCBI:align:local_ref_start ( .GLOBAL_REF_START ); | |
| 562 }; | |
| 563 | |
| 564 | |
| 565 /* local_ref_block | |
| 566 * reference block favoring local ref-start | |
| 567 */ | |
| 568 table NCBI:align:tbl:local_ref_block #1.0.0 | |
| 569 = NCBI:align:tbl:ref_block_cmn #1.0.0 | |
| 570 { | |
| 571 I64 out_ref_id = .REF_ID; | |
| 572 physical < I64 > izip_encoding .REF_ID = REF_ID; | |
| 573 | |
| 574 INSDC:coord:zero out_ref_start = .REF_START; | |
| 575 physical < INSDC:coord:zero > izip_encoding .REF_START = REF_START; | |
| 576 }; | |
| 577 | |
| 578 | |
| 579 /* align_cmn | |
| 580 * common interface and implementation for alignment object | |
| 581 * | |
| 582 * History: | |
| 583 * 2.1 - added REF_OFFSET_TYPE and RNA_ORIENTATION columns | |
| 584 * updated all cigar calculations | |
| 585 */ | |
| 586 table NCBI:align:tbl:align_cmn #2.1 | |
| 587 = NCBI:tbl:base_space_common #1.0.3 | |
| 588 , NCBI:SRA:tbl:stats #1.2.0 | |
| 589 , NCBI:align:tbl:ref_block_cmn #1.0.0 | |
| 590 { | |
| 591 bool is_secondary = out_is_secondary; | |
| 592 // temporary key | |
| 593 extern column < U32 > izip_encoding TMP_KEY_ID; | |
| 594 | |
| 595 extern column <ascii> zip_encoding LINKAGE_GROUP; | |
| 596 | |
| 597 | |
| 598 /* Raw Sequence Block */ | |
| 599 // Points to sequence table, which may contain more information about the raw sequence. | |
| 600 // row id in SEQUENCE table; 0 if not linked | |
| 601 extern column < I64 > izip_encoding SEQ_SPOT_ID; | |
| 602 | |
| 603 // read number in SEQUENCE table; { SEQ_SPOT_ID, SEQ_READ_ID } is the unique link to the sequence | |
| 604 extern column < INSDC:coord:one > izip_encoding SEQ_READ_ID; | |
| 605 | |
| 606 | |
| 607 /* Soft-Clipped data block */ | |
| 608 | |
| 609 readonly column INSDC:coord:len LEFT_SOFT_CLIP | |
| 610 = NCBI:align:get_left_soft_clip ( HAS_REF_OFFSET, REF_OFFSET, out_read_len ); | |
| 611 | |
| 612 INSDC:coord:len out_right_clip | |
| 613 = NCBI:align:get_right_soft_clip #5 ( out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len ) | |
| 614 | NCBI:align:get_right_soft_clip #4 ( out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len ) | |
| 615 | NCBI:align:get_right_soft_clip #3 ( out_has_ref_offset, out_ref_offset, out_ref_len ) | |
| 616 | NCBI:align:get_right_soft_clip #2 ( out_has_mismatch, LEFT_SOFT_CLIP, out_has_ref_offset, out_ref_offset ); | |
| 617 readonly column INSDC:coord:len RIGHT_SOFT_CLIP = out_right_clip; | |
| 618 | |
| 619 readonly column ascii CLIPPED_CIGAR_LONG | |
| 620 = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN ); | |
| 621 | |
| 622 readonly column INSDC:coord:len CLIPPED_CIGAR_LONG_LEN | |
| 623 = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN ); | |
| 624 | |
| 625 readonly column ascii CLIPPED_CIGAR_SHORT | |
| 626 = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN ); | |
| 627 | |
| 628 readonly column INSDC:coord:len CLIPPED_CIGAR_SHORT_LEN | |
| 629 = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN ); | |
| 630 | |
| 631 bool out_clipped_has_mismatch | |
| 632 = < bool > NCBI:align:clip (out_has_mismatch, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
| 633 | |
| 634 readonly column ascii CLIPPED_HAS_MISMATCH | |
| 635 = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_mismatch ); | |
| 636 | |
| 637 readonly column bool CLIPPED_HAS_MISMATCH = out_clipped_has_mismatch; | |
| 638 | |
| 639 bool out_clipped_has_ref_offset | |
| 640 = < bool > NCBI:align:clip (HAS_REF_OFFSET, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
| 641 | |
| 642 readonly column ascii CLIPPED_HAS_REF_OFFSET | |
| 643 = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_ref_offset ); | |
| 644 | |
| 645 readonly column bool CLIPPED_HAS_REF_OFFSET = out_clipped_has_ref_offset; | |
| 646 | |
| 647 // TBD cannot be computed correctly unless both HAS_MISMATCH and READ_LEN are used | |
| 648 readonly column INSDC:dna:text CLIPPED_MISMATCH | |
| 649 = < INSDC:dna:text > NCBI:align:clip #1 ( out_mismatch_dna_text, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
| 650 | |
| 651 readonly column I32 CLIPPED_REF_OFFSET | |
| 652 = NCBI:align:get_clipped_ref_offset ( HAS_REF_OFFSET, REF_OFFSET ); | |
| 653 | |
| 654 readonly column INSDC:quality:phred CLIPPED_QUALITY | |
| 655 = < INSDC:quality:phred > NCBI:align:clip (out_qual_phred, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
| 656 | |
| 657 readonly column INSDC:dna:text CLIPPED_READ | |
| 658 = < INSDC:dna:text > NCBI:align:clip (READ, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); | |
| 659 | |
| 660 /* Sequence Block */ | |
| 661 | |
| 662 extern column < NCBI:align:ploidy > izip_encoding PLOIDY; | |
| 663 | |
| 664 // Number of reads per spot; corresponds to the number of alternative alignments | |
| 665 // all alternative alignments are computed against the same reference region | |
| 666 U32 out_nreads | |
| 667 = .PLOIDY | |
| 668 | < U32 > echo < 1 > (); | |
| 669 | |
| 670 // READ_START and READ_LEN are position and length of the sequence | |
| 671 physical < INSDC:coord:zero > izip_encoding .READ_START = READ_START; | |
| 672 INSDC:coord:zero out_read_start | |
| 673 = .READ_START | |
| 674 | < INSDC:coord:zero > echo < 0 > (); | |
| 675 | |
| 676 physical < INSDC:coord:len > izip_encoding .READ_LEN = READ_LEN; | |
| 677 | |
| 678 INSDC:coord:len align_spot_len = ( INSDC:coord:len ) row_len ( out_has_ref_offset ); | |
| 679 INSDC:coord:len out_read_len | |
| 680 = .READ_LEN | |
| 681 | align_spot_len; | |
| 682 | |
| 683 // associated qualities | |
| 684 extern column INSDC:quality:phred CMP_QUALITY | |
| 685 = .CMP_QUALITY | |
| 686 | out_cmp_quality; | |
| 687 physical column < INSDC:quality:phred > zip_encoding .CMP_QUALITY = CMP_QUALITY; | |
| 688 | |
| 689 INSDC:quality:phred out_raw_qual = < INSDC:quality:phred > | |
| 690 NCBI:align:project_from_sequence < '( INSDC:quality:phred ) QUALITY'> ( .SEQ_SPOT_ID, .SEQ_READ_ID ); | |
| 691 INSDC:quality:phred out_qual_phred | |
| 692 = NCBI:align:raw_restore_qual ( out_raw_qual, .REF_ORIENTATION ) | |
| 693 | < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); | |
| 694 readonly column INSDC:quality:text:phred_33 SAM_QUALITY = QUALITY ; | |
| 695 | |
| 696 // project read group and name | |
| 697 ascii out_spot_group = < ascii > simple_sub_select < 'SEQUENCE','SPOT_GROUP'> (.SEQ_SPOT_ID); | |
| 698 | |
| 699 | |
| 700 INSDC:SRA:spotid_t tmp_seq_spot_id | |
| 701 = cast ( .SEQ_SPOT_ID ) | |
| 702 ; | |
| 703 physical <ascii> zip_encoding .SEQ_NAME = SEQ_NAME; | |
| 704 extern column ascii SEQ_NAME | |
| 705 = .SEQ_NAME | |
| 706 | < ascii > simple_sub_select < 'SEQUENCE','NAME'> (.SEQ_SPOT_ID) | |
| 707 | sprintf < "%u" > ( tmp_seq_spot_id ); | |
| 708 | |
| 709 // compute sam flags | |
| 710 /* blows up parser: starts at schema-tbl.c:2138 | |
| 711 readonly column U32 SAM_FLAGS = NCBI:align:get_sam_flags(MATE_ALIGN_ID, | |
| 712 .SEQ_READ_ID, out_template_len, REF_ORIENTATION, | |
| 713 out_mate_ref_orientation, is_secondary); | |
| 714 */ | |
| 715 INSDC:coord:len projected_read_len | |
| 716 = < INSDC:coord:len > simple_sub_select < 'SEQUENCE', 'READ_LEN' > ( .SEQ_SPOT_ID ); | |
| 717 | |
| 718 readonly column U32 SAM_FLAGS | |
| 719 = NCBI:align:get_sam_flags #1 (projected_read_len, | |
| 720 .SEQ_READ_ID, out_template_len, REF_ORIENTATION, | |
| 721 out_mate_ref_orientation, is_secondary, out_rd_filter) | |
| 722 | NCBI:align:get_sam_flags #2 (out_mate_align_id, | |
| 723 .SEQ_READ_ID, out_template_len, REF_ORIENTATION, | |
| 724 out_mate_ref_orientation, is_secondary, out_rd_filter); | |
| 725 | |
| 726 ascii out_name_fmt = < ascii > echo < '$R' > (); | |
| 727 | |
| 728 INSDC:coord:zero trim_start | |
| 729 = < INSDC:coord:zero > echo < 0 > (); | |
| 730 INSDC:coord:len trim_len | |
| 731 = align_spot_len; | |
| 732 | |
| 733 ascii out_label | |
| 734 = .LABEL | |
| 735 | < ascii > echo < "ploidy1" > (); | |
| 736 INSDC:coord:zero out_label_start | |
| 737 = .LABEL_START | |
| 738 | < INSDC:coord:zero > echo < 0 > (); | |
| 739 INSDC:coord:len out_label_len | |
| 740 = .LABEL_LEN | |
| 741 | < INSDC:coord:len > echo < 7 > (); | |
| 742 | |
| 743 physical < INSDC:SRA:read_filter > zip_encoding .RD_FILTER = READ_FILTER; | |
| 744 INSDC:SRA:read_filter out_rd_filter | |
| 745 = .RD_FILTER | |
| 746 | < INSDC:SRA:read_filter > NCBI:align:project_from_sequence < 'READ_FILTER' > ( .SEQ_SPOT_ID, .SEQ_READ_ID ) | |
| 747 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ( out_read_len ); | |
| 748 | |
| 749 INSDC:SRA:platform_id out_platform | |
| 750 = .PLATFORM | |
| 751 | < INSDC:SRA:platform_id > simple_sub_select < 'SEQUENCE','PLATFORM'> (.SEQ_SPOT_ID) | |
| 752 | < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > (); | |
| 753 | |
| 754 U8 out_alignment_count = <U8> NCBI:align:project_from_sequence < 'ALIGNMENT_COUNT' > ( .SEQ_SPOT_ID, .SEQ_READ_ID ); | |
| 755 | |
| 756 /* out_read_type | |
| 757 * set to SRA_READ_TYPE_FORWARD + SRA_READ_TYPE_BIOLOGICAL | |
| 758 * which has a constant value of 3 | |
| 759 */ | |
| 760 INSDC:SRA:xread_type out_read_type | |
| 761 = < INSDC:SRA:xread_type > echo < 3 > ( out_read_len ); | |
| 762 | |
| 763 // stats inputs | |
| 764 bool in_stats_bin = HAS_REF_OFFSET; | |
| 765 | |
| 766 INSDC:coord:len _alt_in_read_len | |
| 767 = READ_LEN | |
| 768 | ( INSDC:coord:len ) row_len #1 ( HAS_REF_OFFSET ); | |
| 769 | |
| 770 INSDC:SRA:xread_type _alt_in_read_type | |
| 771 = READ_TYPE | |
| 772 | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (_alt_in_read_len); | |
| 773 | |
| 774 readonly column ascii MISMATCH_READ | |
| 775 = NCBI:align:get_mismatch_read ( out_has_mismatch, out_mismatch_dna_text ); | |
| 776 | |
| 777 /* Alignment block */ | |
| 778 | |
| 779 // MAPQ - single value quality of the mapping; the scale is submitter specific | |
| 780 extern column < I32 > izip_encoding MAPQ; | |
| 781 | |
| 782 extern column INSDC:coord:zero MATE_REF_POS = out_mate_ref_pos; | |
| 783 extern column INSDC:coord:len MATE_REF_LEN = out_mate_ref_len; | |
| 784 extern column I64 MATE_REF_ID = out_mate_ref_id; | |
| 785 extern column I32 TEMPLATE_LEN = out_template_len; | |
| 786 extern column bool MATE_REF_ORIENTATION = out_mate_ref_orientation; | |
| 787 readonly column ascii MATE_REF_NAME = NCBI:align:ref_name ( out_mate_ref_id ); | |
| 788 readonly column ascii MATE_REF_SEQ_ID = NCBI:align:ref_seq_id( out_mate_ref_id ); | |
| 789 readonly column U8 ALIGNMENT_COUNT = out_alignment_count; | |
| 790 | |
| 791 | |
| 792 /******************************** | |
| 793 * Columns representing CIGARs | |
| 794 ********************************/ | |
| 795 | |
| 796 | |
| 797 // one value per base i.e. length is same as sum of READ_LEN | |
| 798 // partitioned by READ_START and READ_LEN into alternative alignments | |
| 799 // flags the shifts in reference position preceding the base | |
| 800 // if sequence of a partitioned read starts with a ref_offset and one or more mismatches | |
| 801 // then it represents a left soft clip | |
| 802 // any run of mismatches at the end represents a right soft clip | |
| 803 | |
| 804 readonly column ascii HAS_REF_OFFSET = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_ref_offset ); | |
| 805 extern column bool_encoding HAS_REF_OFFSET; | |
| 806 bool out_has_ref_offset = .HAS_REF_OFFSET; | |
| 807 | |
| 808 // has number of elements equal to number of true elements in HAS_REF_OFFSET | |
| 809 extern column < I32 > izip_encoding REF_OFFSET; | |
| 810 I32 out_ref_offset = .REF_OFFSET; | |
| 811 | |
| 812 // the type of offset recorded in REF_OFFSET | |
| 813 extern column < NCBI:align:ro_type > izip_encoding REF_OFFSET_TYPE; | |
| 814 NCBI:align:ro_type out_ro_type = .REF_OFFSET_TYPE; | |
| 815 | |
| 816 // DISPLAY Columns | |
| 817 | |
| 818 readonly column I64 ALIGN_ID = row_id (); | |
| 819 | |
| 820 // get projection of the reference | |
| 821 readonly column INSDC:dna:text REF_READ | |
| 822 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( REF_READ ); | |
| 823 | |
| 824 readonly column INSDC:4na:bin REF_READ | |
| 825 = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len, .REF_PLOIDY) | |
| 826 | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len ); | |
| 827 | |
| 828 INSDC:4na:bin ref_read_internal | |
| 829 = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal, .REF_PLOIDY) | |
| 830 | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal); | |
| 831 | |
| 832 // text forms of reads | |
| 833 INSDC:dna:text out_dna_text | |
| 834 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); | |
| 835 readonly column INSDC:dna:text RAW_READ | |
| 836 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_raw_read ); | |
| 837 readonly column INSDC:4na:bin RAW_READ | |
| 838 = out_raw_read; | |
| 839 | |
| 840 // CIGARs | |
| 841 readonly column ascii CIGAR_LONG | |
| 842 = < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
| 843 | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
| 844 | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
| 845 ; | |
| 846 readonly column INSDC:coord:len CIGAR_LONG_LEN | |
| 847 = < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
| 848 | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
| 849 | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
| 850 ; | |
| 851 readonly column ascii CIGAR_SHORT | |
| 852 = < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
| 853 | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
| 854 | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
| 855 ; | |
| 856 readonly column INSDC:coord:len CIGAR_SHORT_LEN | |
| 857 = < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) | |
| 858 | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) | |
| 859 | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) | |
| 860 ; | |
| 861 | |
| 862 readonly column ascii RNA_ORIENTATION | |
| 863 = NCBI:align:rna_orientation ( out_ro_type ) | |
| 864 ; | |
| 865 | |
| 866 readonly column U32 EDIT_DISTANCE | |
| 867 = NCBI:align:edit_distance #3 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len) | |
| 868 | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len, out_read_len) | |
| 869 | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len) | |
| 870 | NCBI:align:edit_distance #1 (out_has_mismatch, out_has_ref_offset, out_ref_offset); | |
| 871 | |
| 872 readonly column ascii HAS_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_mismatch ); | |
| 873 | |
| 874 // needed for backward compatibility | |
| 875 readonly column ascii SEQ_SPOT_GROUP = out_spot_group; | |
| 876 | |
| 877 | |
| 878 /* These columns are purely informational. */ | |
| 879 bool out_ref_mismatch = NCBI:align:get_ref_mismatch ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); | |
| 880 readonly column ascii REF_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_mismatch ); | |
| 881 readonly column bool REF_MISMATCH = out_ref_mismatch; | |
| 882 | |
| 883 bool out_ref_insert = NCBI:align:get_ref_insert ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); | |
| 884 readonly column ascii REF_INSERT = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_insert ); | |
| 885 readonly column bool REF_INSERT = out_ref_insert; | |
| 886 | |
| 887 bool out_ref_delete = NCBI:align:get_ref_delete ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); | |
| 888 readonly column ascii REF_DELETE = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_delete ); | |
| 889 readonly column bool REF_DELETE = out_ref_delete; | |
| 890 | |
| 891 }; | |
| 892 | |
| 893 | |
| 894 /* align_full | |
| 895 * aligns externally stored sequence against reference | |
| 896 * alignment transcript is calculated | |
| 897 * | |
| 898 * History: | |
| 899 * 1.1 - respond to changes in base table | |
| 900 */ | |
| 901 table NCBI:align:tbl:align_full #1.1 | |
| 902 = NCBI:align:tbl:align_cmn #2.1 | |
| 903 { | |
| 904 bool out_is_secondary = <bool> echo < true > (); | |
| 905 // restore reads to its raw form (orientation is restored) | |
| 906 | |
| 907 INSDC:4na:bin out_raw_read | |
| 908 = < INSDC:4na:bin > simple_sub_select < 'PRIMARY_ALIGNMENT', '( INSDC:4na:bin ) RAW_READ' > (.PRIMARY_ALIGNMENT_ID) | |
| 909 | < INSDC:4na:bin > NCBI:align:project_from_sequence < '( INSDC:4na:bin ) READ'> ( .SEQ_SPOT_ID, .SEQ_READ_ID ); | |
| 910 | |
| 911 INSDC:4na:bin out_4na_bin | |
| 912 = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset, .READ_LEN ) | |
| 913 | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset ) | |
| 914 | NCBI:align:raw_restore_read ( out_raw_read, .REF_ORIENTATION ); | |
| 915 | |
| 916 | |
| 917 // flags mismatches with the reference | |
| 918 // produced by actual comparison of REF_READ and READ | |
| 919 // TMP_HAS_MISMATCH is a hack to speed up retrieval during coverage recalculation | |
| 920 column bool_encoding TMP_HAS_MISMATCH; | |
| 921 bool out_has_mismatch | |
| 922 = .TMP_HAS_MISMATCH | |
| 923 | NCBI:align:generate_has_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset ); | |
| 924 readonly column bool HAS_MISMATCH = out_has_mismatch; | |
| 925 | |
| 926 INSDC:4na:bin out_mismatch_4na_bin | |
| 927 = NCBI:align:generate_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset ); | |
| 928 | |
| 929 INSDC:4na:bin tmp_out_mismatch_4na_bin = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( .TMP_MISMATCH ); | |
| 930 | |
| 931 // temporary column for reference coverage calculation | |
| 932 column < INSDC:dna:text> zip_encoding TMP_MISMATCH; | |
| 933 | |
| 934 INSDC:dna:text out_mismatch_dna_text | |
| 935 = .TMP_MISMATCH | |
| 936 | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin ); | |
| 937 | |
| 938 readonly column INSDC:dna:text MISMATCH = out_mismatch_dna_text; | |
| 939 readonly column INSDC:4na:bin MISMATCH = out_mismatch_4na_bin; | |
| 940 | |
| 941 physical column < INSDC:coord:zero > izip_encoding .MATE_REF_POS = MATE_REF_POS; | |
| 942 INSDC:coord:zero out_mate_ref_pos = .MATE_REF_POS | |
| 943 | < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID); | |
| 944 | |
| 945 physical column < I64 > izip_encoding .MATE_REF_ID = MATE_REF_ID; | |
| 946 I64 out_mate_ref_id = .MATE_REF_ID | |
| 947 | < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID); | |
| 948 | |
| 949 INSDC:coord:len out_mate_ref_len = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID); | |
| 950 physical column < I32 > izip_encoding .TEMPLATE_LEN = TEMPLATE_LEN; | |
| 951 I32 out_template_len = .TEMPLATE_LEN | |
| 952 | NCBI:align:template_len(REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID); | |
| 953 | |
| 954 physical column < bool > izip_encoding .MATE_REF_ORIENTATION = MATE_REF_ORIENTATION; | |
| 955 bool out_mate_ref_orientation = .MATE_REF_ORIENTATION | |
| 956 | < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID); | |
| 957 | |
| 958 I64 out_mate_align_id = .MATE_ALIGN_ID; | |
| 959 physical column <I64> izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID; | |
| 960 extern column I64 MATE_ALIGN_ID = out_mate_align_id; | |
| 961 | |
| 962 physical column < I64 > izip_encoding .PRIMARY_ALIGNMENT_ID = PRIMARY_ALIGNMENT_ID; | |
| 963 | |
| 964 I32 read_idx = <I32> cast (.SEQ_READ_ID); | |
| 965 extern column I64 PRIMARY_ALIGNMENT_ID | |
| 966 = .PRIMARY_ALIGNMENT_ID | |
| 967 | <I64> simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID' > (.SEQ_SPOT_ID,.SEQ_READ_ID); | |
| 968 | |
| 969 }; | |
| 970 | |
| 971 | |
| 972 /* compressed_by_reference | |
| 973 * aligns internally represented sequence against reference | |
| 974 * alignment transcript is stored | |
| 975 * original sequence is reconstructed | |
| 976 * | |
| 977 * History: | |
| 978 * 1.2 - respond to changes in base table | |
| 979 */ | |
| 980 table NCBI:align:tbl:compressed_by_reference #1.2 | |
| 981 = NCBI:align:tbl:align_cmn #2.1 | |
| 982 { | |
| 983 bool out_is_secondary = <bool> echo < false > (); | |
| 984 | |
| 985 // one value per base i.e. length is same as sum of READ_LEN | |
| 986 // partitioned by READ_START and READ_LEN into alternative alignments | |
| 987 // flags mismatches with the reference | |
| 988 extern default column bool_encoding HAS_MISMATCH; | |
| 989 bool out_has_mismatch = .HAS_MISMATCH; | |
| 990 | |
| 991 // has number of elements equal to number of true elements in HAS_MISMATCH | |
| 992 extern column INSDC:dna:text MISMATCH | |
| 993 { | |
| 994 read = out_mismatch_dna_text; | |
| 995 validate = < INSDC:dna:text > compare ( in_mismatch_dna_text, out_mismatch_dna_text ); | |
| 996 } | |
| 997 | |
| 998 INSDC:dna:text in_mismatch_dna_text | |
| 999 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( MISMATCH ); | |
| 1000 | |
| 1001 INSDC:4na:bin in_mismatch_4na_bin | |
| 1002 = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_mismatch_dna_text ); | |
| 1003 | |
| 1004 extern column < ascii > zip_encoding ALIGN_GROUP; | |
| 1005 | |
| 1006 physical column < INSDC:4na:bin > zip_encoding .MISMATCH = in_mismatch_4na_bin; | |
| 1007 | |
| 1008 INSDC:4na:bin out_mismatch_4na_bin = .MISMATCH; | |
| 1009 INSDC:dna:text out_mismatch_dna_text | |
| 1010 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin ); | |
| 1011 | |
| 1012 I64 out_mate_align_id | |
| 1013 = .MATE_ALIGN_ID | |
| 1014 | NCBI:align:get_mate_align_id (.SEQ_SPOT_ID); | |
| 1015 | |
| 1016 physical column <I64> izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID; | |
| 1017 extern column I64 MATE_ALIGN_ID = out_mate_align_id; | |
| 1018 | |
| 1019 // restore reads from alignment columns and the reference | |
| 1020 // optional .READ_LEN size defines PLOIDY | |
| 1021 INSDC:4na:bin out_4na_bin | |
| 1022 = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset, .READ_LEN ) | |
| 1023 | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset ); | |
| 1024 | |
| 1025 // restore reads to its raw form (orientation is restored) | |
| 1026 INSDC:4na:bin out_raw_read = NCBI:align:raw_restore_read (out_4na_bin,.REF_ORIENTATION); | |
| 1027 | |
| 1028 I64 primary_align_pair = < I64 > simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID'> (.SEQ_SPOT_ID); | |
| 1029 I64 out_mate_ref_id = < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID); | |
| 1030 bool out_mate_ref_orientation = < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID); | |
| 1031 INSDC:coord:zero out_mate_ref_pos = < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID); | |
| 1032 INSDC:coord:len out_mate_ref_len = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID); | |
| 1033 readonly column U32 MATE_EDIT_DISTANCE = < U32 > simple_sub_select < '','EDIT_DISTANCE'> (MATE_ALIGN_ID); | |
| 1034 readonly column ascii MATE_CIGAR_LONG = < ascii > simple_sub_select < '','CIGAR_LONG'> (MATE_ALIGN_ID); | |
| 1035 readonly column ascii MATE_CIGAR_SHORT = < ascii > simple_sub_select < '','CIGAR_SHORT'> (MATE_ALIGN_ID); | |
| 1036 readonly column INSDC:coord:len MATE_CIGAR_LONG_LEN = < INSDC:coord:len > simple_sub_select < '','CIGAR_LONG_LEN'> (MATE_ALIGN_ID); | |
| 1037 readonly column INSDC:coord:len MATE_CIGAR_SHORT_LEN = < INSDC:coord:len > simple_sub_select < '','CIGAR_SHORT_LEN'> (MATE_ALIGN_ID); | |
| 1038 | |
| 1039 I32 out_template_len = NCBI:align:template_len (REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID); | |
| 1040 }; | |
| 1041 | |
| 1042 | |
| 1043 /* align_sorted | |
| 1044 * deflated alignment data sorted against reference | |
| 1045 * | |
| 1046 * History: | |
| 1047 * 1.2 - respond to changes in base table | |
| 1048 */ | |
| 1049 table NCBI:align:tbl:align_sorted #1.2 | |
| 1050 = NCBI:align:tbl:compressed_by_reference #1.2 | |
| 1051 , NCBI:align:tbl:global_ref_block #1.0.0 | |
| 1052 { | |
| 1053 // adds no columns of its own; only sets the default column limit to 128K (131072 = 128 * 1024) | |
| 1054 column default limit = 131072; | |
| 1055 }; | |
| 1056 | |
| 1057 | |
| 1058 /* align_unsorted | |
| 1059 * deflated alignment unsorted data | |
| 1060 * | |
| 1061 * History: | |
| 1062 * 1.2 - respond to changes in base table | |
| 1063 */ | |
| 1064 table NCBI:align:tbl:align_unsorted #1.2 | |
| 1065 = NCBI:align:tbl:compressed_by_reference #1.2 | |
| 1066 , NCBI:align:tbl:local_ref_block #1.0.0 | |
| 1067 { | |
| 1068 // adds no columns of its own; only sets the default column limit to 128K (131072 = 128 * 1024) | |
| 1069 column default limit = 131072; | |
| 1070 }; | |
| 1071 | |
| 1072 | |
| 1073 /* align_mate_sorted | |
| 1074 * | |
| 1075 * History: | |
| 1076 * 1.1 - respond to changes in base table | |
| 1077 */ | |
| 1078 table NCBI:align:tbl:align_mate_sorted #1.1 | |
| 1079 = NCBI:align:tbl:align_full #1.1 | |
| 1080 , NCBI:align:tbl:global_ref_block #1.0.0 | |
| 1081 { | |
| 1082 // adds no columns of its own; only sets the default column limit to 128K (131072 = 128 * 1024) | |
| 1083 column default limit = 131072; | |
| 1084 }; | |
| 1085 | |
| 1086 | |
| 1087 /* align_mate_unsorted | |
| 1088 * | |
| 1089 * History: | |
| 1090 * 1.1 - respond to changes in base table | |
| 1091 */ | |
| 1092 table NCBI:align:tbl:align_mate_unsorted #1.1 | |
| 1093 = NCBI:align:tbl:align_full #1.1 | |
| 1094 , NCBI:align:tbl:local_ref_block #1.0.0 | |
| 1095 { | |
| 1096 // adds no columns of its own; only sets the default column limit to 128K (131072 = 128 * 1024) | |
| 1097 column default limit = 131072; | |
| 1098 }; | |
| 1099 | |
| 1100 /* align_allele | |
| 1101 * alleles coverage extension | |
| 1102 * | |
| 1103 * History: | |
| 1104 * 1.2 - respond to changes in base table | |
| 1105 */ | |
| 1106 table NCBI:align:tbl:align_allele #1.2 | |
| 1107 = NCBI:align:tbl:align_unsorted #1.2 | |
| 1108 { | |
| 1109 extern column < I64 > izip_encoding EVIDENCE_ALIGNMENT_IDS; // the only column this table adds on top of align_unsorted | |
| 1110 | |
| 1111 /* disabled production, kept for reference: constant quality 30 echoed over out_4na_bin | |
| 1112 INSDC:quality:phred out_qual_phred | |
| 1113 = < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); | |
| 1114 */ | |
| 1115 }; | |
| 1116 | |
| 1117 /*-------------------------------------------------------------------------- | |
| 1118 * seq | |
| 1119 * alignment sequence table | |
| 1120 */ | |
| 1121 physical | |
| 1122 I64 NCBI:align:sorted:alignment_id_encoding #1.0 | |
| 1123 { | |
| 1124 decode // read path: iunzip first, then outlier_decode < 0 > (the encode steps in reverse order) | |
| 1125 { | |
| 1126 I64 outliers_removed = iunzip ( @ ); | |
| 1127 return < I64 > outlier_decode < 0 > ( outliers_removed ); | |
| 1128 } | |
| 1129 | |
| 1130 encode // write path: outlier_encode < 0 > first, then izip the result | |
| 1131 { | |
| 1132 I64 outliers_removed = < I64 > outlier_encode < 0 > ( @ ); | |
| 1133 return izip ( outliers_removed ); | |
| 1134 } | |
| 1135 } | |
| 1136 | |
| 1137 | |
| 1138 table NCBI:align:tbl:seq #1.1 = | |
| 1139 NCBI:tbl:base_space #2.0.3, | |
| 1140 NCBI:tbl:phred_quality #2.0.4, | |
| 1141 NCBI:align:tbl:cmp_base_space #1, | |
| 1142 NCBI:SRA:tbl:spotdesc #1.0.2, | |
| 1143 NCBI:SRA:tbl:stats #1.2.0 | |
| 1144 { | |
| 1145 // default column limit: 128K (131072 = 128 * 1024) | |
| 1146 column default limit = 131072; | |
| 1147 | |
| 1148 // gets primary record in alignment table (size of column is NREADS) | |
| 1149 // if sorted - should use a special encoding (presumably NCBI:align:sorted:alignment_id_encoding - TODO confirm) | |
| 1150 extern column <I64> izip_encoding PRIMARY_ALIGNMENT_ID; | |
| 1151 | |
| 1152 INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > (); // trim window starts at 0 ... | |
| 1153 INSDC:coord:len trim_len = _spot_len; // ... and its length is _spot_len | |
| 1154 | |
| 1155 // size is NREADS | |
| 1156 extern column < U8 > zip_encoding ALIGNMENT_COUNT; | |
| 1157 | |
| 1158 // auto-generate name from row-id | |
| 1159 ascii out_name_fmt = < ascii > echo < '$R' > (); | |
| 1160 | |
| 1161 // temporary column | |
| 1162 extern column < U64 > izip_encoding TMP_KEY_ID; | |
| 1163 | |
| 1164 // restored READ: built by seq_restore_read from out_cmp_4na_bin plus the stored PRIMARY_ALIGNMENT_ID, READ_LEN and READ_TYPE | |
| 1165 INSDC:4na:bin out_dcmp_4na_bin | |
| 1166 = NCBI:align:seq_restore_read (out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE); | |
| 1167 | |
| 1168 extern column < U64 > izip_encoding TI; | |
| 1169 | |
| 1170 extern column <ascii> zip_encoding CMP_LINKAGE_GROUP; | |
| 1171 | |
| 1172 // restored LINKAGE_GROUP: seq_restore_linkage_group first, the stored .CMP_LINKAGE_GROUP as the second alternative | |
| 1173 readonly column ascii LINKAGE_GROUP = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID) | |
| 1174 | .CMP_LINKAGE_GROUP; | |
| 1175 }; | |
| 1176 | |
| 1177 | |
| 1178 table NCBI:align:tbl:cs_seq #1.2 | |
| 1179 { | |
| 1180 /* writable columns */ | |
| 1181 extern column INSDC:color:text CMP_CSREAD | |
| 1182 = out_cmp_color_text | |
| 1183 ; | |
| 1184 | |
| 1185 extern column < INSDC:dna:text > zip_encoding CS_KEY; | |
| 1186 | |
| 1187 extern default column < INSDC:quality:phred > zip_encoding QUALITY; | |
| 1188 | |
| 1189 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID; | |
| 1190 | |
| 1191 extern column < U8 > zip_encoding ALIGNMENT_COUNT; | |
| 1192 | |
| 1193 extern column < INSDC:SRA:platform_id > zip_encoding PLATFORM; | |
| 1194 | |
| 1195 extern column < ascii > zip_encoding LABEL; | |
| 1196 extern column < INSDC:coord:zero > izip_encoding LABEL_START; | |
| 1197 extern column < INSDC:coord:len > izip_encoding LABEL_LEN; | |
| 1198 | |
| 1199 extern column < INSDC:SRA:xread_type > zip_encoding READ_TYPE; | |
| 1200 extern column < INSDC:coord:zero > izip_encoding READ_START; | |
| 1201 extern column < INSDC:coord:len > izip_encoding READ_LEN; | |
| 1202 extern column < INSDC:SRA:read_filter > zip_encoding READ_FILTER; | |
| 1203 | |
| 1204 extern column < U64 > izip_encoding TMP_KEY_ID; | |
| 1205 | |
| 1206 extern column < ascii > zip_encoding SPOT_GROUP; | |
| 1207 | |
| 1208 extern column < U64 > izip_encoding TI; | |
| 1209 | |
| 1210 /* writing rules */ | |
| 1211 INSDC:x2cs:bin in_cmp_x2cs_bin | |
| 1212 = < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( CMP_CSREAD ) | |
| 1213 ; | |
| 1214 INSDC:2cs:bin in_cmp_2cs_bin | |
| 1215 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin ) | |
| 1216 ; | |
| 1217 INSDC:x2cs:bin in_cmp_alt_x2cs_bin | |
| 1218 = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin ) | |
| 1219 ; | |
| 1220 physical column INSDC:2cs:packed .CMP_CSREAD | |
| 1221 = ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin ) | |
| 1222 ; | |
| 1223 physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD | |
| 1224 = < INSDC:x2cs:bin > trim < ALIGN_LEFT, 0 > ( in_cmp_alt_x2cs_bin ) | |
| 1225 ; | |
| 1226 | |
| 1227 /* reading rules */ | |
| 1228 INSDC:2cs:packed phys_cmp_2cs_packed | |
| 1229 = .CMP_CSREAD | |
| 1230 ; | |
| 1231 INSDC:x2cs:bin phys_cmp_alt_x2cs_bin | |
| 1232 = .CMP_ALTCSREAD | |
| 1233 ; | |
| 1234 INSDC:2cs:packed phys_2cs_packed | |
| 1235 = .CSREAD | |
| 1236 ; | |
| 1237 INSDC:x2cs:bin phys_alt_x2cs_bin | |
| 1238 = .ALTCSREAD | |
| 1239 ; | |
| 1240 INSDC:2cs:bin out_cmp_2cs_bin | |
| 1241 = ( INSDC:2cs:bin ) unpack ( phys_cmp_2cs_packed ) | |
| 1242 ; | |
| 1243 INSDC:2cs:bin out_2cs_bin | |
| 1244 = ( INSDC:2cs:bin ) unpack ( phys_2cs_packed ) | |
| 1245 ; | |
| 1246 INSDC:x2cs:bin out_cmp_x2cs_bin | |
| 1247 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, phys_cmp_alt_x2cs_bin ) | |
| 1248 | ( INSDC:x2cs:bin ) out_cmp_2cs_bin | |
| 1249 ; | |
| 1250 INSDC:x2cs:bin out_x2cs_bin | |
| 1251 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, phys_alt_x2cs_bin ) | |
| 1252 | ( INSDC:x2cs:bin ) out_2cs_bin | |
| 1253 ; | |
| 1254 INSDC:color:text out_cmp_color_text | |
| 1255 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin ) | |
| 1256 ; | |
| 1257 INSDC:color:text out_color_text | |
| 1258 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ) | |
| 1259 ; | |
| 1260 | |
| 1261 /* triggers from stats */ | |
| 1262 INSDC:quality:phred in_qual_phred | |
| 1263 = QUALITY | |
| 1264 ; | |
| 1265 INSDC:coord:len in_read_len | |
| 1266 = READ_LEN | |
| 1267 ; | |
| 1268 INSDC:SRA:xread_type in_read_type | |
| 1269 = READ_TYPE | |
| 1270 ; | |
| 1271 ascii in_spot_group | |
| 1272 = SPOT_GROUP | |
| 1273 ; | |
| 1274 trigger meta_stats | |
| 1275 = NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type, in_spot_group ) | |
| 1276 | NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type ) | |
| 1277 ; | |
| 1278 trigger qual_stats | |
| 1279 = NCBI:SRA:phred_stats_trigger #1 ( in_qual_phred ) | |
| 1280 ; | |
| 1281 | |
| 1282 extern column <ascii> zip_encoding CMP_LINKAGE_GROUP; | |
| 1283 | |
| 1284 // restored LINKAGE_GROUP | |
| 1285 readonly column ascii LINKAGE_GROUP = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID) | |
| 1286 | .CMP_LINKAGE_GROUP; | |
| 1287 }; | |
| 1288 | |
| 1289 table NCBI:align:view:cs_seq #1.1 = NCBI:align:tbl:cs_seq #1.2 | |
| 1290 { | |
| 1291 // various READ columns | |
| 1292 default readonly column INSDC:dna:text READ | |
| 1293 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_dcmp_4na_bin ) | |
| 1294 | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) | |
| 1295 ; | |
| 1296 readonly column INSDC:4na:bin READ = out_dcmp_4na_bin | out_4na_bin; | |
| 1297 readonly column INSDC:4na:packed READ = pack ( out_dcmp_4na_bin ) | pack ( out_4na_bin ); | |
| 1298 readonly column INSDC:x2na:bin READ = out_dcmp_x2na_bin | out_x2na_bin; | |
| 1299 readonly column INSDC:2na:bin READ = out_dcmp_2na_bin | out_2na_bin; | |
| 1300 INSDC:2na:bin out_dcmp_2na_bin | |
| 1301 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2na_bin ) | |
| 1302 ; | |
| 1303 INSDC:2na:bin out_2na_bin | |
| 1304 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ) | |
| 1305 ; | |
| 1306 readonly column INSDC:2na:packed READ = pack ( out_dcmp_2na_bin ) | pack ( out_2na_bin ); | |
| 1307 | |
| 1308 // decompression in base space | |
| 1309 INSDC:coord:len cmp_read_len | |
| 1310 = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < true > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) | |
| 1311 ; | |
| 1312 INSDC:coord:zero cmp_read_start | |
| 1313 = NCBI:align:make_read_start #1 ( cmp_read_len ) | |
| 1314 ; | |
| 1315 INSDC:x2na:bin out_cmp_x2na_bin | |
| 1316 = NCBI:dna_from_color #1 ( out_cmp_x2cs_bin, cmp_read_start, cmp_read_len, .CS_KEY, color_matrix ) | |
| 1317 ; | |
| 1318 INSDC:x2na:bin out_x2na_bin | |
| 1319 = NCBI:dna_from_color #1 ( out_x2cs_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) | |
| 1320 ; | |
| 1321 INSDC:4na:bin out_cmp_4na_bin | |
| 1322 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_cmp_x2na_bin ) | |
| 1323 ; | |
| 1324 INSDC:4na:bin out_4na_bin | |
| 1325 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ) | |
| 1326 ; | |
| 1327 INSDC:4na:bin out_dcmp_4na_bin | |
| 1328 = NCBI:align:seq_restore_read ( out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE ) | |
| 1329 ; | |
| 1330 | |
| 1331 | |
| 1332 // various CSREAD columns | |
| 1333 default readonly column INSDC:color:text CSREAD | |
| 1334 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_dcmp_x2cs_bin ) | |
| 1335 | out_color_text; | |
| 1336 readonly column INSDC:x2cs:bin CSREAD = out_dcmp_x2cs_bin | out_x2cs_bin; | |
| 1337 readonly column INSDC:2cs:bin CSREAD = out_dcmp_2cs_bin | out_2cs_bin; | |
| 1338 INSDC:2cs:bin out_dcmp_2cs_bin | |
| 1339 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2cs_bin ) | |
| 1340 ; | |
| 1341 readonly column INSDC:2cs:packed CSREAD = pack ( out_dcmp_2cs_bin ) | pack ( out_2cs_bin ); // fallback must be packed as well: out_2cs_bin is INSDC:2cs:bin (same pattern as the INSDC:2na:packed READ column) | |
| 1342 | |
| 1343 | |
| 1344 // decompression in color space | |
| 1345 INSDC:x2na:bin out_dcmp_x2na_bin | |
| 1346 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin ) | |
| 1347 ; | |
| 1348 INSDC:x2cs:bin out_dcmp_x2na_x2cs_bin | |
| 1349 = NCBI:color_from_dna #1 ( out_dcmp_x2na_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) | |
| 1350 ; | |
| 1351 INSDC:coord:len aligned_read_len | |
| 1352 = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < false > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) | |
| 1353 ; | |
| 1354 INSDC:x2cs:bin out_dcmp_x2cs_bin | |
| 1355 = < INSDC:x2cs:bin > NCBI:align:seq_construct_read #1 ( out_dcmp_x2na_x2cs_bin, .READ_LEN, out_cmp_x2cs_bin, cmp_read_len ) | |
| 1356 ; | |
| 1357 | |
| 1358 // CS_NATIVE - dynamic: derived per row from the row length of phys_cmp_2cs_packed | |
| 1359 U32 cmp_csread_row_len | |
| 1360 = row_len #1 ( phys_cmp_2cs_packed ) | |
| 1361 ; | |
| 1362 U32 cmp_csread_not_zero // row length clipped to the range 0..1 | |
| 1363 = < U32 > clip < 0, 1 > ( cmp_csread_row_len ) | |
| 1364 ; | |
| 1365 readonly column bool CS_NATIVE // 0 -> false, 1 -> true | |
| 1366 = < U32, bool > map < [ 0, 1 ], [ false, true ] > ( cmp_csread_not_zero ) | |
| 1367 ; | |
| 1368 | |
| 1369 // COLOR_MATRIX | |
| 1370 readonly column U8 COLOR_MATRIX | |
| 1371 = color_matrix | |
| 1372 ; | |
| 1373 U8 color_matrix | |
| 1374 = < U8 > echo < INSDC:color:default_matrix > () | |
| 1375 ; | |
| 1376 | |
| 1377 // various QUALITY types | |
| 1378 readonly column INSDC:quality:text:phred_33 QUALITY | |
| 1379 = out_qual_text_phred_33 | |
| 1380 | ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( .QUALITY ); | |
| 1381 readonly column INSDC:quality:text:phred_64 QUALITY | |
| 1382 = out_qual_text_phred_64 | |
| 1383 | ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( .QUALITY ); | |
| 1384 | |
| 1385 // SPOT_LEN | |
| 1386 INSDC:coord:len spot_len | |
| 1387 = ( INSDC:coord:len ) row_len ( out_dcmp_4na_bin ) | |
| 1388 | ( INSDC:coord:len ) row_len ( out_4na_bin ) | |
| 1389 ; | |
| 1390 readonly column INSDC:coord:len SPOT_LEN = spot_len; | |
| 1391 | |
| 1392 // TRIM_START | |
| 1393 readonly column INSDC:coord:zero TRIM_START | |
| 1394 = < INSDC:coord:zero > echo < 0 > () | |
| 1395 ; | |
| 1396 readonly column INSDC:coord:one TRIM_START | |
| 1397 = < INSDC:coord:one > echo < 1 > () | |
| 1398 ; | |
| 1399 // TRIM_LEN | |
| 1400 readonly column INSDC:coord:len TRIM_LEN = spot_len; | |
| 1401 | |
| 1402 // MIN_SPOT_ID | |
| 1403 readonly column INSDC:SRA:spotid_t MIN_SPOT_ID | |
| 1404 = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MIN" > () | |
| 1405 ; | |
| 1406 // MAX_SPOT_ID | |
| 1407 readonly column INSDC:SRA:spotid_t MAX_SPOT_ID | |
| 1408 = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MAX" > () | |
| 1409 ; | |
| 1410 // SPOT_COUNT | |
| 1411 readonly column U64 SPOT_COUNT | |
| 1412 = < U64 > meta:value < "STATS/TABLE/SPOT_COUNT" > () | |
| 1413 ; | |
| 1414 // BASE_COUNT | |
| 1415 U64 base_count | |
| 1416 = < U64 > meta:value < "STATS/TABLE/BASE_COUNT" > () | |
| 1417 ; | |
| 1418 readonly column U64 BASE_COUNT = base_count; | |
| 1419 // BIO_BASE_COUNT | |
| 1420 readonly column U64 BIO_BASE_COUNT | |
| 1421 = < U64 > meta:value < "STATS/TABLE/BIO_BASE_COUNT" > () | |
| 1422 ; | |
| 1423 // CMP_BASE_COUNT | |
| 1424 readonly column U64 CMP_BASE_COUNT | |
| 1425 = < U64 > meta:value < "STATS/TABLE/CMP_BASE_COUNT" > () | |
| 1426 | base_count | |
| 1427 ; | |
| 1428 | |
| 1429 // various PLATFORM | |
| 1430 // TBD | |
| 1431 | |
| 1432 // SPOT_ID | |
| 1433 I64 rowid_64 = row_id (); | |
| 1434 readonly column INSDC:SRA:spotid_t SPOT_ID | |
| 1435 = cast ( rowid_64 ) | |
| 1436 ; | |
| 1437 | |
| 1438 readonly column ascii NAME | |
| 1439 = sprintf < "%u" > ( SPOT_ID ) | |
| 1440 ; | |
| 1441 | |
| 1442 }; | |
| 1443 | |
| 1444 | |
| 1445 /*********************************** | |
| 1446 * Reference table - to store reference sequences | |
| 1447 * Sequences are divided in chunks. Two sequences never share a chunk. | |
| 1448 * SEQ_LEN - real size of a chunk should never exceed MAX_SEQ_LEN when it is set | |
| 1449 * READ - inherited from NCBI:tbl:base_space | |
| 1450 * CMP_READ,CMP_ALTREAD - are inherited from NCBI:align:tbl:cmp_base_space | |
| 1451 * SEQ_ID,SEQ_START,SEQ_LEN are inherited from NCBI:align:tbl:seqloc | |
| 1452 * .skey contains NAME of the chunk - it corresponds to actual name used in BAM (chr1,chr2, etc....) | |
| 1453 * | |
| 1454 * SEQ_START, SEQ_LEN, MAX_SEQ_LEN, SEQ_ID and rowlen(READ) interact in the following way | |
| 1455 * - SEQ_LEN < MAX_SEQ_LEN - should only happen on the last chunk of the sequence | |
| 1456 * - .READ is absent - there should be a retrieval from external services by SEQ_ID,SEQ_START,SEQ_LEN | |
| 1457 * - rowlen(.READ) = 0 && SEQ_START==0 (used as flag) - the sequence is SEQ_LEN repetitions of 'N' | |
| 1458 * - rowlen(.READ) = 0 && SEQ_START >= 1 - the sequence has to be fetched from external sources | |
| 1459 * - 0 < rowlen(.READ) < SEQ_LEN -- the sequence has to be filled with 'N's | |
| 1460 * | |
| 1461 ***********************************/ | |
| 1462 table NCBI:align:tbl:reference #2 = | |
| 1463 NCBI:align:tbl:cmp_base_space #1, | |
| 1464 NCBI:tbl:base_space #2.0.3, | |
| 1465 NCBI:tbl:seqloc #1, | |
| 1466 NCBI:SRA:tbl:stats #1.2.0 | |
| 1467 { | |
| 1468 INSDC:quality:phred out_qual_phred | |
| 1469 = < INSDC:quality:phred > echo < 30 > ( out_dcmp_4na_bin ); | |
| 1470 | |
| 1471 // MAX_SEQ_LEN - should be a constant == static column | |
| 1472 extern column < U32 > izip_encoding MAX_SEQ_LEN; | |
| 1473 | |
| 1474 // indicates if sequence has circular structure | |
| 1475 // copied from refSeq | |
| 1476 extern column bool_encoding CIRCULAR; | |
| 1477 | |
| 1478 // make CS_KEY writable | |
| 1479 INSDC:dna:text in_cs_key | |
| 1480 = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn', 'ACGTN' > ( CS_KEY ); | |
| 1481 physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key; | |
| 1482 | |
| 1483 U32 in_spot_len = SEQ_LEN; | |
| 1484 | |
| 1485 INSDC:coord:len _alt_in_read_len | |
| 1486 = READ_LEN | |
| 1487 | SEQ_LEN; | |
| 1488 | |
| 1489 INSDC:SRA:xread_type _alt_in_read_type | |
| 1490 = READ_TYPE | |
| 1491 | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (); | |
| 1492 | |
| 1493 // extra columns needed for CS conversion | |
| 1494 INSDC:coord:zero out_read_start = < INSDC:coord:zero > echo < 0 > (); | |
| 1495 INSDC:coord:len out_read_len = .SEQ_LEN; | |
| 1496 | |
| 1497 extern column utf8 NAME = out_spot_name_utf8; | |
| 1498 physical utf8 .NAME = idx:text:insert #1.0 < 'i_name' > ( NAME ); | |
| 1499 | |
| 1500 utf8 out_spot_name_utf8 = idx:text:project #1.0 < 'i_name' > (.NAME ); | |
| 1501 | |
| 1502 ascii out_spot_name = cast ( out_spot_name_utf8 ); | |
| 1503 | |
| 1504 INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > (); | |
| 1505 INSDC:coord:len trim_len = base_space_spot_len; | |
| 1506 | |
| 1507 ascii out_label | |
| 1508 = < ascii > echo < "reference" > (); | |
| 1509 INSDC:coord:zero out_label_start | |
| 1510 = < INSDC:coord:zero > echo < 0 > (); | |
| 1511 INSDC:coord:len out_label_len | |
| 1512 = < INSDC:coord:len > echo < 9 > (); | |
| 1513 | |
| 1514 U32 out_nreads | |
| 1515 = < U32 > echo < 1 > (); | |
| 1516 INSDC:SRA:xread_type out_read_type | |
| 1517 = < INSDC:SRA:xread_type > echo < 3 > (); | |
| 1518 INSDC:SRA:read_filter out_rd_filter | |
| 1519 = < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > (); | |
| 1520 | |
| 1521 | |
| 1522 // Columns of computed coverages by alignment | |
| 1523 | |
| 1524 // TBD: use percentiles instead of min/max? | |
| 1525 // maximum value clipped at 255 of the coverage density | |
| 1526 // for a chunk | |
| 1527 extern column < U8 > izip_encoding CGRAPH_HIGH; | |
| 1528 | |
| 1529 // minimum value clipped at 255 of the coverage density | |
| 1530 // for a chunk | |
| 1531 extern column < U8 > izip_encoding CGRAPH_LOW; | |
| 1532 | |
| 1533 // count of the number of mismatches in the chunk | |
| 1534 extern column < U32 > izip_encoding CGRAPH_MISMATCHES; | |
| 1535 | |
| 1536 // count of the number of inserts and deletes in the chunk | |
| 1537 extern column < U32 > izip_encoding CGRAPH_INDELS; | |
| 1538 | |
| 1539 // List of row ids from alignment tables | |
| 1540 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS; | |
| 1541 extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS; | |
| 1542 extern column < I64 > izip_encoding EVIDENCE_INTERVAL_IDS; | |
| 1543 | |
| 1544 // both OVERLAP_REF_* columns are arrays of three elements, matching the number of *_IDS columns above. | |
| 1545 // points back to an offset where the alignments to this chunk start | |
| 1546 extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS; | |
| 1547 // indicates the length of the longest tail of the alignments to this chunk which start in previous chunks | |
| 1548 // if the value of an element in this column is zero, the corresponding value of OVERLAP_REF_POS is meaningless | |
| 1549 extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN; | |
| 1550 | |
| 1551 // Mechanism to search for NAME | |
| 1552 readonly column vdb:row_id_range NAME_RANGE | |
| 1553 = idx:text:lookup #1.0 < 'i_name', 'QUERY_SEQ_NAME' > (); | |
| 1554 | |
| 1555 // Fully instantiates READ | |
| 1556 INSDC:4na:bin out_dcmp_4na_bin | |
| 1557 = NCBI:align:ref_restore_read (out_cmp_4na_bin, .SEQ_ID, .SEQ_START, .SEQ_LEN); | |
| 1558 } | |
| 1559 | |
| 1560 // THE DATABASES - each member line below has the form: table <schema name> #<version> <member name>; | |
| 1561 database NCBI:align:db:alignment_sorted #1.3 | |
| 1562 { | |
| 1563 table NCBI:align:tbl:reference #2 REFERENCE; | |
| 1564 table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT; | |
| 1565 table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT; | |
| 1566 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
| 1567 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
| 1568 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
| 1569 }; | |
| 1570 | |
| 1571 database NCBI:align:db:alignment_unsorted #1.3 | |
| 1572 { | |
| 1573 table NCBI:align:tbl:reference #2 REFERENCE; | |
| 1574 table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT; | |
| 1575 table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT; | |
| 1576 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
| 1577 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
| 1578 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
| 1579 }; | |
| 1580 | |
| 1581 database NCBI:align:db:alignment_evidence #1.3 | |
| 1582 { | |
| 1583 table NCBI:align:tbl:reference #2 REFERENCE; | |
| 1584 table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT; | |
| 1585 table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT; | |
| 1586 table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL; | |
| 1587 table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT; | |
| 1588 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
| 1589 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
| 1590 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
| 1591 }; | |
| 1592 | |
| 1593 database NCBI:align:db:alignment_evidence_sorted #1.2 | |
| 1594 { | |
| 1595 table NCBI:align:tbl:reference #2 REFERENCE; | |
| 1596 table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT; | |
| 1597 table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT; | |
| 1598 table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL; | |
| 1599 table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT; | |
| 1600 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
| 1601 table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; | |
| 1602 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
| 1603 }; | |
| 1604 | |
| 1605 database NCBI:align:db:unaligned #1 | |
| 1606 { | |
| 1607 table NCBI:align:tbl:seq #1.1 SEQUENCE; | |
| 1608 table NCBI:SRA:ABI:tbl:v2 #1.0.4 CS_SEQUENCE; | |
| 1609 table NCBI:align:tbl:qstat #1.0 QUAL_STAT; | |
| 1610 }; |
