Mercurial > repos > charles_s_test > seqsero2
diff libs/sratoolkit.2.8.0-centos_linux64/schema/align/align.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libs/sratoolkit.2.8.0-centos_linux64/schema/align/align.vschema Mon Nov 27 11:21:07 2017 -0500 @@ -0,0 +1,1610 @@ +/*=========================================================================== +* +* PUBLIC DOMAIN NOTICE +* National Center for Biotechnology Information +* +* This software/database is a "United States Government Work" under the +* terms of the United States Copyright Act. It was written as part of +* the author's official duties as a United States Government employee and +* thus cannot be copyrighted. This software/database is freely available +* to the public for use. The National Library of Medicine and the U.S. +* Government have not placed any restriction on its use or reproduction. +* +* Although all reasonable efforts have been taken to ensure the accuracy +* and reliability of the software and data, the NLM and the U.S. +* Government do not and cannot warrant the performance or results that +* may be obtained by using this software or data. The NLM and the U.S. +* Government disclaim all warranties, express or implied, including +* warranties of performance, merchantability or fitness for any particular +* purpose. +* +* Please cite the author in any work or product based on this material. 
+* +* =========================================================================== +* +*/ + +/*========================================================================== + * VDB Alignment types, functions and tables + */ +version 1; + +include 'vdb/vdb.vschema'; +include 'ncbi/seq.vschema'; +include 'ncbi/sra.vschema'; +include 'ncbi/stats.vschema'; +include 'align/seq.vschema'; +include 'align/qstat.vschema'; +include 'sra/abi.vschema'; +include 'align/mate-cache.vschema'; + + +/*-------------------------------------------------------------------------- + * data types + */ + +/* ploidy + * the number of sets of chromosomes in a cell + */ +typedef U32 NCBI:align:ploidy; + +/* ro_type + * the type of event causing ref-offset + */ +typedef U8 NCBI:align:ro_type; + +const NCBI:align:ro_type NCBI:align:ro_normal = 0; // normal ref-offset +const NCBI:align:ro_type NCBI:align:ro_soft_clip = 1; // soft-clipping +const NCBI:align:ro_type NCBI:align:ro_intron_plus = 2; // intron on positive strand +const NCBI:align:ro_type NCBI:align:ro_intron_minus = 3; // intron on negative strand +const NCBI:align:ro_type NCBI:align:ro_intron_unknown = 4; // intron strand not specified +const NCBI:align:ro_type NCBI:align:ro_complete_genomics = 5; // + + +/*-------------------------------------------------------------------------- + * functions + */ + + +/* cigar + * construct "cigar" alignment string or length arrays + * + * "ctype" [ CONST ] - select variant of format + * 0 => both matches and mismatches represented as M + * 1 => matches represented as '=' mismatches as 'X' + * + * "has_mismatch" [ DATA ] - a boolean for each base in aligned sequence + * where a value of false means the base aligned to the reference + * + * "has_ref_offset" [ DATA ] - a boolean for each base in the aligned sequence + * where a value of true means there is a corresponding offset to position on reference + * + * "ref_offset" [ DATA ] - a packed sequence of signed offsets to aligned position + * one entry 
for every true in "has_ref_offset" + * + * "read_len" [ DATA ] - v2: elem_count defines PLOIDY and values are an actual length of reads in spot + */ +extern function +ascii NCBI:align:cigar #1 < U8 ctype > ( bool has_mismatch, bool has_ref_offset, + I32 ref_offset, * INSDC:coord:len ref_len ) = ALIGN:cigar; + +/* history: + * 2.1 - added "ref_offset_type" optional parameter + * NB - reverting to 2.0 due to linker bug in older code + */ +extern function < type T > +T NCBI:align:cigar #2.0 < U8 ctype > ( bool has_mismatch, bool has_ref_offset, + I32 ref_offset, INSDC:coord:len read_len, * INSDC:coord:len ref_len, NCBI:align:ro_type ref_offset_type ) + = ALIGN:cigar_2; + +extern function U32 NCBI:align:edit_distance #1 + ( bool has_mismatch, bool has_ref_offset, I32 ref_offset ); + +extern function U32 NCBI:align:edit_distance #2 + ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len, *INSDC:coord:len read_len) + = NCBI:align:edit_distance_2; + +extern function U32 NCBI:align:edit_distance #3 + ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len) + = NCBI:align:edit_distance_3; + +/* rna_orientation + * reads column REF_OFFSET_TYPE + * returns '+' if has: + * at least one NCBI:align:ro_intron_plus + * none of NCBI:align:ro_intron_minus + * returns '-' if has: + * at least one NCBI:align:ro_intron_minus + * none of NCBI:align:ro_intron_plus + * returns empty string otherwise + */ +extern function +ascii NCBI:align:rna_orientation #1 ( NCBI:align:ro_type ref_offset_type ); + +/* project_from_sequence + * projects column from SEQUENCE + * + * "T" [ TYPE ] + * + * "col" [ CONST ] + * "use_read_len" [ CONST ] whether subset by read_len or by read_id only + * + * "seq_spot_id" [ DATA ] + * + * "seq_read_id" [ DATA ] + */ +extern function < type T > +T NCBI:align:project_from_sequence #1 < ascii col> ( I64 seq_spot_id, INSDC:coord:one seq_read_id ) + = ALIGN:project_from_sequence; 
+ + +/* align_restore_read + * restores read by applying alignment-based difference to ref_read + * + * "ref_read" [ DATA ] + * + * "has_mismatch" [ DATA ] and "mismatch" [ DATA ] + * + * "has_ref_offset" [ DATA ] and "ref_offset" [ DATA ] + */ +extern function +INSDC:4na:bin NCBI:align:align_restore_read #1 ( INSDC:4na:bin ref_read, bool has_mismatch, + INSDC:4na:bin mismatch, bool has_ref_offset, I32 ref_offset * INSDC:coord:len read_len) + = ALIGN:align_restore_read; + + +/* raw_restore_read + * restores read by applying alignment-based difference to align_read + * + * "align_read" [ DATA ] + * + * "ref_orientation" [ DATA ] + */ +extern function +INSDC:4na:bin NCBI:align:raw_restore_read #1 ( INSDC:4na:bin align_read, bool ref_orientation ) + = ALIGN:raw_restore_read; + + +/* raw_restore_qual + * restores quality by applying alignment-based difference to align_qual + * + * "align_qual" [ DATA ] + * + * "ref_orientation" [ DATA ] + */ +extern function +INSDC:quality:phred NCBI:align:raw_restore_qual #1 ( INSDC:quality:phred align_qual, bool ref_orientation ); + + +/* ref_sub_select + * projects reference from sequence + * + * "id" [ DATA ] + * + * "start" [ DATA ] and "len" [ DATA ] + * + * "ref_ploidy" [ DATA, OPTIONAL ] + */ +extern function +INSDC:4na:bin NCBI:align:ref_sub_select #1 ( I64 id, INSDC:coord:zero start, + INSDC:coord:len len * U32 ref_ploidy) + = ALIGN:ref_sub_select; + + +/* ref_restore_read + * restores read from central storage + * + * "cmp_rd" [ DATA ] + * + * "seq_id" [ DATA ] + * + * "seq_start" [ DATA ] and "seq_len" [ DATA ] + */ +extern function +INSDC:4na:bin NCBI:align:ref_restore_read #1 ( INSDC:4na:bin cmp_rd, ascii seq_id, + INSDC:coord:one seq_start, INSDC:coord:len seq_len) + = ALIGN:ref_restore_read; + + +/* seq_restore_read + * projects read from align_deflate table to SEQUENCE + * + * "cmp_rd" [ DATA ] + * + * "align_id" [ DATA ] + * + * "read_len" [ DATA ] + * + * "rd_type" [ DATA ] + */ +extern function +INSDC:4na:bin 
NCBI:align:seq_restore_read #1 ( INSDC:4na:bin cmp_rd, I64 align_id, + INSDC:coord:len read_len, INSDC:SRA:xread_type rd_type ) + = ALIGN:seq_restore_read; + + +/* seq_restore_linkage_group + * projects LINKAGE_GROUP from PRIMARY_ALIGNMENT table to SEQUENCE + * + * "cmp_linkage_group" [ DATA ] + * + * "align_id" [ DATA ] + */ +extern function +ascii NCBI:align:seq_restore_linkage_group #1 ( ascii cmp_linkage_group, + I64 align_id ) + = ALIGN:seq_restore_linkage_group; + + +/* generate_has_mismatch + * generates has mismatch by doing actual compare of reference and subject, + * *ref_offsets move comparisons reference-wise + * + * "reference" [ DATA ] + * + * "subject" [ DATA ] + * + * "has_ref_offset" [ DATA ] + * + * "ref_offset" [ DATA ] + */ +extern function +bool NCBI:align:generate_has_mismatch #1 ( INSDC:4na:bin reference, + INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset) + = ALIGN:generate_has_mismatch; + + +/* generate_mismatch + * + * "reference" [ DATA ] + * + * "subject" [ DATA ] + * + * "has_ref_offset" [ DATA ] + * + * "ref_offset" [ DATA ] + */ +extern function +INSDC:4na:bin NCBI:align:generate_mismatch #1 ( INSDC:4na:bin reference, + INSDC:4na:bin subject, bool has_ref_offset, I32 ref_offset ) + = ALIGN:generate_mismatch; + + +/* ref_pos + * retrieves the alignment's positions on the reference + * one per PLOIDY + * + * "ref_id" [ DATA ] + * + * "ref_start" [ DATA ] - one per PLOIDY + */ +extern function +INSDC:coord:zero NCBI:align:ref_pos #1 ( I64 ref_id, INSDC:coord:zero ref_start ); + + +/* ref_name + * retrieve the name from the reference + * + * "ref_id" [ DATA ] + */ +extern function +ascii NCBI:align:ref_name #1 ( I64 ref_id ); + + +/* ref_seq_id + * retrieve the seq_id from the reference + * + * "ref_id" [ DATA ] + */ +extern function +ascii NCBI:align:ref_seq_id #1 ( I64 ref_id ); + + +/* local_ref_id + * convert global ref_start into ref_id + */ +extern function +I64 NCBI:align:local_ref_id #1 ( U64 global_ref_start ); + + +/* 
local_ref_start + * convert global ref_start into local ref_start + */ +extern function +INSDC:coord:zero NCBI:align:local_ref_start #1 ( U64 global_ref_start ); + +/* not_my_row + * removes current row_id from the list + */ +extern function I64 NCBI:align:not_my_row #1 ( I64 list ); + +/* template_len + * compute template length, i.e. the distance from the left-most to the + * right-most matching reference position + */ +extern function I32 NCBI:align:template_len #1 ( + INSDC:coord:zero pos, INSDC:coord:zero mate_pos, + INSDC:coord:len reflen, INSDC:coord:len mate_reflen, + ascii ref_name, ascii mate_ref_name, INSDC:coord:one read_id); + +/* get_sam_flags + * compute the flags that would be in a SAM file + * + * version 1 works with full Alignment databases. + * version 2 works with Alignment databases that have had SEQUENCE removed. + */ +extern function U32 NCBI:align:get_sam_flags #1 ( + INSDC:coord:len read_len, INSDC:coord:one read_id, I32 template_len, + bool strand, bool mate_strand, bool is_secondary, * INSDC:SRA:read_filter filter); + +extern function U32 NCBI:align:get_sam_flags #2 ( + I64 mate_id, INSDC:coord:one read_id, I32 template_len, + bool strand, bool mate_strand, bool is_secondary, * INSDC:SRA:read_filter filter) + = NCBI:align:get_sam_flags_2; + +/* get_left_soft_clip + * compute the length of the soft clip on the left edge of the alignment + */ +extern function INSDC:coord:len NCBI:align:get_left_soft_clip #1 + ( bool has_ref_offset, I32 ref_offset ); + +extern function INSDC:coord:len NCBI:align:get_left_soft_clip #2 + ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len read_len ) + = NCBI:align:get_left_soft_clip_2; + +/* get_right_soft_clip + * compute the length of the soft clip on the right edge of the alignment + */ +extern function INSDC:coord:len NCBI:align:get_right_soft_clip #1 + ( bool has_mismatch, INSDC:coord:len left_clip * bool has_ref_offset ); + +extern function INSDC:coord:len NCBI:align:get_right_soft_clip #2 + ( bool 
has_mismatch, INSDC:coord:len left_clip, bool has_ref_offset, I32 ref_offset ) + = NCBI:align:get_right_soft_clip_2; + +extern function INSDC:coord:len NCBI:align:get_right_soft_clip #3 + ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len ref_len ) + = NCBI:align:get_right_soft_clip_3; + +extern function INSDC:coord:len NCBI:align:get_right_soft_clip #4 + ( bool has_ref_offset, I32 ref_offset, INSDC:coord:len read_len, INSDC:coord:len ref_len ) + = NCBI:align:get_right_soft_clip_4; + +extern function INSDC:coord:len NCBI:align:get_right_soft_clip #5 + ( bool has_ref_offset, I32 ref_offset, NCBI:align:ro_type ref_offset_type, INSDC:coord:len read_len ) + = NCBI:align:get_right_soft_clip_5; + +/* get_clipped_cigar + * compute the CIGAR string with the soft clipping removed + */ +extern function ascii NCBI:align:get_clipped_cigar #1 ( ascii cigar ); + +extern function < type T > +T NCBI:align:get_clipped_cigar #2 ( ascii cigar, INSDC:coord:len cigar_len ) = NCBI:align:get_clipped_cigar_2; + +/* get_clipped_ref_offset + * compute the reference offsets with the soft clipping removed + */ +extern function I32 NCBI:align:get_clipped_ref_offset #1 + ( bool has_ref_offset, I32 ref_offset ); + +/* clip + * remove the soft clipped bases (or qualities, or has_mismatch, et cetera) + * works with things whose lengths are the same as SEQUENCE.READ + * version 2 additionally takes "read_len" + */ +extern function < type T > T NCBI:align:clip #1 + ( T object, INSDC:coord:len left_clip, INSDC:coord:len right_clip); + +extern function < type T > T NCBI:align:clip #2 + ( T object, INSDC:coord:len read_len, INSDC:coord:len left_clip, INSDC:coord:len right_clip) + = NCBI:align:clip_2; + +/* get_ref_len + * compute reference length from alignment information + */ +extern function INSDC:coord:len NCBI:align:get_ref_len #1 + ( bool has_ref_offset, I32 ref_offset, * INSDC:coord:len right_clip ); + +extern function INSDC:coord:len NCBI:align:get_ref_len_2 #2 + ( bool has_ref_offset, I32 ref_offset) + = NCBI:align:get_ref_len_2; + 
+ +/* get_mismatch_read + * generate the READ with matching bases replaced with '=' + */ +extern function ascii NCBI:align:get_mismatch_read #1 + ( bool has_mismatch, INSDC:dna:text mismatch ); + +/* get_ref_mismatch + * shows mismatch positions in reference space + */ +function bool NCBI:align:get_ref_mismatch #1 + ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, + INSDC:coord:len ref_len ); + +/* get_ref_insert + * shows positions of inserts in reference space + * i.e. an insert occurs between each pair of true's + */ +function bool NCBI:align:get_ref_insert #1 + ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, + INSDC:coord:len ref_len ); + +/* get_ref_delete + * shows positions of deleted bases in reference space + */ +function bool NCBI:align:get_ref_delete #1 + ( bool has_mismatch, bool has_ref_offset, I32 ref_offset, + INSDC:coord:len ref_len ); + +extern function INSDC:quality:phred NCBI:align:compress_quality #1 + ( INSDC:quality:phred quality, bool preserved ); + +extern function INSDC:quality:phred NCBI:align:decompress_quality #1 + < INSDC:quality:phred restored_qual_value > + ( INSDC:quality:phred cmp_quality, bool preserved ); + +/* make_read_start + * NOTE(review): presumably derives read start offsets from "read_len" - confirm + * + * "read_len" [ DATA ] + */ +extern function INSDC:coord:zero NCBI:align:make_read_start #1 + (INSDC:coord:len read_len); + +/* make_cmp_read_desc + * determines whether an element of "operand" is aligned + * by looking at the corresponding element of "align_id" + * + * zeros out unaligned elements of operand, unless "invert" is true, + * in which case it zeros out aligned elements. + * + * "T" [ TYPE ] - type of operand + * + * "invert" [ CONST ] - if true, invert the logic of which elements + * to zero out. 
+ * + * "operand" [ DATA ] - uncompressed data + * + * "align_id" [ DATA ] - indication of alignment + */ +extern function < type T > +T NCBI:align:make_cmp_read_desc #1 <bool invert>(T operand, I64 align_id); + +/* seq_construct_read + * assembles read from aligned and unaligned parts + */ +extern function < type T > +T NCBI:align:seq_construct_read #1 ( + T aligned, INSDC:coord:len aligned_read_len, + T unaligned, INSDC:coord:len unaligned_read_len ); + +extern function I64 NCBI:align:get_mate_align_id #1 ( I64 spot_id ); + +/*-------------------------------------------------------------------------- + * tables + */ + + +/* ref_block_cmn + * common implementation ancestor for reference block + */ +table NCBI:align:tbl:ref_block_cmn #1.0.0 +{ + readonly column ascii REF_TABLE + = < ascii > meta:read < "CONFIG/REF_TABLE" > () + | < ascii > echo < 'REFERENCE' > (); + + // REF_ID is rowid in Reference Table REF_TABLE + extern column I64 REF_ID + = out_ref_id; + + // this is a redefinition of REF_START + // REF_START is the offset within REFERENCE.READ + extern column INSDC:coord:zero REF_START + = out_ref_start; + + // global REF_START + extern column U64 GLOBAL_REF_START + = out_global_ref_start; + + // REF_LEN the length of a read projection on reference + INSDC:coord:len out_ref_len_internal + = NCBI:align:get_ref_len_2 ( out_has_ref_offset, out_ref_offset ) + | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset ); + + INSDC:coord:len out_ref_len + = .REF_LEN +/* | NCBI:align:get_ref_len ( out_has_ref_offset, out_ref_offset, out_right_clip ) */ + | out_ref_len_internal; + + physical column < INSDC:coord:len > izip_encoding .REF_LEN = REF_LEN; + extern column INSDC:coord:len REF_LEN = out_ref_len; + + // REF_ORIENTATION - relative orientation of original raw read to the reference + // false -> same orientation, true -> opposite orientation + // alignment and reference are always in the same orientation + extern column bool_encoding REF_ORIENTATION; + + // 
REF_PLOIDY + extern column < U32 > izip_encoding REF_PLOIDY; + + /* REF_POS + * per PLOIDY + */ + readonly column INSDC:coord:zero REF_POS + = NCBI:align:ref_pos ( out_ref_id, out_ref_start ); + + /* REF_NAME + * the name of the reference + */ + readonly column ascii REF_NAME + = NCBI:align:ref_name ( out_ref_id ); + + /* REF_SEQ_ID + */ + readonly column ascii REF_SEQ_ID + = NCBI:align:ref_seq_id ( out_ref_id ) + | < ascii > echo < '' > (); +}; + + +/* global_ref_block + * reference block favoring global ref-start + */ +table NCBI:align:tbl:global_ref_block #1.0.0 + = NCBI:align:tbl:ref_block_cmn #1.0.0 +{ + U64 out_global_ref_start = .GLOBAL_REF_START; + physical < U64 > izip_encoding .GLOBAL_REF_START = GLOBAL_REF_START; + + I64 out_ref_id = NCBI:align:local_ref_id ( .GLOBAL_REF_START ); + INSDC:coord:zero out_ref_start = NCBI:align:local_ref_start ( .GLOBAL_REF_START ); +}; + + +/* local_ref_block + * reference block favoring local ref-start + */ +table NCBI:align:tbl:local_ref_block #1.0.0 + = NCBI:align:tbl:ref_block_cmn #1.0.0 +{ + I64 out_ref_id = .REF_ID; + physical < I64 > izip_encoding .REF_ID = REF_ID; + + INSDC:coord:zero out_ref_start = .REF_START; + physical < INSDC:coord:zero > izip_encoding .REF_START = REF_START; +}; + + +/* align_cmn + * common interface and implementation for alignment object + * + * History: + * 2.1 - added REF_OFFSET_TYPE and RNA_ORIENTATION columns + * updated all cigar calculations + */ +table NCBI:align:tbl:align_cmn #2.1 + = NCBI:tbl:base_space_common #1.0.3 + , NCBI:SRA:tbl:stats #1.2.0 + , NCBI:align:tbl:ref_block_cmn #1.0.0 +{ + bool is_secondary = out_is_secondary; +// temporary key + extern column < U32 > izip_encoding TMP_KEY_ID; + + extern column <ascii> zip_encoding LINKAGE_GROUP; + + +/* Raw Sequence Block */ + // Points to sequence table, which may contain more information about the raw sequence. 
+ // row id in SEQUENCE table; 0 if not linked + extern column < I64 > izip_encoding SEQ_SPOT_ID; + + // read number in SEQUENCE table; { SEQ_SPOT_ID, SEQ_READ_ID } is the unique link to the sequence + extern column < INSDC:coord:one > izip_encoding SEQ_READ_ID; + + +/* Soft-Clipped data block */ + + readonly column INSDC:coord:len LEFT_SOFT_CLIP + = NCBI:align:get_left_soft_clip ( HAS_REF_OFFSET, REF_OFFSET, out_read_len ); + + INSDC:coord:len out_right_clip + = NCBI:align:get_right_soft_clip #5 ( out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len ) + | NCBI:align:get_right_soft_clip #4 ( out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len ) + | NCBI:align:get_right_soft_clip #3 ( out_has_ref_offset, out_ref_offset, out_ref_len ) + | NCBI:align:get_right_soft_clip #2 ( out_has_mismatch, LEFT_SOFT_CLIP, out_has_ref_offset, out_ref_offset ); + readonly column INSDC:coord:len RIGHT_SOFT_CLIP = out_right_clip; + + readonly column ascii CLIPPED_CIGAR_LONG + = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN ); + + readonly column INSDC:coord:len CLIPPED_CIGAR_LONG_LEN + = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_LONG, CIGAR_LONG_LEN ); + + readonly column ascii CLIPPED_CIGAR_SHORT + = < ascii > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN ); + + readonly column INSDC:coord:len CLIPPED_CIGAR_SHORT_LEN + = < INSDC:coord:len > NCBI:align:get_clipped_cigar ( CIGAR_SHORT, CIGAR_SHORT_LEN ); + + bool out_clipped_has_mismatch + = < bool > NCBI:align:clip (out_has_mismatch, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); + + readonly column ascii CLIPPED_HAS_MISMATCH + = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_clipped_has_mismatch ); + + readonly column bool CLIPPED_HAS_MISMATCH = out_clipped_has_mismatch; + + bool out_clipped_has_ref_offset + = < bool > NCBI:align:clip (HAS_REF_OFFSET, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); + + readonly column ascii CLIPPED_HAS_REF_OFFSET + = < U8 , ascii > 
map < [ 0 , 1 ] , '01' > ( out_clipped_has_ref_offset ); + + readonly column bool CLIPPED_HAS_REF_OFFSET = out_clipped_has_ref_offset; + + // TBD cannot be computed right unless HAS_MISMATCH and! READ_LEN is used + readonly column INSDC:dna:text CLIPPED_MISMATCH + = < INSDC:dna:text > NCBI:align:clip #1 ( out_mismatch_dna_text, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); + + readonly column I32 CLIPPED_REF_OFFSET + = NCBI:align:get_clipped_ref_offset ( HAS_REF_OFFSET, REF_OFFSET ); + + readonly column INSDC:quality:phred CLIPPED_QUALITY + = < INSDC:quality:phred > NCBI:align:clip (out_qual_phred, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); + + readonly column INSDC:dna:text CLIPPED_READ + = < INSDC:dna:text > NCBI:align:clip (READ, out_read_len, LEFT_SOFT_CLIP, RIGHT_SOFT_CLIP); + +/* Sequence Block */ + + extern column < NCBI:align:ploidy > izip_encoding PLOIDY; + + // Number of reads per spot; corresponds to the number of alternative alignments + // all alternative alignments are computed against the same reference region + U32 out_nreads + = .PLOIDY + | < U32 > echo < 1 > (); + + // READ_START and READ_LEN are position and length of the sequence + physical < INSDC:coord:zero > izip_encoding .READ_START = READ_START; + INSDC:coord:zero out_read_start + = .READ_START + | < INSDC:coord:zero > echo < 0 > (); + + physical < INSDC:coord:len > izip_encoding .READ_LEN = READ_LEN; + + INSDC:coord:len align_spot_len = ( INSDC:coord:len ) row_len ( out_has_ref_offset ); + INSDC:coord:len out_read_len + = .READ_LEN + | align_spot_len; + + // associated qualities + extern column INSDC:quality:phred CMP_QUALITY + = .CMP_QUALITY + | out_cmp_quality; + physical column < INSDC:quality:phred > zip_encoding .CMP_QUALITY = CMP_QUALITY; + + INSDC:quality:phred out_raw_qual = < INSDC:quality:phred > + NCBI:align:project_from_sequence < '( INSDC:quality:phred ) QUALITY'> ( .SEQ_SPOT_ID, .SEQ_READ_ID ); + INSDC:quality:phred out_qual_phred + = NCBI:align:raw_restore_qual ( out_raw_qual, 
.REF_ORIENTATION ) + | < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); + readonly column INSDC:quality:text:phred_33 SAM_QUALITY = QUALITY ; + + // project read group and name + ascii out_spot_group = < ascii > simple_sub_select < 'SEQUENCE','SPOT_GROUP'> (.SEQ_SPOT_ID); + + + INSDC:SRA:spotid_t tmp_seq_spot_id + = cast ( .SEQ_SPOT_ID ) + ; + physical <ascii> zip_encoding .SEQ_NAME = SEQ_NAME; + extern column ascii SEQ_NAME + = .SEQ_NAME + | < ascii > simple_sub_select < 'SEQUENCE','NAME'> (.SEQ_SPOT_ID) + | sprintf < "%u" > ( tmp_seq_spot_id ); + + // compute sam flags + /* blows up parser: starts at schema-tbl.c:2138 + readonly column U32 SAM_FLAGS = NCBI:align:get_sam_flags(MATE_ALIGN_ID, + .SEQ_READ_ID, out_template_len, REF_ORIENTATION, + out_mate_ref_orientation, is_secondary); + */ + INSDC:coord:len projected_read_len + = < INSDC:coord:len > simple_sub_select < 'SEQUENCE', 'READ_LEN' > ( .SEQ_SPOT_ID ); + + readonly column U32 SAM_FLAGS + = NCBI:align:get_sam_flags #1 (projected_read_len, + .SEQ_READ_ID, out_template_len, REF_ORIENTATION, + out_mate_ref_orientation, is_secondary, out_rd_filter) + | NCBI:align:get_sam_flags #2 (out_mate_align_id, + .SEQ_READ_ID, out_template_len, REF_ORIENTATION, + out_mate_ref_orientation, is_secondary, out_rd_filter); + + ascii out_name_fmt = < ascii > echo < '$R' > (); + + INSDC:coord:zero trim_start + = < INSDC:coord:zero > echo < 0 > (); + INSDC:coord:len trim_len + = align_spot_len; + + ascii out_label + = .LABEL + | < ascii > echo < "ploidy1" > (); + INSDC:coord:zero out_label_start + = .LABEL_START + | < INSDC:coord:zero > echo < 0 > (); + INSDC:coord:len out_label_len + = .LABEL_LEN + | < INSDC:coord:len > echo < 7 > (); + + physical < INSDC:SRA:read_filter > zip_encoding .RD_FILTER = READ_FILTER; + INSDC:SRA:read_filter out_rd_filter + = .RD_FILTER + | < INSDC:SRA:read_filter > NCBI:align:project_from_sequence < 'READ_FILTER' > ( .SEQ_SPOT_ID, .SEQ_READ_ID ) + | < INSDC:SRA:read_filter > echo < 
SRA_READ_FILTER_PASS > ( out_read_len ); + + INSDC:SRA:platform_id out_platform + = .PLATFORM + | < INSDC:SRA:platform_id > simple_sub_select < 'SEQUENCE','PLATFORM'> (.SEQ_SPOT_ID) + | < INSDC:SRA:platform_id > echo < SRA_PLATFORM_UNDEFINED > (); + + U8 out_alignment_count = <U8> NCBI:align:project_from_sequence < 'ALIGNMENT_COUNT' > ( .SEQ_SPOT_ID, .SEQ_READ_ID ); + + /* out_read_type + * set to SRA_READ_TYPE_FORWARD + SRA_READ_TYPE_BIOLOGICAL + * which has a constant value of 3 + */ + INSDC:SRA:xread_type out_read_type + = < INSDC:SRA:xread_type > echo < 3 > ( out_read_len ); + + // stats inputs + bool in_stats_bin = HAS_REF_OFFSET; + + INSDC:coord:len _alt_in_read_len + = READ_LEN + | ( INSDC:coord:len ) row_len #1 ( HAS_REF_OFFSET ); + + INSDC:SRA:xread_type _alt_in_read_type + = READ_TYPE + | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (_alt_in_read_len); + + readonly column ascii MISMATCH_READ + = NCBI:align:get_mismatch_read ( out_has_mismatch, out_mismatch_dna_text ); + +/* Alignment block */ + + // MAPQ - single value quality of the mapping; the scale is submitter specific + extern column < I32 > izip_encoding MAPQ; + + extern column INSDC:coord:zero MATE_REF_POS = out_mate_ref_pos; + extern column INSDC:coord:len MATE_REF_LEN = out_mate_ref_len; + extern column I64 MATE_REF_ID = out_mate_ref_id; + extern column I32 TEMPLATE_LEN = out_template_len; + extern column bool MATE_REF_ORIENTATION = out_mate_ref_orientation; + readonly column ascii MATE_REF_NAME = NCBI:align:ref_name ( out_mate_ref_id ); + readonly column ascii MATE_REF_SEQ_ID = NCBI:align:ref_seq_id( out_mate_ref_id ); + readonly column U8 ALIGNMENT_COUNT = out_alignment_count; + + +/******************************** +* Columns representing CIGARs +********************************/ + + + // one value per base i.e. 
length is same as sum of READ_LEN + // partitioned by READ_START and READ_LEN into alternative alignments + // flags the shifts in reference position preceding the base + // if sequence of a partitioned read starts with a ref_offset and one or more mismatches + // then it represents a left soft clip + // any run of mismatches at the end represents a right soft clip + + readonly column ascii HAS_REF_OFFSET = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_ref_offset ); + extern column bool_encoding HAS_REF_OFFSET; + bool out_has_ref_offset = .HAS_REF_OFFSET; + + // has number of elements equal to number of true elements in HAS_REF_OFFSET + extern column < I32 > izip_encoding REF_OFFSET; + I32 out_ref_offset = .REF_OFFSET; + + // the type of offset recorded in REF_OFFSET + extern column < NCBI:align:ro_type > izip_encoding REF_OFFSET_TYPE; + NCBI:align:ro_type out_ro_type = .REF_OFFSET_TYPE; + + // DISPLAY Columns + + readonly column I64 ALIGN_ID = row_id (); + + // get projection of the reference + readonly column INSDC:dna:text REF_READ + = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( REF_READ ); + + readonly column INSDC:4na:bin REF_READ + = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len, .REF_PLOIDY) + | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len ); + + INSDC:4na:bin ref_read_internal + = NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal, .REF_PLOIDY) + | NCBI:align:ref_sub_select (out_ref_id, out_ref_start, out_ref_len_internal); + + // text forms of reads + INSDC:dna:text out_dna_text + = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); + readonly column INSDC:dna:text RAW_READ + = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_raw_read ); + readonly column INSDC:4na:bin RAW_READ + = out_raw_read; + + // CIGARs + readonly column ascii CIGAR_LONG + = < 
ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) + | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) + | < ascii > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) + ; + readonly column INSDC:coord:len CIGAR_LONG_LEN + = < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) + | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) + | < INSDC:coord:len > NCBI:align:cigar #2 < 1 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) + ; + readonly column ascii CIGAR_SHORT + = < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) + | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) + | < ascii > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) + ; + readonly column INSDC:coord:len CIGAR_SHORT_LEN + = < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len, out_ro_type) + | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len, out_ref_len) + | < INSDC:coord:len > NCBI:align:cigar #2 < 0 > (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_read_len) + ; + + readonly column ascii RNA_ORIENTATION + = NCBI:align:rna_orientation ( out_ro_type ) + ; + + readonly column U32 EDIT_DISTANCE + = NCBI:align:edit_distance #3 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ro_type, out_read_len) + | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, 
out_ref_offset, out_ref_len, out_read_len) + | NCBI:align:edit_distance #2 (out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len) + | NCBI:align:edit_distance #1 (out_has_mismatch, out_has_ref_offset, out_ref_offset); + + readonly column ascii HAS_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_has_mismatch ); + + // needed for backward compatibility + readonly column ascii SEQ_SPOT_GROUP = out_spot_group; + + +/* These columns are purely informational. */ + bool out_ref_mismatch = NCBI:align:get_ref_mismatch ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); + readonly column ascii REF_MISMATCH = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_mismatch ); + readonly column bool REF_MISMATCH = out_ref_mismatch; + + bool out_ref_insert = NCBI:align:get_ref_insert ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); + readonly column ascii REF_INSERT = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_insert ); + readonly column bool REF_INSERT = out_ref_insert; + + bool out_ref_delete = NCBI:align:get_ref_delete ( out_has_mismatch, out_has_ref_offset, out_ref_offset, out_ref_len ); + readonly column ascii REF_DELETE = < U8 , ascii > map < [ 0 , 1 ] , '01' > ( out_ref_delete ); + readonly column bool REF_DELETE = out_ref_delete; + +}; + + +/* align_full + * aligns externally stored sequence against reference + * alignment transcript is calculated + * + * History: + * 1.1 - respond to changes in base table + */ +table NCBI:align:tbl:align_full #1.1 + = NCBI:align:tbl:align_cmn #2.1 +{ + bool out_is_secondary = <bool> echo < true > (); + // restore reads to its raw form (orientation is restored) + + INSDC:4na:bin out_raw_read + = < INSDC:4na:bin > simple_sub_select < 'PRIMARY_ALIGNMENT', '( INSDC:4na:bin ) RAW_READ' > (.PRIMARY_ALIGNMENT_ID) + | < INSDC:4na:bin > NCBI:align:project_from_sequence < '( INSDC:4na:bin ) READ'> ( .SEQ_SPOT_ID, .SEQ_READ_ID ); + + INSDC:4na:bin out_4na_bin + = 
NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset, .READ_LEN ) + | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, tmp_out_mismatch_4na_bin, out_has_ref_offset, out_ref_offset ) + | NCBI:align:raw_restore_read ( out_raw_read, .REF_ORIENTATION ); + /* out_4na_bin alternatives, in order: restore from the reference using the TMP_MISMATCH data (with, then without, .READ_LEN); last resort is out_raw_read combined with .REF_ORIENTATION */ + + // flags mismatches with the reference + // produced by actual comparison of REF_READ and READ + // TMP_HAS_MISMATCH is a hack to speed up retrieval during coverage recalculation + column bool_encoding TMP_HAS_MISMATCH; + bool out_has_mismatch + = .TMP_HAS_MISMATCH + | NCBI:align:generate_has_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset ); + readonly column bool HAS_MISMATCH = out_has_mismatch; + + INSDC:4na:bin out_mismatch_4na_bin + = NCBI:align:generate_mismatch ( REF_READ, READ, out_has_ref_offset, out_ref_offset ); + + INSDC:4na:bin tmp_out_mismatch_4na_bin = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( .TMP_MISMATCH ); + + // temporary column for reference coverage calculation + column < INSDC:dna:text> zip_encoding TMP_MISMATCH; + + INSDC:dna:text out_mismatch_dna_text + = .TMP_MISMATCH + | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin ); + + readonly column INSDC:dna:text MISMATCH = out_mismatch_dna_text; + readonly column INSDC:4na:bin MISMATCH = out_mismatch_4na_bin; + // mate fields: each one lists its stored physical column first and a sub-select keyed by MATE_ALIGN_ID as the alternative + physical column < INSDC:coord:zero > izip_encoding .MATE_REF_POS = MATE_REF_POS; + INSDC:coord:zero out_mate_ref_pos = .MATE_REF_POS + | < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID); + + physical column < I64 > izip_encoding .MATE_REF_ID = MATE_REF_ID; + I64 out_mate_ref_id = .MATE_REF_ID + | < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID); + + INSDC:coord:len out_mate_ref_len = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID); + physical column < I32 > izip_encoding 
.TEMPLATE_LEN = TEMPLATE_LEN; + I32 out_template_len = .TEMPLATE_LEN + | NCBI:align:template_len(REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID); + /* .TEMPLATE_LEN is listed first; NCBI:align:template_len is the alternative when no stored value is available */ + physical column < bool > izip_encoding .MATE_REF_ORIENTATION = MATE_REF_ORIENTATION; + bool out_mate_ref_orientation = .MATE_REF_ORIENTATION + | < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID); + + I64 out_mate_align_id = .MATE_ALIGN_ID; + physical column <I64> izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID; + extern column I64 MATE_ALIGN_ID = out_mate_align_id; + /* PRIMARY_ALIGNMENT_ID: stored value first, otherwise a sub-select into SEQUENCE keyed by .SEQ_SPOT_ID and .SEQ_READ_ID */ + physical column < I64 > izip_encoding .PRIMARY_ALIGNMENT_ID = PRIMARY_ALIGNMENT_ID; + /* NOTE(review): read_idx is not referenced anywhere in the visible part of this schema - confirm it is still needed */ + I32 read_idx = <I32> cast (.SEQ_READ_ID); + extern column I64 PRIMARY_ALIGNMENT_ID + = .PRIMARY_ALIGNMENT_ID + | <I64> simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID' > (.SEQ_SPOT_ID,.SEQ_READ_ID); + +}; + + +/* compressed_by_reference + * aligns internally represented sequence against reference + * alignment transcript is stored + * original sequence is reconstructed + * + * History: + * 1.2 - respond to changes in base table + */ +table NCBI:align:tbl:compressed_by_reference #1.2 + = NCBI:align:tbl:align_cmn #2.1 +{ + bool out_is_secondary = <bool> echo < false > (); + + // one value per base i.e. 
length is same as sum of READ_LEN + // partitioned by READ_START and READ_LEN into alternative alignments + // flags mismatches with the reference + extern default column bool_encoding HAS_MISMATCH; + bool out_has_mismatch = .HAS_MISMATCH; + + // has number of elements equal to number of true elements in HAS_MISMATCH + extern column INSDC:dna:text MISMATCH + { + read = out_mismatch_dna_text; + validate = < INSDC:dna:text > compare ( in_mismatch_dna_text, out_mismatch_dna_text ); + } + // input normalisation: '.' is mapped to 'N' and lower-case codes to their upper-case form + INSDC:dna:text in_mismatch_dna_text + = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( MISMATCH ); + + INSDC:4na:bin in_mismatch_4na_bin + = < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_mismatch_dna_text ); + + extern column < ascii > zip_encoding ALIGN_GROUP; + + physical column < INSDC:4na:bin > zip_encoding .MISMATCH = in_mismatch_4na_bin; + + INSDC:4na:bin out_mismatch_4na_bin = .MISMATCH; + INSDC:dna:text out_mismatch_dna_text + = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_mismatch_4na_bin ); + + I64 out_mate_align_id + = .MATE_ALIGN_ID + | NCBI:align:get_mate_align_id (.SEQ_SPOT_ID); + + physical column <I64> izip_encoding .MATE_ALIGN_ID = MATE_ALIGN_ID; + extern column I64 MATE_ALIGN_ID = out_mate_align_id; + + // restore reads from alignment columns and the reference + // optional .READ_LEN size defines PLOIDY + INSDC:4na:bin out_4na_bin + = NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset, .READ_LEN ) + | NCBI:align:align_restore_read ( ref_read_internal, out_has_mismatch, .MISMATCH, out_has_ref_offset, out_ref_offset ); + + // restore reads to their raw form (orientation is restored) + INSDC:4na:bin out_raw_read = NCBI:align:raw_restore_read (out_4na_bin,.REF_ORIENTATION); + + I64 primary_align_pair = < I64 > simple_sub_select < 'SEQUENCE','PRIMARY_ALIGNMENT_ID'> (.SEQ_SPOT_ID); + 
I64 out_mate_ref_id = < I64 > simple_sub_select < '','REF_ID'> (MATE_ALIGN_ID); + bool out_mate_ref_orientation = < bool > simple_sub_select < '','REF_ORIENTATION'> (MATE_ALIGN_ID); + INSDC:coord:zero out_mate_ref_pos = < INSDC:coord:zero > simple_sub_select < '','REF_POS'> (MATE_ALIGN_ID); + INSDC:coord:len out_mate_ref_len = < INSDC:coord:len > simple_sub_select < '','REF_LEN'> (MATE_ALIGN_ID); + readonly column U32 MATE_EDIT_DISTANCE = < U32 > simple_sub_select < '','EDIT_DISTANCE'> (MATE_ALIGN_ID); + readonly column ascii MATE_CIGAR_LONG = < ascii > simple_sub_select < '','CIGAR_LONG'> (MATE_ALIGN_ID); + readonly column ascii MATE_CIGAR_SHORT = < ascii > simple_sub_select < '','CIGAR_SHORT'> (MATE_ALIGN_ID); + readonly column INSDC:coord:len MATE_CIGAR_LONG_LEN = < INSDC:coord:len > simple_sub_select < '','CIGAR_LONG_LEN'> (MATE_ALIGN_ID); + readonly column INSDC:coord:len MATE_CIGAR_SHORT_LEN = < INSDC:coord:len > simple_sub_select < '','CIGAR_SHORT_LEN'> (MATE_ALIGN_ID); + /* every mate value above is a sub-select keyed by MATE_ALIGN_ID; unlike align_full, no stored .TEMPLATE_LEN alternative is listed for out_template_len */ + I32 out_template_len = NCBI:align:template_len (REF_POS,out_mate_ref_pos,out_ref_len,out_mate_ref_len,REF_NAME,MATE_REF_NAME,SEQ_READ_ID); +}; + + +/* align_sorted + * deflated alignment data sorted against reference + * + * History: + * 1.2 - respond to changes in base table + */ +table NCBI:align:tbl:align_sorted #1.2 + = NCBI:align:tbl:compressed_by_reference #1.2 + , NCBI:align:tbl:global_ref_block #1.0.0 +{ + // 128K + column default limit = 131072; +}; + + +/* align_unsorted + * deflated alignment unsorted data + * + * History: + * 1.2 - respond to changes in base table + */ +table NCBI:align:tbl:align_unsorted #1.2 + = NCBI:align:tbl:compressed_by_reference #1.2 + , NCBI:align:tbl:local_ref_block #1.0.0 +{ + // 128K + column default limit = 131072; +}; + + +/* align_mate_sorted + * align_full combined with global_ref_block + * + * History: + * 1.1 - respond to changes in base table + */ +table NCBI:align:tbl:align_mate_sorted #1.1 + = NCBI:align:tbl:align_full #1.1 + , NCBI:align:tbl:global_ref_block #1.0.0 +{ + // 128K + 
column default limit = 131072; +}; + + +/* align_mate_unsorted + * align_full combined with local_ref_block + * + * History: + * 1.1 - respond to changes in base table + */ +table NCBI:align:tbl:align_mate_unsorted #1.1 + = NCBI:align:tbl:align_full #1.1 + , NCBI:align:tbl:local_ref_block #1.0.0 +{ + // 128K + column default limit = 131072; +}; + +/* align_allele + * alleles coverage extension + * + * History: + * 1.2 - respond to changes in base table + */ +table NCBI:align:tbl:align_allele #1.2 + = NCBI:align:tbl:align_unsorted #1.2 +{ + extern column < I64 > izip_encoding EVIDENCE_ALIGNMENT_IDS; + + /* + INSDC:quality:phred out_qual_phred + = < INSDC:quality:phred > echo < 30 > ( out_4na_bin ); + */ +}; + +/*-------------------------------------------------------------------------- + * seq + * alignment sequence table + */ +physical +I64 NCBI:align:sorted:alignment_id_encoding #1.0 +{ + decode + { + I64 outliers_removed = iunzip ( @ ); + return < I64 > outlier_decode < 0 > ( outliers_removed ); + } + + encode + { + I64 outliers_removed = < I64 > outlier_encode < 0 > ( @ ); + return izip ( outliers_removed ); + } +} + + +table NCBI:align:tbl:seq #1.1 = + NCBI:tbl:base_space #2.0.3, + NCBI:tbl:phred_quality #2.0.4, + NCBI:align:tbl:cmp_base_space #1, + NCBI:SRA:tbl:spotdesc #1.0.2, + NCBI:SRA:tbl:stats #1.2.0 +{ + // 128K + column default limit = 131072; + + // gets primary record in alignment table (size of column is NREADS) + // if sorted - should use special encoding + extern column <I64> izip_encoding PRIMARY_ALIGNMENT_ID; + + INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > (); + INSDC:coord:len trim_len = _spot_len; + + // size is NREADS + extern column < U8 > zip_encoding ALIGNMENT_COUNT; + + // auto-generate name from row-id + ascii out_name_fmt = < ascii > echo < '$R' > (); + + // temporary column + extern column < U64 > izip_encoding TMP_KEY_ID; + + // restored READ + INSDC:4na:bin out_dcmp_4na_bin + = NCBI:align:seq_restore_read (out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, 
.READ_TYPE); + + extern column < U64 > izip_encoding TI; + + extern column <ascii> zip_encoding CMP_LINKAGE_GROUP; + + // restored LINKAGE_GROUP + readonly column ascii LINKAGE_GROUP = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID) + | .CMP_LINKAGE_GROUP; +}; + + // color-space sequence table: declares CMP_CSREAD storage plus its read/write rules; extended by NCBI:align:view:cs_seq +table NCBI:align:tbl:cs_seq #1.2 +{ + /* writable columns */ + extern column INSDC:color:text CMP_CSREAD + = out_cmp_color_text + ; + + extern column < INSDC:dna:text > zip_encoding CS_KEY; + + extern default column < INSDC:quality:phred > zip_encoding QUALITY; + + extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_ID; + + extern column < U8 > zip_encoding ALIGNMENT_COUNT; + + extern column < INSDC:SRA:platform_id > zip_encoding PLATFORM; + + extern column < ascii > zip_encoding LABEL; + extern column < INSDC:coord:zero > izip_encoding LABEL_START; + extern column < INSDC:coord:len > izip_encoding LABEL_LEN; + + extern column < INSDC:SRA:xread_type > zip_encoding READ_TYPE; + extern column < INSDC:coord:zero > izip_encoding READ_START; + extern column < INSDC:coord:len > izip_encoding READ_LEN; + extern column < INSDC:SRA:read_filter > zip_encoding READ_FILTER; + + extern column < U64 > izip_encoding TMP_KEY_ID; + + extern column < ascii > zip_encoding SPOT_GROUP; + + extern column < U64 > izip_encoding TI; + + /* writing rules: the fifth BINSET entry is written as 0 into the packed 2cs data and kept, as value 4, in the separate .CMP_ALTCSREAD channel */ + INSDC:x2cs:bin in_cmp_x2cs_bin + = < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( CMP_CSREAD ) + ; + INSDC:2cs:bin in_cmp_2cs_bin + = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin ) + ; + INSDC:x2cs:bin in_cmp_alt_x2cs_bin + = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin ) + ; + physical column INSDC:2cs:packed .CMP_CSREAD + = ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin ) + ; + physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD + = < INSDC:x2cs:bin > trim < ALIGN_LEFT, 0 > ( 
in_cmp_alt_x2cs_bin ) + ; + + /* reading rules: the packed 2cs data is unpacked and OR-ed with the alternate channel; plain 2cs is the alternative when that cannot be resolved. NOTE(review): .CSREAD and .ALTCSREAD are read here but not declared in this table - presumably they only exist in tables written under another schema; confirm */ + INSDC:2cs:packed phys_cmp_2cs_packed + = .CMP_CSREAD + ; + INSDC:x2cs:bin phys_cmp_alt_x2cs_bin + = .CMP_ALTCSREAD + ; + INSDC:2cs:packed phys_2cs_packed + = .CSREAD + ; + INSDC:x2cs:bin phys_alt_x2cs_bin + = .ALTCSREAD + ; + INSDC:2cs:bin out_cmp_2cs_bin + = ( INSDC:2cs:bin ) unpack ( phys_cmp_2cs_packed ) + ; + INSDC:2cs:bin out_2cs_bin + = ( INSDC:2cs:bin ) unpack ( phys_2cs_packed ) + ; + INSDC:x2cs:bin out_cmp_x2cs_bin + = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, phys_cmp_alt_x2cs_bin ) + | ( INSDC:x2cs:bin ) out_cmp_2cs_bin + ; + INSDC:x2cs:bin out_x2cs_bin + = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, phys_alt_x2cs_bin ) + | ( INSDC:x2cs:bin ) out_2cs_bin + ; + INSDC:color:text out_cmp_color_text + = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin ) + ; + INSDC:color:text out_color_text + = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ) + ; + + /* triggers from stats */ + INSDC:quality:phred in_qual_phred + = QUALITY + ; + INSDC:coord:len in_read_len + = READ_LEN + ; + INSDC:SRA:xread_type in_read_type + = READ_TYPE + ; + ascii in_spot_group + = SPOT_GROUP + ; + trigger meta_stats + = NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type, in_spot_group ) + | NCBI:SRA:cmp_stats_trigger ( in_cmp_x2cs_bin, in_qual_phred, in_read_len, in_read_type ) + ; + trigger qual_stats + = NCBI:SRA:phred_stats_trigger #1 ( in_qual_phred ) + ; + + extern column <ascii> zip_encoding CMP_LINKAGE_GROUP; + + // restored LINKAGE_GROUP + readonly column ascii LINKAGE_GROUP = NCBI:align:seq_restore_linkage_group(.CMP_LINKAGE_GROUP, .PRIMARY_ALIGNMENT_ID) + | .CMP_LINKAGE_GROUP; +}; + +table NCBI:align:view:cs_seq #1.1 = NCBI:align:tbl:cs_seq #1.2 +{ + // various READ columns + default readonly column INSDC:dna:text READ + = < 
INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_dcmp_4na_bin ) + | < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) + ; + readonly column INSDC:4na:bin READ = out_dcmp_4na_bin | out_4na_bin; + readonly column INSDC:4na:packed READ = pack ( out_dcmp_4na_bin ) | pack ( out_4na_bin ); + readonly column INSDC:x2na:bin READ = out_dcmp_x2na_bin | out_x2na_bin; + readonly column INSDC:2na:bin READ = out_dcmp_2na_bin | out_2na_bin; + INSDC:2na:bin out_dcmp_2na_bin + = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2na_bin ) + ; + INSDC:2na:bin out_2na_bin + = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ) + ; + readonly column INSDC:2na:packed READ = pack ( out_dcmp_2na_bin ) | pack ( out_2na_bin ); + + // decompression in base space: colors -> x2na (NCBI:dna_from_color) -> 4na (map to 1,2,4,8,15) -> out_dcmp_4na_bin (NCBI:align:seq_restore_read keyed by .PRIMARY_ALIGNMENT_ID) + INSDC:coord:len cmp_read_len + = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < true > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) + ; + INSDC:coord:zero cmp_read_start + = NCBI:align:make_read_start #1 ( cmp_read_len ) + ; + INSDC:x2na:bin out_cmp_x2na_bin + = NCBI:dna_from_color #1 ( out_cmp_x2cs_bin, cmp_read_start, cmp_read_len, .CS_KEY, color_matrix ) + ; + INSDC:x2na:bin out_x2na_bin + = NCBI:dna_from_color #1 ( out_x2cs_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) + ; + INSDC:4na:bin out_cmp_4na_bin + = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_cmp_x2na_bin ) + ; + INSDC:4na:bin out_4na_bin + = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ) + ; + INSDC:4na:bin out_dcmp_4na_bin + = NCBI:align:seq_restore_read ( out_cmp_4na_bin, .PRIMARY_ALIGNMENT_ID, .READ_LEN, .READ_TYPE ) + ; + + + // various CSREAD columns + default readonly column INSDC:color:text CSREAD + = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, 
INSDC:x2cs:map:CHARSET > ( out_dcmp_x2cs_bin ) + | out_color_text; + readonly column INSDC:x2cs:bin CSREAD = out_dcmp_x2cs_bin | out_x2cs_bin; + readonly column INSDC:2cs:bin CSREAD = out_dcmp_2cs_bin | out_2cs_bin; + INSDC:2cs:bin out_dcmp_2cs_bin + = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_dcmp_x2cs_bin ) + ; + readonly column INSDC:2cs:packed CSREAD = pack ( out_dcmp_2cs_bin ) | pack ( out_2cs_bin ); /* both alternatives must yield packed data, as for the INSDC:2na:packed READ column */ + + + // decompression in color space: 4na codes 1,2,4,8 become x2na 0..3 and every other code becomes 4, then NCBI:color_from_dna converts back to colors + INSDC:x2na:bin out_dcmp_x2na_bin + = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin ) + ; + INSDC:x2cs:bin out_dcmp_x2na_x2cs_bin + = NCBI:color_from_dna #1 ( out_dcmp_x2na_bin, .READ_START, .READ_LEN, .CS_KEY, color_matrix ) + ; + INSDC:coord:len aligned_read_len + = < INSDC:coord:len > NCBI:align:make_cmp_read_desc #1 < false > ( .READ_LEN, .PRIMARY_ALIGNMENT_ID ) + ; + INSDC:x2cs:bin out_dcmp_x2cs_bin + = < INSDC:x2cs:bin > NCBI:align:seq_construct_read #1 ( out_dcmp_x2na_x2cs_bin, .READ_LEN, out_cmp_x2cs_bin, cmp_read_len ) + ; + + // CS_NATIVE - dynamic: true when the .CMP_CSREAD row has non-zero length (row length clipped to 0..1, then mapped to false/true) + U32 cmp_csread_row_len + = row_len #1 ( phys_cmp_2cs_packed ) + ; + U32 cmp_csread_not_zero + = < U32 > clip < 0, 1 > ( cmp_csread_row_len ) + ; + readonly column bool CS_NATIVE + = < U32, bool > map < [ 0, 1 ], [ false, true ] > ( cmp_csread_not_zero ) + ; + + // COLOR_MATRIX + readonly column U8 COLOR_MATRIX + = color_matrix + ; + U8 color_matrix + = < U8 > echo < INSDC:color:default_matrix > () + ; + + // various QUALITY types: text forms add the offset 33 or 64 to the stored .QUALITY when no ready-made text production resolves + readonly column INSDC:quality:text:phred_33 QUALITY + = out_qual_text_phred_33 + | ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( .QUALITY ); + readonly column INSDC:quality:text:phred_64 QUALITY + = out_qual_text_phred_64 + | ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( .QUALITY ); + + // SPOT_LEN + INSDC:coord:len spot_len + = ( INSDC:coord:len ) row_len ( out_dcmp_4na_bin ) + | ( INSDC:coord:len ) row_len ( 
out_4na_bin ) + ; + readonly column INSDC:coord:len SPOT_LEN = spot_len; + + // TRIM_START (constant: 0 as zero-based, 1 as one-based coordinate) + readonly column INSDC:coord:zero TRIM_START + = < INSDC:coord:zero > echo < 0 > () + ; + readonly column INSDC:coord:one TRIM_START + = < INSDC:coord:one > echo < 1 > () + ; + // TRIM_LEN + readonly column INSDC:coord:len TRIM_LEN = spot_len; + + // MIN_SPOT_ID + readonly column INSDC:SRA:spotid_t MIN_SPOT_ID + = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MIN" > () + ; + // MAX_SPOT_ID + readonly column INSDC:SRA:spotid_t MAX_SPOT_ID + = < INSDC:SRA:spotid_t > meta:value < "STATS/TABLE/SPOT_MAX" > () + ; + // SPOT_COUNT + readonly column U64 SPOT_COUNT + = < U64 > meta:value < "STATS/TABLE/SPOT_COUNT" > () + ; + // BASE_COUNT + U64 base_count + = < U64 > meta:value < "STATS/TABLE/BASE_COUNT" > () + ; + readonly column U64 BASE_COUNT = base_count; + // BIO_BASE_COUNT + readonly column U64 BIO_BASE_COUNT + = < U64 > meta:value < "STATS/TABLE/BIO_BASE_COUNT" > () + ; + // CMP_BASE_COUNT + readonly column U64 CMP_BASE_COUNT + = < U64 > meta:value < "STATS/TABLE/CMP_BASE_COUNT" > () + | base_count + ; + + // various PLATFORM + // TBD + + // SPOT_ID + I64 rowid_64 = row_id (); + readonly column INSDC:SRA:spotid_t SPOT_ID + = cast ( rowid_64 ) + ; + + readonly column ascii NAME + = sprintf < "%u" > ( SPOT_ID ) + ; + +}; + + +/*********************************** +* Reference table - to store reference sequences +* Sequences are divided in chunks. Two sequences never share a chunk. +* SEQ_LEN - real size of a chunk should never exceed MAX_SEQ_LEN when it is set +* READ - inherited from NCBI:tbl:base_space +* CMP_READ,CMP_ALTREAD - are inherited from NCBI:align:tbl:cmp_base_space +* SEQ_ID,SEQ_START,SEQ_LEN are inherited from NCBI:tbl:seqloc +* .skey contains NAME of the chunk - it corresponds to actual name used in BAM (chr1,chr2, etc....) 
+* +* SEQ_START,SEQ_LEN,MAX_SEQ_LEN,SEQ_ID and rowlen(READ) operate the following way +* - SEQ_LEN < MAX_SEQ_LEN - should only happen on the last chunk of the sequence +* - .READ is absent - there should be a retrieval from external services by SEQ_ID,SEQ_START,SEQ_LEN +* - rowlen(.READ) = 0 && SEQ_START==0 (used as flag) - the sequence is SEQ_LEN repetition of 'N' +* - rowlen(.READ) = 0 && SEQ_START >= 1 - the sequence has to be fetched from external sources +* - 0 < rowlen(.READ)< SEQ_LEN -- the sequence has to be filled with 'N's +* +***********************************/ +table NCBI:align:tbl:reference #2 = + NCBI:align:tbl:cmp_base_space #1, + NCBI:tbl:base_space #2.0.3, + NCBI:tbl:seqloc #1, + NCBI:SRA:tbl:stats #1.2.0 +{ + INSDC:quality:phred out_qual_phred + = < INSDC:quality:phred > echo < 30 > ( out_dcmp_4na_bin ); + + // MAX_SEQ_LEN - should be a constant == static column + extern column < U32 > izip_encoding MAX_SEQ_LEN; + + // indicates if sequence has circular structure + // copied from refSeq + extern column bool_encoding CIRCULAR; + + // make CS_KEY writable + INSDC:dna:text in_cs_key + = < INSDC:dna:text, INSDC:dna:text > map < 'acgtn', 'ACGTN' > ( CS_KEY ); + physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key; + + U32 in_spot_len = SEQ_LEN; + + INSDC:coord:len _alt_in_read_len + = READ_LEN + | SEQ_LEN; + + INSDC:SRA:xread_type _alt_in_read_type + = READ_TYPE + | < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > (); + + // extra columns needed for CS conversion + INSDC:coord:zero out_read_start = < INSDC:coord:zero > echo < 0 > (); + INSDC:coord:len out_read_len = .SEQ_LEN; + + extern column utf8 NAME = out_spot_name_utf8; + physical utf8 .NAME = idx:text:insert #1.0 < 'i_name' > ( NAME ); + + utf8 out_spot_name_utf8 = idx:text:project #1.0 < 'i_name' > (.NAME ); + + ascii out_spot_name = cast ( out_spot_name_utf8 ); + + INSDC:coord:zero trim_start = < INSDC:coord:zero > echo < 0 > (); + INSDC:coord:len trim_len = 
base_space_spot_len; + /* fixed spot description: one read labelled "reference" (label start 0, length 9 = length of that string). NOTE(review): out_read_type echoes the raw value 3 while _alt_in_read_type uses the named constant SRA_READ_TYPE_BIOLOGICAL - confirm what 3 encodes */ + ascii out_label + = < ascii > echo < "reference" > (); + INSDC:coord:zero out_label_start + = < INSDC:coord:zero > echo < 0 > (); + INSDC:coord:len out_label_len + = < INSDC:coord:len > echo < 9 > (); + + U32 out_nreads + = < U32 > echo < 1 > (); + INSDC:SRA:xread_type out_read_type + = < INSDC:SRA:xread_type > echo < 3 > (); + INSDC:SRA:read_filter out_rd_filter + = < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > (); + + +// Columns of computed coverages by alignment + + // TBD: use percentiles instead of min/max? + // maximum value clipped at 255 of the coverage density + // for a chunk + extern column < U8 > izip_encoding CGRAPH_HIGH; + + // minimum value clipped at 255 of the coverage density + // for a chunk + extern column < U8 > izip_encoding CGRAPH_LOW; + + // count of the number of mismatches in the chunk + extern column < U32 > izip_encoding CGRAPH_MISMATCHES; + + // count of the number of inserts and deletes in the chunk + extern column < U32 > izip_encoding CGRAPH_INDELS; + + // List of row ids from alignment tables + extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS; + extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS; + extern column < I64 > izip_encoding EVIDENCE_INTERVAL_IDS; + + // both OVERLAP_REF_* columns are arrays of three elements, matching the number of *_IDS columns above. 
+ // points back to an offset where the alignments to this chunk start + extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS; + // indicates the length of the longest tail of the alignments to this chunk which start in previous chunks + // if the value of an element in this column is zero, the corresponding value of OVERLAP_REF_POS is meaningless + extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN; + + // Mechanism to search for NAME + readonly column vdb:row_id_range NAME_RANGE + = idx:text:lookup #1.0 < 'i_name', 'QUERY_SEQ_NAME' > (); + + // Fully instantiates READ + INSDC:4na:bin out_dcmp_4na_bin + = NCBI:align:ref_restore_read (out_cmp_4na_bin, .SEQ_ID, .SEQ_START, .SEQ_LEN); +} + +// THE DATABASES +database NCBI:align:db:alignment_sorted #1.3 +{ + table NCBI:align:tbl:reference #2 REFERENCE; + table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT; + table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT; + table NCBI:align:tbl:seq #1.1 SEQUENCE; + table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; + table NCBI:align:tbl:qstat #1.0 QUAL_STAT; +}; + +database NCBI:align:db:alignment_unsorted #1.3 +{ + table NCBI:align:tbl:reference #2 REFERENCE; + table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT; + table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT; + table NCBI:align:tbl:seq #1.1 SEQUENCE; + table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; + table NCBI:align:tbl:qstat #1.0 QUAL_STAT; +}; + +database NCBI:align:db:alignment_evidence #1.3 +{ + table NCBI:align:tbl:reference #2 REFERENCE; + table NCBI:align:tbl:align_unsorted #1.2 PRIMARY_ALIGNMENT; + table NCBI:align:tbl:align_mate_unsorted #1.1 SECONDARY_ALIGNMENT; + table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL; + table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT; + table NCBI:align:tbl:seq #1.1 SEQUENCE; + table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; + table NCBI:align:tbl:qstat #1.0 QUAL_STAT; +}; + +database 
NCBI:align:db:alignment_evidence_sorted #1.2 +{ + table NCBI:align:tbl:reference #2 REFERENCE; + table NCBI:align:tbl:align_sorted #1.2 PRIMARY_ALIGNMENT; + table NCBI:align:tbl:align_mate_sorted #1.1 SECONDARY_ALIGNMENT; + table NCBI:align:tbl:align_allele #1.2 EVIDENCE_INTERVAL; + table NCBI:align:tbl:align_mate_unsorted #1.1 EVIDENCE_ALIGNMENT; /* EVIDENCE_ALIGNMENT uses the unsorted mate table even in this sorted database */ + table NCBI:align:tbl:seq #1.1 SEQUENCE; + table NCBI:align:view:cs_seq #1.1 CS_SEQUENCE; + table NCBI:align:tbl:qstat #1.0 QUAL_STAT; +}; + /* unaligned: no REFERENCE or alignment tables; CS_SEQUENCE is NCBI:SRA:ABI:tbl:v2 rather than NCBI:align:view:cs_seq */ +database NCBI:align:db:unaligned #1 +{ + table NCBI:align:tbl:seq #1.1 SEQUENCE; + table NCBI:SRA:ABI:tbl:v2 #1.0.4 CS_SEQUENCE; + table NCBI:align:tbl:qstat #1.0 QUAL_STAT; +};