Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/reference.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * VDB Alignment types, functions and tables | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'vdb/vdb.vschema'; | |
| 33 include 'csra2/stats.vschema'; | |
| 34 | |
| 35 | |
| 36 /*-------------------------------------------------------------------------- | |
| 37 * tables: NCBI:csra2:tbl:reference holds chunked reference sequence plus per-chunk alignment id and coverage-graph columns | |
| 38 */ | |
| 39 table NCBI:csra2:tbl:reference #1.0 | |
| 40 = NCBI:csra2:tbl:read_stats #1 | |
| 41 { | |
| 42 /* CHUNK_SIZE | |
| 43 * describes the maximum number of bases in any cell | |
| 44 */ | |
| 45 extern column INSDC:coord:len CHUNK_SIZE; | |
| 46 | |
| 47 /* CIRCULAR | |
| 48 * true if the reference is circular | |
| 49 */ | |
| 50 extern column bool CIRCULAR; | |
| 51 | |
| 52 /* CANONICAL_NAME | |
| 53 * this should be an accessioned proper name | |
| 54 */ | |
| 55 extern column utf8 CANONICAL_NAME; | |
| 56 | |
| 57 /* COMMON_NAME | |
| 58 * this name may be ambiguous or missing entirely | |
| 59 */ | |
| 60 extern column utf8 COMMON_NAME; | |
| 61 | |
| 62 /* LOCAL_SEQUENCE | |
| 63 * supports name overloading by type | |
| 64 */ | |
| 65 extern default column INSDC:dna:text LOCAL_SEQUENCE | |
| 66 { | |
| 67 read = out_local_dna_text; | |
| 68 validate = < INSDC:dna:text > compare ( in_local_dna_text, out_local_dna_text ); | |
| 69 } | |
| 70 extern column INSDC:4na:bin LOCAL_SEQUENCE = out_local_4na_bin; | |
| 71 | |
| 72 /* PRIMARY_ALIGNMENT_IDS | |
| 73 * SECONDARY_ALIGNMENT_IDS | |
| 74 * an index to rows in the PRIMARY_ALIGNMENT and | |
| 75 * SECONDARY_ALIGNMENT tables having alignments | |
| 76 * STARTING within this chunk | |
| 77 * | |
| 78 * the indices MUST be sorted in clustered order, | |
| 79 * meaning that they are in ascending numeric order | |
| 80 */ | |
| 81 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS; | |
| 82 extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS; | |
| 83 | |
| 84 /* OVERLAP_REF_POS | |
| 85 * min ( REF_POS ) for all alignments intersecting this chunk | |
| 86 * but starting in a previous chunk, where the stored position | |
| 87 * is in reference coordinates. | |
| 88 * | |
| 89 * a value of 0 indicates that no alignments starting to | |
| 90 * the left of this chunk also intersect with it. | |
| 91 */ | |
| 92 extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS; | |
| 93 | |
| 94 /* OVERLAP_REF_LEN | |
| 95 * max ( REF_POS + REF_LEN - CHUNK_START ) % CHUNK_SIZE | |
| 96 * for all alignments intersecting this chunk but starting | |
| 97 * in a previous chunk. | |
| 98 * | |
| 99 * indicates the amount of this chunk that is needed by | |
| 100 * alignments not starting within this chunk. so if a slice on | |
| 101 * this reference were to start at 100 bases into this chunk, | |
| 102 * for example, and the OVERLAP_REF_LEN were 100 or less, then | |
| 103 * there are no alignments from prior chunks that need to be | |
| 104 * considered. | |
| 105 */ | |
| 106 extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN; | |
| 107 | |
| 108 /* COVERAGE | |
| 109 * graphing statistics for the chunk | |
| 110 */ | |
| 111 | |
| 112 // values are clipped at 255 (both columns below are declared U8) | |
| 113 extern column < U8 > izip_encoding CGRAPH_HIGH; | |
| 114 extern column < U8 > izip_encoding CGRAPH_LOW; | |
| 115 | |
| 116 // count of the number of mismatches in the chunk | |
| 117 extern column < U32 > izip_encoding CGRAPH_MISMATCHES; | |
| 118 | |
| 119 // count of the number of inserts and deletes in the chunk | |
| 120 extern column < U32 > izip_encoding CGRAPH_INDELS; | |
| 121 | |
| 122 | |
| 123 /* writing rules: text input is mapped to upper case ( '.' becomes 'N' ), then 4na, 2na and ambiguity forms are derived from it */ | |
| 124 INSDC:dna:text in_local_dna_text | |
| 125 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( LOCAL_SEQUENCE ); | |
| 126 ; | |
| 127 INSDC:4na:bin in_local_4na_bin | |
| 128 = < INSDC:4na:bin > range_validate < 0, 15 > ( LOCAL_SEQUENCE ) | |
| 129 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_local_dna_text ) | |
| 130 ; | |
| 131 INSDC:2na:bin in_local_2na_bin | |
| 132 = INSDC:SEQ:rand_4na_2na ( in_local_4na_bin ) | |
| 133 ; | |
| 134 INSDC:4na:bin in_ambig_4na_bin | |
| 135 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_local_4na_bin ); | |
| 136 ; | |
| 137 | |
| 138 INSDC:4na:bin in_stats_seq = in_local_4na_bin; | |
| 139 | |
| 140 /* physical columns for sequence: packed 2na bases in .LOCAL_SEQUENCE, trimmed 4na ambiguity in .LOCAL_AMBIGUITY */ | |
| 141 physical column INSDC:2na:packed .LOCAL_SEQUENCE | |
| 142 = ( INSDC:2na:packed ) pack ( in_local_2na_bin ) | |
| 143 ; | |
| 144 physical column < INSDC:4na:bin > zip_encoding .LOCAL_AMBIGUITY | |
| 145 = < INSDC:4na:bin > trim < 0, 0 > ( in_ambig_4na_bin ) | |
| 146 ; | |
| 147 | |
| 148 /* reading rules: unpack 2na, map it to 4na via [ 1, 2, 4, 8 ], bit_or with .LOCAL_AMBIGUITY, then map 4na to text */ | |
| 149 INSDC:2na:packed out_local_2na_packed | |
| 150 = .LOCAL_SEQUENCE | |
| 151 ; | |
| 152 INSDC:2na:bin out_local_2na_bin | |
| 153 = ( INSDC:2na:bin ) unpack ( out_local_2na_packed ) | |
| 154 ; | |
| 155 INSDC:4na:bin out_local_2na_4na_bin | |
| 156 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_local_2na_bin ); | |
| 157 ; | |
| 158 INSDC:4na:bin out_local_4na_bin | |
| 159 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_local_2na_4na_bin, .LOCAL_AMBIGUITY ) | |
| 160 ; | |
| 161 INSDC:dna:text out_local_dna_text | |
| 162 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_local_4na_bin ) | |
| 163 ; | |
| 164 | |
| 165 | |
| 166 INSDC:coord:len in_local_read_len | |
| 167 = ( INSDC:coord:len ) row_len ( in_local_2na_bin ) | |
| 168 ; | |
| 169 INSDC:SRA:xread_type in_local_read_type | |
| 170 = < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > () | |
| 171 ; | |
| 172 } | |
| 173 | |
| 174 | |
| 175 /*-------------------------------------------------------------------------- | |
| 176 * "views": NCBI:csra2:view:reference adds readonly columns on top of NCBI:csra2:tbl:reference | |
| 177 */ | |
| 178 table NCBI:csra2:view:reference #1.0 | |
| 179 = NCBI:csra2:tbl:reference #1.0 | |
| 180 { | |
| 181 /* EXTERNAL | |
| 182 * may need to be a function | |
| 183 * it can test the CANONICAL_NAME as in cSRA.v1, | |
| 184 * but if internal it can also check row_length of bases | |
| 185 */ | |
| 186 readonly column bool EXTERNAL | |
| 187 = < bool > exists < false > ( .LOCAL_SEQUENCE ) | |
| 188 | < bool > echo < true > () | |
| 189 ; | |
| 190 | |
| 191 /* SEQUENCE | |
| 192 * available as text (default), 4na bin, 4na packed, x2na bin, 2na bin and 2na packed | |
| 193 */ | |
| 194 default readonly column INSDC:dna:text SEQUENCE | |
| 195 = out_dna_text | |
| 196 ; | |
| 197 readonly column INSDC:4na:bin SEQUENCE | |
| 198 = out_4na_bin | |
| 199 ; | |
| 200 readonly column INSDC:4na:packed SEQUENCE | |
| 201 = ( INSDC:4na:packed ) pack ( out_4na_bin ) | |
| 202 ; | |
| 203 readonly column INSDC:x2na:bin SEQUENCE | |
| 204 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ) | |
| 205 ; | |
| 206 readonly column INSDC:2na:bin SEQUENCE | |
| 207 = out_2na_bin | |
| 208 ; | |
| 209 readonly column INSDC:2na:packed SEQUENCE | |
| 210 = pack ( out_2na_bin ) | |
| 211 ; | |
| 212 | |
| 213 /* QUALITY | |
| 214 * This is a fake column for compatibility; its values come from the echo < 30 > production out_qual_phred | |
| 215 */ | |
| 216 readonly column INSDC:quality:phred QUALITY | |
| 217 = out_qual_phred | |
| 218 ; | |
| 219 | |
| 220 /* column aliases: MAX_SEQ_LEN reads .CHUNK_SIZE, SEQ_ID is .CANONICAL_NAME cast to ascii */ | |
| 221 readonly column INSDC:coord:len MAX_SEQ_LEN = .CHUNK_SIZE; | |
| 222 readonly column ascii SEQ_ID = cast ( .CANONICAL_NAME ); | |
| 223 | |
| 224 /* sequence productions */ | |
| 225 INSDC:4na:bin out_4na_bin | |
| 226 = out_local_4na_bin | |
| 227 // TODO: | sub-select from external table | |
| 228 ; | |
| 229 | |
| 230 INSDC:dna:text out_dna_text | |
| 231 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) | |
| 232 ; | |
| 233 | |
| 234 INSDC:2na:bin out_2na_bin | |
| 235 = INSDC:SEQ:rand_4na_2na ( out_4na_bin ) | |
| 236 ; | |
| 237 | |
| 238 /* quality productions: echo < 30 > applied to SEQUENCE, also exposed as in_stats_qual_phred */ | |
| 239 INSDC:quality:phred out_qual_phred | |
| 240 = < INSDC:quality:phred > echo < 30 > ( SEQUENCE ) | |
| 241 ; | |
| 242 | |
| 243 INSDC:quality:phred in_stats_qual_phred = out_qual_phred; | |
| 244 } | |
| 245 |
