Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/seq.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * Sequence schema implementation tables | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'vdb/vdb.vschema'; | |
| 33 include 'ncbi/ncbi.vschema'; | |
| 34 include 'insdc/sra.vschema'; | |
| 35 | |
| 36 | |
| 37 /*-------------------------------------------------------------------------- | |
| 38 * n_encoding - implementation | |
| 39 * introduces common virtual productions | |
| 40 */ | |
| 41 table NCBI:tbl:n_encoding #1 | |
| 42 { | |
| 43 U8 n_encoding_dummy | |
| 44 = read_unpack | |
| 45 | read_ndecode; | |
| 46 }; | |
| 47 | |
| 48 | |
| 49 /*-------------------------------------------------------------------------- | |
| 50 * seqloc | |
| 51 * NCBI sequence locator table | |
| 52 */ | |
| 53 table NCBI:tbl:seqloc #1.0 | |
| 54 { | |
| 55 /* SEQ_ID | |
| 56 * a FASTA-style SeqId | |
| 57 */ | |
| 58 extern column < ascii > zip_encoding SEQ_ID; | |
| 59 | |
| 60 /* SEQ_START | |
| 61 * provided in both 1 ( default ) and 0-based coordinates | |
| 62 */ | |
| 63 extern default column < INSDC:coord:one > izip_encoding SEQ_START; | |
| 64 readonly column INSDC:coord:zero SEQ_START | |
| 65 = ( INSDC:coord:zero ) < INSDC:coord:one > diff < 1 > ( .SEQ_START ); | |
| 66 | |
| 67 /* SEQ_LEN | |
| 68 */ | |
| 69 extern column < INSDC:coord:len > izip_encoding SEQ_LEN; | |
| 70 }; | |
| 71 | |
| 72 | |
| 73 /*-------------------------------------------------------------------------- | |
| 74 * base_space - implementation | |
| 75 * READ column rules | |
| 76 */ | |
| 77 | |
| 78 /* color_from_dna | |
| 79 * use starting keys and color matrix to convert individual reads | |
| 80 * to base space. | |
| 81 */ | |
| 82 extern function | |
| 83 INSDC:x2cs:bin NCBI:color_from_dna #1 ( INSDC:x2na:bin bin_x2na, | |
| 84 INSDC:coord:zero read_start, INSDC:coord:len read_len, | |
| 85 INSDC:dna:text cs_key, U8 color_matrix ); | |
| 86 | |
| 87 | |
| 88 /* dcmp_base_space | |
| 89 * table to introduce common virtual productions | |
| 90 */ | |
| 91 table NCBI:tbl:dcmp_base_space #1 | |
| 92 { | |
| 93 // rules to introduce purely virtual productions | |
| 94 // never expected to resolve... | |
| 95 INSDC:dna:text dcmp_virtual_productions | |
| 96 = out_dcmp_4na_bin | |
| 97 | out_dcmp_x2na_bin | |
| 98 | out_dcmp_2na_bin | |
| 99 | out_dcmp_2na_packed; | |
| 100 } | |
| 101 | |
| 102 /* history: | |
| 103 * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 | |
| 104 * 1.0.2 - spotdesc #1.0.2 | |
| 105 * 1.0.3 - base upon dcmp_base_space for "out_dcmp_2na_bin" | |
| 106 */ | |
| 107 table NCBI:tbl:base_space_common #1.0.3 | |
| 108 = INSDC:tbl:sequence #1.0.1 | |
| 109 , INSDC:SRA:tbl:spotdesc #1.0.2 | |
| 110 , INSDC:SRA:tbl:stats #1.1.0 | |
| 111 , NCBI:tbl:dcmp_base_space #1.0.0 | |
| 112 { | |
| 113 /* INSDC:tbl:sequence inherited virtual productions | |
| 114 */ | |
| 115 | |
| 116 // cs_native - tells user color space is not native | |
| 117 bool cs_native = < bool > echo < false > (); | |
| 118 | |
| 119 // in_cs_key is not writable in base_space | |
| 120 | |
| 121 // color-space key is completely artificial | |
| 122 INSDC:dna:text out_cs_key | |
| 123 = .CS_KEY | |
| 124 | < INSDC:dna:text > echo < 'T' > ( out_read_type ) | |
| 125 | < INSDC:dna:text > echo < 'T' > ( out_read_len ) | |
| 126 | < INSDC:dna:text > echo < 'T' > (); | |
| 127 | |
| 128 // unambiguous synthesized 2cs | |
| 129 INSDC:2cs:bin out_2cs_bin | |
| 130 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2cs_bin ); | |
| 131 | |
| 132 // unambiguous unpacked 2na | |
| 133 INSDC:2na:bin out_2na_bin | |
| 134 = out_dcmp_2na_bin | |
| 135 | ( INSDC:2na:bin ) unpack ( out_2na_packed ); | |
| 136 | |
| 137 // synthesized color sequence | |
| 138 INSDC:x2cs:bin out_x2cs_bin | |
| 139 = NCBI:color_from_dna ( out_x2na_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); | |
| 140 | |
| 141 // synthesized packed 2cs | |
| 142 INSDC:2cs:packed out_2cs_packed | |
| 143 = ( INSDC:2cs:packed ) pack ( out_2cs_bin ); | |
| 144 | |
| 145 // synthesized packed 4na | |
| 146 INSDC:4na:packed out_4na_packed | |
| 147 = ( INSDC:4na:packed ) pack ( out_4na_bin ); | |
| 148 | |
| 149 // synthesized color text | |
| 150 INSDC:color:text out_color_text | |
| 151 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); | |
| 152 | |
| 153 // published color matrix | |
| 154 U8 out_color_matrix | |
| 155 = < U8 > echo < INSDC:color:default_matrix > (); | |
| 156 | |
| 157 // spot_len and fixed_spot_len | |
| 158 INSDC:coord:len base_space_spot_len | |
| 159 = ( INSDC:coord:len ) row_len ( out_2na_packed ); | |
| 160 INSDC:coord:len base_space_fixed_spot_len | |
| 161 = ( INSDC:coord:len ) fixed_row_len ( out_2na_packed ); | |
| 162 | |
| 163 | |
| 164 /* INSDC:tbl:sequence inherited productions | |
| 165 * out_signal | |
| 166 * in_dna_text | |
| 167 * out_4na_bin | |
| 168 * out_dna_text | |
| 169 * out_x2na_bin | |
| 170 * out_2na_packed | |
| 171 */ | |
| 172 | |
| 173 /* INSDC:SRA:tbl:stats inherited productions | |
| 174 * in_stats_bin | |
| 175 */ | |
| 176 | |
| 177 /* NCBI:tbl:dcmp_base_space inherited productions | |
| 178 * out_dcmp_2na_bin | |
| 179 * out_dcmp_4na_bin | |
| 180 * out_dcmp_x2na_bin | |
| 181 * out_dcmp_2na_packed | |
| 182 */ | |
| 183 }; | |
| 184 | |
| 185 | |
| 186 /* base_space_nocol | |
| 187 * this table describes viewing rules | |
| 188 * but omits writing rules and physical column description | |
| 189 * in order to support older tables | |
| 190 * | |
| 191 * history: | |
| 192 * 1.0.1 - base explicitly upon base_space_common #1.0.1 | |
| 193 * 1.0.2 - base explicitly upon base_space_common #1.0.2 | |
| 194 * 1.0.3 - base explicitly upon base_space_common #1.0.3 | |
| 195 */ | |
| 196 table NCBI:tbl:base_space_nocol #1.0.3 | |
| 197 = NCBI:tbl:base_space_common #1.0.3 | |
| 198 , NCBI:tbl:n_encoding #1 | |
| 199 { | |
| 200 // incoming is disabled | |
| 201 | |
| 202 // synthesized dna text | |
| 203 INSDC:dna:text out_dna_text | |
| 204 = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); | |
| 205 | |
| 206 // synthesized 4na | |
| 207 INSDC:4na:bin out_4na_bin | |
| 208 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); | |
| 209 | |
| 210 // unpacked 2na with ambiguities | |
| 211 INSDC:x2na:bin out_x2na_bin | |
| 212 = ( INSDC:x2na:bin ) read_ndecode; | |
| 213 | |
| 214 // interface with n-encoded qualities | |
| 215 U8 read_unpack = out_2na_bin; | |
| 216 | |
| 217 /* INSDC:tbl:sequence inherited productions | |
| 218 * out_signal | |
| 219 * out_2na_packed | |
| 220 */ | |
| 221 | |
| 222 /* NCBI:tbl:n_encoding inherited productions | |
| 223 * read_ndecode | |
| 224 */ | |
| 225 }; | |
| 226 | |
| 227 /* base_space #1 | |
| 228 * this schema brings in standard .READ column for v1 tables | |
| 229 * | |
| 230 * history: | |
| 231 * 1.0.1 - base explicitly upon base_space_nocol #1.0.1 | |
| 232 * 1.0.2 - base explicitly upon base_space_nocol #1.0.2 | |
| 233 * 1.0.3 - base explicitly upon base_space_nocol #1.0.3 | |
| 234 */ | |
| 235 table NCBI:tbl:base_space #1.0.3 = NCBI:tbl:base_space_nocol #1.0.3 | |
| 236 { | |
| 237 // 2-bit 2na representation (0..3) | |
| 238 INSDC:2na:packed out_2na_packed = .READ; | |
| 239 | |
| 240 // no rules for writing to .READ | |
| 241 | |
| 242 /* INSDC:tbl:sequence inherited productions | |
| 243 * out_signal | |
| 244 */ | |
| 245 | |
| 246 /* NCBI:tbl:n_encoding inherited productions | |
| 247 * read_ndecode | |
| 248 */ | |
| 249 }; | |
| 250 | |
| 251 | |
| 252 /* base_space #2 | |
| 253 * standard current base-space table | |
| 254 * | |
| 255 * history: | |
| 256 * 2.0.2 - base_space_common #1.0.2 | |
| 257 * 2.0.3 - base_space_common #1.0.3 now has dcmp_base_space as well | |
| 258 */ | |
| 259 table NCBI:tbl:base_space #2.0.3 | |
| 260 = NCBI:tbl:base_space_common #1.0.3 | |
| 261 , NCBI:tbl:dcmp_base_space #1 | |
| 262 { | |
| 263 /* input rules | |
| 264 */ | |
| 265 | |
| 266 // input text | |
| 267 INSDC:dna:text in_dna_text | |
| 268 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ ); | |
| 269 | |
| 270 // input 4na bin | |
| 271 INSDC:4na:bin in_4na_bin | |
| 272 = < INSDC:4na:bin > range_validate < 0, 15 > ( READ ) | |
| 273 | ( INSDC:4na:bin ) unpack ( in_4na_packed ) | |
| 274 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text ) | |
| 275 | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_x2na_bin ); | |
| 276 | |
| 277 // input 4na packed | |
| 278 INSDC:4na:packed in_4na_packed = READ; | |
| 279 | |
| 280 // input x2na bin | |
| 281 INSDC:x2na:bin in_x2na_bin | |
| 282 = < INSDC:x2na:bin > range_validate < 0, 4 > ( READ ) | |
| 283 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_4na_bin ); | |
| 284 | |
| 285 // input 2na bin | |
| 286 INSDC:2na:bin in_2na_bin | |
| 287 = < INSDC:2na:bin > range_validate < 0, 3 > ( READ ) | |
| 288 | ( INSDC:2na:bin ) unpack ( in_2na_packed ) | |
| 289 | INSDC:SEQ:rand_4na_2na ( in_4na_bin ); | |
| 290 | |
| 291 // input 2na packed | |
| 292 INSDC:2na:packed in_2na_packed = READ; | |
| 293 | |
| 294 // input 4na alt-read ( ambiguities ) | |
| 295 INSDC:4na:bin in_alt_4na_bin | |
| 296 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin ); | |
| 297 | |
| 298 // preparing a feed into stats column | |
| 299 U8 in_stats_bin = in_2na_bin; | |
| 300 | |
| 301 | |
| 302 /* physical columns | |
| 303 */ | |
| 304 | |
| 305 physical column INSDC:2na:packed .READ | |
| 306 = in_2na_packed | |
| 307 | ( INSDC:2na:packed ) pack ( in_2na_bin ); | |
| 308 | |
| 309 physical column < INSDC:4na:bin > zip_encoding .ALTREAD | |
| 310 = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin ); | |
| 311 | |
| 312 | |
| 313 /* output rules | |
| 314 */ | |
| 315 | |
| 316 // output 2na packed | |
| 317 INSDC:2na:packed out_2na_packed | |
| 318 = .READ | |
| 319 | out_dcmp_2na_packed; | |
| 320 | |
| 321 // output x2na bin | |
| 322 INSDC:x2na:bin out_x2na_bin | |
| 323 = out_dcmp_x2na_bin | |
| 324 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ); | |
| 325 | |
| 326 // output 2na->4na bin | |
| 327 INSDC:4na:bin out_2na_4na_bin | |
| 328 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin ); | |
| 329 | |
| 330 // output 4na bin | |
| 331 INSDC:4na:bin out_4na_bin | |
| 332 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD ) | |
| 333 | out_dcmp_4na_bin | |
| 334 | out_2na_4na_bin; | |
| 335 | |
| 336 // output text | |
| 337 INSDC:dna:text out_dna_text | |
| 338 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ); | |
| 339 | |
| 340 | |
| 341 /* INSDC:tbl:sequence inherited productions | |
| 342 * out_signal | |
| 343 */ | |
| 344 | |
| 345 /* NCBI:tbl:dcmp_base_space inherited productions | |
| 346 * out_dcmp_2na_bin | |
| 347 * out_dcmp_4na_bin | |
| 348 * out_dcmp_x2na_bin | |
| 349 * out_dcmp_2na_packed | |
| 350 */ | |
| 351 }; | |
| 352 | |
| 353 | |
| 354 | |
| 355 | |
| 356 /*-------------------------------------------------------------------------- | |
| 357 * color_space - implementation | |
| 358 * nucleotide sequences in color space | |
| 359 */ | |
| 360 | |
| 361 extern function | |
| 362 INSDC:x2na:bin NCBI:dna_from_color #1 ( INSDC:x2cs:bin color_bin, | |
| 363 INSDC:coord:zero read_start, INSDC:coord:len read_len, | |
| 364 INSDC:dna:text cs_key, U8 color_matrix ); | |
| 365 | |
| 366 | |
| 367 /* dcmp_color_space | |
| 368 * declares common virtual productions | |
| 369 */ | |
| 370 table NCBI:tbl:dcmp_color_space #1 | |
| 371 { | |
| 372 // rules to introduce purely virtual productions | |
| 373 // never expected to resolve... | |
| 374 INSDC:dna:text dcmp_virtual_productions | |
| 375 = out_dcmp_x2cs_bin | |
| 376 | out_dcmp_2cs_bin | |
| 377 | out_dcmp_2cs_packed; | |
| 378 } | |
| 379 | |
| 380 /* history: | |
| 381 * 1.0.1 - base explicitly upon sequence #1.0.1, spotdesc #1.0.1 | |
| 382 * 1.0.2 - spotdesc #1.0.2 | |
| 383 * 1.0.3 - base upon dcmp_color_space for "out_dcmp_2cs_bin" | |
| 384 */ | |
| 385 table NCBI:tbl:color_space_common #1.0.3 | |
| 386 = INSDC:tbl:sequence #1.0.1 | |
| 387 , INSDC:SRA:tbl:spotdesc #1.0.2 | |
| 388 , INSDC:SRA:tbl:stats #1.1.0 | |
| 389 , NCBI:tbl:dcmp_color_space #1.0.0 | |
| 390 { | |
| 391 // cs_native - tells user color space is native | |
| 392 bool cs_native = < bool > echo < true > (); | |
| 393 | |
| 394 // unambiguous unpacked 2cs | |
| 395 INSDC:2cs:bin out_2cs_bin | |
| 396 = out_dcmp_2cs_bin | |
| 397 | ( INSDC:2cs:bin ) unpack ( out_2cs_packed ); | |
| 398 | |
| 399 // unambiguous synthesized 2na | |
| 400 INSDC:2na:bin out_2na_bin | |
| 401 = < INSDC:x2na:bin, INSDC:2na:bin > map < INSDC:x2na:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( out_x2na_bin ); | |
| 402 | |
| 403 // synthesized unpacked 4na | |
| 404 INSDC:4na:bin out_4na_bin | |
| 405 = < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( out_x2na_bin ); | |
| 406 | |
| 407 // synthesized dna text | |
| 408 INSDC:dna:text out_dna_text | |
| 409 = < INSDC:x2na:bin, INSDC:dna:text > map < INSDC:x2na:map:BINSET, INSDC:x2na:map:CHARSET > ( out_x2na_bin ); | |
| 410 | |
| 411 // synthesized dna sequence | |
| 412 INSDC:x2na:bin out_x2na_bin | |
| 413 = NCBI:dna_from_color ( out_x2cs_bin, out_read_start, out_read_len, out_cs_key, out_color_matrix ); | |
| 414 | |
| 415 // synthesized packed 2na | |
| 416 INSDC:2na:packed out_2na_packed | |
| 417 = ( INSDC:2na:packed ) pack ( out_2na_bin ); | |
| 418 | |
| 419 // synthesized packed 4na | |
| 420 INSDC:4na:packed out_4na_packed | |
| 421 = ( INSDC:4na:packed ) pack ( out_4na_bin ); | |
| 422 | |
| 423 // synthesized color text | |
| 424 INSDC:color:text out_color_text | |
| 425 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_x2cs_bin ); | |
| 426 | |
| 427 // spot_len and fixed_spot_len | |
| 428 INSDC:coord:len color_space_spot_len | |
| 429 = ( INSDC:coord:len ) row_len ( out_2cs_packed ); | |
| 430 INSDC:coord:len color_space_fixed_spot_len | |
| 431 = ( INSDC:coord:len ) fixed_row_len ( out_2cs_packed ); | |
| 432 | |
| 433 /* INSDC:tbl:sequence inherited productions | |
| 434 * in_cs_key | |
| 435 * out_cs_key | |
| 436 * out_signal | |
| 437 * out_x2cs_bin | |
| 438 * in_color_text | |
| 439 * out_2cs_packed | |
| 440 * out_color_matrix | |
| 441 */ | |
| 442 | |
| 443 /* INSDC:SRA:tbl:stats inherited productions | |
| 444 * in_stats_bin | |
| 445 */ | |
| 446 | |
| 447 /* NCBI:tbl:dcmp_color_space inherited productions | |
| 448 * out_dcmp_2cs_bin | |
| 449 * out_dcmp_x2cs_bin | |
| 450 * out_dcmp_2cs_packed | |
| 451 */ | |
| 452 }; | |
| 453 | |
| 454 /* color_space_nocol | |
| 455 * this table describes viewing rules | |
| 456 * but omits writing rules and physical column description | |
| 457 * in order to support older tables | |
| 458 * | |
| 459 * history: | |
| 460 * 1.0.1 - base explicitly upon color_space_common #1.0.1 | |
| 461 * 1.0.2 - color_space_common #1.0.2 | |
| 462 * 1.0.3 - color_space_common #1.0.3 | |
| 463 */ | |
| 464 table NCBI:tbl:color_space_nocol #1.0.3 | |
| 465 = NCBI:tbl:color_space_common #1.0.3 | |
| 466 , NCBI:tbl:n_encoding #1 | |
| 467 { | |
| 468 // incoming is disabled | |
| 469 | |
| 470 // v1 color matrix was stored in metadata | |
| 471 U8 out_color_matrix | |
| 472 = < U8 > meta:read < "COLOR_MATRIX" > () | |
| 473 | < U8 > echo < INSDC:color:default_matrix > (); | |
| 474 | |
| 475 // unpacked 2cs with ambiguities | |
| 476 INSDC:x2cs:bin out_x2cs_bin | |
| 477 = ( INSDC:x2cs:bin ) read_ndecode; | |
| 478 | |
| 479 // interface with n-encoded qualities | |
| 480 U8 read_unpack = out_2cs_bin; | |
| 481 | |
| 482 /* INSDC:tbl:sequence inherited productions | |
| 483 * out_cs_key | |
| 484 * out_signal | |
| 485 * out_2cs_packed | |
| 486 */ | |
| 487 | |
| 488 /* NCBI:tbl:n_encoding inherited productions | |
| 489 * read_ndecode | |
| 490 */ | |
| 491 }; | |
| 492 | |
| 493 /* color_space #1 | |
| 494 * this schema brings in .CSREAD and .CS_KEY columns for v1 tables | |
| 495 * | |
| 496 * history: | |
| 497 * 1.0.1 - base explicitly upon color_space_nocol #1.0.1 | |
| 498 * 1.0.2 - color_space_nocol #1.0.2 | |
| 499 * 1.0.3 - color_space_nocol #1.0.3 | |
| 500 */ | |
| 501 table NCBI:tbl:color_space #1.0.3 = NCBI:tbl:color_space_nocol #1.0.3 | |
| 502 { | |
| 503 // stored as text | |
| 504 INSDC:dna:text out_cs_key = .CS_KEY; | |
| 505 | |
| 506 // stored color sequence | |
| 507 INSDC:2cs:packed out_2cs_packed = .CSREAD; | |
| 508 | |
| 509 /* INSDC:tbl:sequence inherited productions | |
| 510 * out_signal | |
| 511 */ | |
| 512 | |
| 513 /* NCBI:tbl:n_encoding inherited productions | |
| 514 * read_ndecode | |
| 515 */ | |
| 516 }; | |
| 517 | |
| 518 /* color_space #2 | |
| 519 * standard current color-space table | |
| 520 * | |
| 521 * history: | |
| 522 * 2.0.1 - base explicitly upon color_space_common #1.0.1 | |
| 523 * 2.0.2 - base explicitly upon color_space_common #1.0.2 | |
| 524 * 2.1.0 - introduce hooks for compressed color space | |
| 525 */ | |
| 526 table NCBI:tbl:color_space #2.1 | |
| 527 = NCBI:tbl:color_space_common #1.0.3 | |
| 528 , NCBI:tbl:dcmp_color_space #1.0.0 | |
| 529 { | |
| 530 /* input rules | |
| 531 */ | |
| 532 | |
| 533 // input text is not modified | |
| 534 // illegal values are not detected here | |
| 535 INSDC:color:text in_color_text = CSREAD; | |
| 536 | |
| 537 // input x2cs bin | |
| 538 // illegal values will be caught here | |
| 539 INSDC:x2cs:bin in_x2cs_bin | |
| 540 = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CSREAD ) | |
| 541 | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_color_text ); | |
| 542 | |
| 543 // input 2cs bin | |
| 544 INSDC:2cs:bin in_2cs_bin | |
| 545 = < INSDC:2cs:bin > range_validate < 0, 3 > ( CSREAD ) | |
| 546 | ( INSDC:2cs:bin ) unpack ( in_2cs_packed ) | |
| 547 | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_x2cs_bin ); | |
| 548 | |
| 549 // input 2cs packed | |
| 550 INSDC:2cs:packed in_2cs_packed = CSREAD; | |
| 551 | |
| 552 // input x2cs alt-csread ( ambiguity ) | |
| 553 INSDC:x2cs:bin in_alt_x2cs_bin | |
| 554 = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_x2cs_bin ); | |
| 555 | |
| 556 // color-space keys ARE modified on input | |
| 557 INSDC:dna:text in_cs_key | |
| 558 = < INSDC:dna:text, INSDC:dna:text > map < 'acgt', 'ACGT' > ( CS_KEY ); | |
| 559 | |
| 560 // color matrix | |
| 561 U8 in_color_matrix = < U8 > range_validate < 0, 4 > ( COLOR_MATRIX ); | |
| 562 | |
| 563 // preparing a feed into stats column | |
| 564 U8 in_stats_bin = in_2cs_bin; | |
| 565 | |
| 566 | |
| 567 /* physical columns | |
| 568 */ | |
| 569 | |
| 570 physical column INSDC:2cs:packed .CSREAD | |
| 571 = in_2cs_packed | |
| 572 | ( INSDC:2cs:packed ) pack ( in_2cs_bin ); | |
| 573 | |
| 574 physical column < INSDC:x2cs:bin > zip_encoding .ALTCSREAD | |
| 575 = < INSDC:x2cs:bin > trim < 0, 0 > ( in_alt_x2cs_bin ); | |
| 576 | |
| 577 physical column < INSDC:dna:text > zip_encoding .CS_KEY = in_cs_key; | |
| 578 | |
| 579 physical column < U8 > zip_encoding .COLOR_MATRIX = in_color_matrix; | |
| 580 | |
| 581 | |
| 582 /* output rules | |
| 583 */ | |
| 584 | |
| 585 // output 2cs packed | |
| 586 INSDC:2cs:packed out_2cs_packed | |
| 587 = .CSREAD | |
| 588 | out_dcmp_2cs_packed; | |
| 589 | |
| 590 // unpacked 2cs with ambiguity | |
| 591 INSDC:x2cs:bin out_x2cs_bin | |
| 592 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_2cs_bin, .ALTCSREAD ) | |
| 593 | out_dcmp_x2cs_bin | |
| 594 | ( INSDC:x2cs:bin ) out_2cs_bin; | |
| 595 | |
| 596 // read directly from physical column | |
| 597 INSDC:dna:text out_cs_key = .CS_KEY; | |
| 598 | |
| 599 // color matrix may be synthesized | |
| 600 U8 out_color_matrix | |
| 601 = .COLOR_MATRIX | |
| 602 | < U8 > echo < INSDC:color:default_matrix > (); | |
| 603 | |
| 604 | |
| 605 /* INSDC:tbl:sequence inherited productions | |
| 606 * out_signal | |
| 607 */ | |
| 608 | |
| 609 /* NCBI:tbl:dcmp_color_space inherited productions | |
| 610 * out_dcmp_2cs_bin | |
| 611 * out_dcmp_x2cs_bin | |
| 612 * out_dcmp_2cs_packed | |
| 613 */ | |
| 614 }; | |
| 615 | |
| 616 | |
| 617 /*-------------------------------------------------------------------------- | |
| 618 * protein | |
| 619 */ | |
| 620 table NCBI:tbl:protein #1 = INSDC:tbl:protein | |
| 621 { | |
| 622 /* upper-case letters */ | |
| 623 INSDC:protein:text in_protein_text = < INSDC:protein:text, INSDC:protein:text > | |
| 624 map < 'abcdefghijklmnopqrstvwxyzu','ABCDEFGHIJKLMNOPQRSTVWXYZU' > ( PROTEIN ); | |
| 625 | |
| 626 /* std aa */ | |
| 627 INSDC:aa:bin in_aa_bin | |
| 628 = < INSDC:aa:bin > range_validate < 1, 27 > ( PROTEIN ) | |
| 629 | < INSDC:protein:text, INSDC:aa:bin > map < INSDC:aa:map:CHARSET, INSDC:aa:map:BINSET > ( in_protein_text ); | |
| 630 | |
| 631 /* physical column */ | |
| 632 physical column < INSDC:aa:bin > zip_encoding .PROTEIN = in_aa_bin; | |
| 633 | |
| 634 /* output rules */ | |
| 635 INSDC:aa:bin out_aa_bin = .PROTEIN; | |
| 636 INSDC:protein:text out_protein_text = < INSDC:aa:bin, INSDC:protein:text > | |
| 637 map < INSDC:aa:map:BINSET, INSDC:aa:map:CHARSET > ( out_aa_bin ); | |
| 638 }; | |
| 639 | |
| 640 | |
| 641 /*-------------------------------------------------------------------------- | |
| 642 * phred | |
| 643 * standard phred quality representation | |
| 644 * limits values on input to 1..63 | |
| 645 * reserves value 0 as ambiguity symbol for reads | |
| 646 */ | |
| 647 | |
| 648 | |
| 649 /* history: | |
| 650 * 1.0.1 - base explicitly upon sequence #1.0.1 | |
| 651 */ | |
| 652 table NCBI:tbl:phred_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1 | |
| 653 { | |
| 654 /* [CS]READ - decoding | |
| 655 */ | |
| 656 U8 read_ndecode | |
| 657 = < INSDC:quality:phred, U8 > map < 0, 4 > ( out_qual_phred, read_unpack ); | |
| 658 | |
| 659 /* INSDC:tbl:sequence inherited productions | |
| 660 * out_qual_phred | |
| 661 * out_qual_text_phred_33 | |
| 662 * out_qual_text_phred_64 | |
| 663 */ | |
| 664 | |
| 665 /* NCBI:tbl:n_encoding inherited productions | |
| 666 * read_unpack | |
| 667 */ | |
| 668 }; | |
| 669 | |
| 670 /* history: | |
| 671 * 1.0.1 - base explicitly upon phred_quality_nocol #1.0.1 | |
| 672 */ | |
| 673 table NCBI:tbl:phred_quality #1.0.1 = NCBI:tbl:phred_quality_nocol #1.0.1 | |
| 674 { | |
| 675 // read directly as n-encoded phred is compatible with phred | |
| 676 NCBI:quality:n_encoded:phred out_qual_phred = .QUALITY; | |
| 677 | |
| 678 /* INSDC:tbl:sequence inherited productions | |
| 679 * out_qual_text_phred_33 | |
| 680 * out_qual_text_phred_64 | |
| 681 */ | |
| 682 | |
| 683 /* NCBI:tbl:n_encoding inherited productions | |
| 684 * read_unpack | |
| 685 */ | |
| 686 }; | |
| 687 | |
| 688 /* history: | |
| 689 * 2.0.1 - added feed of in_stats_qual | |
| 690 * 2.0.2 - added input of text encodings | |
| 691 * 2.0.3 - base explicitly upon sequence #1.0.1 | |
| 692 * 2.0.4 - change compression from izip to zip | |
| 693 * 2.0.5 - (planned, not yet in effect) change from zip to delta_average_zip | |
| 694 */ | |
| 695 table NCBI:tbl:phred_quality #2.0.4 = INSDC:tbl:sequence #1.0.1 | |
| 696 { | |
| 697 // read directly quality as phred | |
| 698 INSDC:quality:phred out_qual_phred = .QUALITY; | |
| 699 | |
| 700 // input rules | |
| 701 INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY; | |
| 702 INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY; | |
| 703 | |
| 704 INSDC:quality:phred in_qual_phred | |
| 705 = QUALITY | |
| 706 | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 ) | |
| 707 | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 ); | |
| 708 | |
| 709 // physical storage | |
| 710 /*** next line is for future change in production, but we have to wait until supporting code is released to the public ***/ | |
| 711 // physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY = in_qual_phred; | |
| 712 /*** NB *** MUST change table version to 2.0.5 and propagate to all derived tables ***/ | |
| 713 physical column < INSDC:quality:phred > zip_encoding .QUALITY = in_qual_phred; | |
| 714 | |
| 715 // feed to compressed statistics | |
| 716 INSDC:quality:phred in_stats_qual = in_qual_phred; | |
| 717 | |
| 718 /* INSDC:tbl:sequence inherited productions | |
| 719 * out_qual_text_phred_33 | |
| 720 * out_qual_text_phred_64 | |
| 721 */ | |
| 722 }; | |
| 723 | |
| 724 | |
| 725 | |
| 726 /*-------------------------------------------------------------------------- | |
| 727 * log_odds | |
| 728 * log-odds quality score support | |
| 729 * | |
| 730 * conversion from log-odds to phred is via formula | |
| 731 * 10 * log ( 1 + pow ( 10, x / 10 ) ) / log ( 10 ) + 0.499 | |
| 732 * for x = -5..40 : when x = -6, phred = 0 | |
| 733 */ | |
| 734 | |
| 735 // the map function requires two lookup tables: | |
| 736 // the first table detects every legal value... | |
| 737 const INSDC:quality:log_odds NCBI:quality:from:log_odds = | |
| 738 [ | |
| 739 -6,-5,-4,-3,-2,-1, 0, | |
| 740 1, 2, 3, 4, 5, 6, 7, 8, 9,10, | |
| 741 11,12,13,14,15,16,17,18,19,20, | |
| 742 21,22,23,24,25,26,27,28,29,30, | |
| 743 31,32,33,34,35,36,37,38,39,40 | |
| 744 ]; | |
| 745 | |
| 746 // ...the second table gives positional translations | |
| 747 const INSDC:quality:phred NCBI:quality:to:phred = | |
| 748 [ | |
| 749 0, 1, 1, 2, 2, 3, 3, | |
| 750 4, 4, 5, 5, 6, 7, 8, 9,10,10, | |
| 751 11,12,13,14,15,16,17,18,19,20, | |
| 752 21,22,23,24,25,26,27,28,29,30, | |
| 753 31,32,33,34,35,36,37,38,39,40 | |
| 754 ]; | |
| 755 | |
| 756 function | |
| 757 INSDC:quality:phred NCBI:log_odds_to_phred #1 ( INSDC:quality:log_odds qual_log_odds ) | |
| 758 { | |
| 759 // this range enforcement may not be required | |
| 760 INSDC:quality:log_odds log_odds_clip | |
| 761 = < INSDC:quality:log_odds > clip < -6, 40 > ( qual_log_odds ); | |
| 762 | |
| 763 // use the tables above to map from log-odds to phred | |
| 764 return < INSDC:quality:log_odds, INSDC:quality:phred > | |
| 765 map < NCBI:quality:from:log_odds, NCBI:quality:to:phred > ( log_odds_clip ); | |
| 766 } | |
| 767 | |
| 768 /* history: | |
| 769 * 1.0.1 - base explicitly upon sequence #1.0.1 | |
| 770 */ | |
| 771 table NCBI:tbl:log_odds_quality_nocol #1.0.1 = INSDC:tbl:sequence #1.0.1, NCBI:tbl:n_encoding #1 | |
| 772 { | |
| 773 /* READ - decoding | |
| 774 */ | |
| 775 U8 read_ndecode | |
| 776 = < INSDC:quality:log_odds, U8 > map < -6, 4 > ( out_qual_log_odds, read_unpack ); | |
| 777 | |
| 778 /* QUALITY | |
| 779 * declared in INSDC:tbl:sequence as phred | |
| 780 * introduce here as log-odds | |
| 781 */ | |
| 782 extern column INSDC:quality:log_odds QUALITY = out_qual_log_odds; | |
| 783 | |
| 784 // resolve for phred | |
| 785 INSDC:quality:phred out_qual_phred | |
| 786 = out_qual2_phred | |
| 787 | NCBI:log_odds_to_phred ( out_qual_log_odds ); | |
| 788 | |
| 789 /* INSDC:tbl:sequence inherited productions | |
| 790 * out_qual_text_phred_33 | |
| 791 * out_qual_text_phred_64 | |
| 792 */ | |
| 793 | |
| 794 /* NCBI:tbl:n_encoding inherited productions | |
| 795 * read_unpack | |
| 796 */ | |
| 797 | |
| 798 /* NCBI:tbl:log_odds_quality_nocol productions | |
| 799 * out_qual2_phred | |
| 800 * out_qual_log_odds | |
| 801 */ | |
| 802 }; | |
| 803 | |
| 804 /* history: | |
| 805 * 1.0.1 - base explicitly upon log_odds_quality_nocol #1.0.1 | |
| 806 */ | |
| 807 table NCBI:tbl:log_odds_quality #1.0.1 = NCBI:tbl:log_odds_quality_nocol #1.0.1 | |
| 808 { | |
| 809 // read directly as n-encoded log_odds is compatible with log_odds | |
| 810 NCBI:quality:n_encoded:log_odds out_qual_log_odds = .QUALITY; | |
| 811 | |
| 812 /* INSDC:tbl:sequence inherited productions | |
| 813 * out_qual_text_phred_33 | |
| 814 * out_qual_text_phred_64 | |
| 815 */ | |
| 816 | |
| 817 /* NCBI:tbl:n_encoding inherited productions | |
| 818 * read_unpack | |
| 819 */ | |
| 820 | |
| 821 /* NCBI:tbl:log_odds_quality_nocol inherited productions | |
| 822 * out_qual2_phred | |
| 823 */ | |
| 824 }; | |
| 825 | |
| 826 /* history: | |
| 827 * 2.0.1 - base explicitly upon sequence #1.0.1 | |
| 828 * 2.1.0 - added production of in_qual_phred | |
| 829 */ | |
| 830 table NCBI:tbl:log_odds_quality_nocol #2.1.0 = INSDC:tbl:sequence #1.0.1 | |
| 831 { | |
| 832 /* QUALITY | |
| 833 * declared in INSDC:tbl:sequence as phred | |
| 834 * introduce here as log-odds | |
| 835 */ | |
| 836 extern column INSDC:quality:log_odds QUALITY | |
| 837 = out_qual_log_odds; | |
| 838 | |
| 839 // resolve for phred | |
| 840 INSDC:quality:phred in_qual_phred | |
| 841 = NCBI:log_odds_to_phred ( in_qual_log_odds ); | |
| 842 | |
| 843 INSDC:quality:phred out_qual_phred | |
| 844 = NCBI:log_odds_to_phred ( out_qual_log_odds ); | |
| 845 | |
| 846 | |
| 847 /* INSDC:tbl:sequence inherited productions | |
| 848 * out_qual_text_phred_33 | |
| 849 * out_qual_text_phred_64 | |
| 850 */ | |
| 851 | |
| 852 /* NCBI:tbl:log_odds_quality_nocol productions | |
| 853 * out_qual_log_odds | |
| 854 */ | |
| 855 }; | |
| 856 | |
| 857 /* history: | |
| 858 * 2.0.1 - added feed of in_stats_qual | |
| 859 * 2.0.2 - added input of text encodings | |
| 860 * 2.0.3 - base explicitly upon log_odds_quality_nocol #2.0.1 | |
| 861 * 2.0.4 - changed compression from izip to zip | |
| 862 * 2.1.0 - base explicitly upon log_odds_quality_nocol #2.1.0 | |
| 863 */ | |
| 864 table NCBI:tbl:log_odds_quality #2.1.0 = NCBI:tbl:log_odds_quality_nocol #2.1.0 | |
| 865 { | |
| 866 INSDC:quality:log_odds out_qual_log_odds= .QUALITY; | |
| 867 | |
| 868 extern column INSDC:quality:text:log_odds_64 QUALITY | |
| 869 = out_qual_text_log_odds_64 | |
| 870 | ( INSDC:quality:text:log_odds_64 ) < B8 > sum < 64 > ( out_qual_log_odds ); | |
| 871 | |
| 872 // input rules | |
| 873 INSDC:quality:text:log_odds_64 in_qual_text_log_odds_64 = QUALITY; | |
| 874 | |
| 875 INSDC:quality:log_odds in_qual_log_odds | |
| 876 = QUALITY | |
| 877 | ( INSDC:quality:log_odds ) < B8 > diff < 64 > ( in_qual_text_log_odds_64 ); | |
| 878 | |
| 879 physical column < INSDC:quality:log_odds > zip_encoding .QUALITY | |
| 880 = in_qual_log_odds; | |
| 881 | |
| 882 // feed to compressed statistics | |
| 883 INSDC:quality:log_odds in_stats_qual = in_qual_log_odds; | |
| 884 | |
| 885 | |
| 886 /* INSDC:tbl:sequence inherited productions | |
| 887 * out_qual_text_phred_33 | |
| 888 * out_qual_text_phred_64 | |
| 889 */ | |
| 890 | |
| 891 /* NCBI:tbl:log_odds_quality productions | |
| 892 * out_qual_text_log_odds_64 | |
| 893 */ | |
| 894 }; |
