Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/illumina.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * NCBI Illumina Sequence Read Archive schema | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'ncbi/sra.vschema'; | |
| 33 include 'ncbi/spotname.vschema'; | |
| 34 | |
| 35 | |
| 36 /*-------------------------------------------------------------------------- | |
| 37 * types: NCBI:qual4 is a vector of 4 INSDC:quality:log_odds; rotated_qual4 and swapped_qual4 are typedefs of NCBI:qual4 | |
| 38 */ | |
| 39 | |
| 40 typedef INSDC:quality:log_odds NCBI:qual4 [ 4 ]; | |
| 41 typedef NCBI:qual4 NCBI:SRA:rotated_qual4, NCBI:SRA:swapped_qual4; | |
| 42 | |
| 43 | |
| 44 /*-------------------------------------------------------------------------- | |
| 45 * functions | |
| 46 */ | |
| 47 | |
| 48 /* NCBI:SRA:Illumina:tokenize_spot_name | |
| 49 * scans the ascii spot name given on input | |
| 50 * tokenizes it into parts, returned as NCBI:SRA:spot_name_token | |
| 51 */ | |
| 52 extern function NCBI:SRA:spot_name_token | |
| 53 NCBI:SRA:Illumina:tokenize_spot_name #1 ( ascii name ); | |
| 54 | |
| 55 | |
| 56 /*-------------------------------------------------------------------------- | |
| 57 * NCBI:SRA:Illumina:qual4 | |
| 58 * 4-channel log-odds-ish quality | |
| 59 */ | |
| 60 | |
| 61 /* history: | |
| 62 * 1.0.1 - base explicitly upon updated ancestry | |
| 63 */ | |
| 64 table NCBI:SRA:Illumina:qual4_nocol #1.0.1 | |
| 65 = INSDC:tbl:sequence #1.0.1 | |
| 66 , NCBI:tbl:log_odds_quality_nocol #1.0.1 | |
| 67 { | |
| 68 /* QUALITY | |
| 69 * 4-channel quality column, readonly, served from out_qual4 | |
| 70 */ | |
| 71 readonly column NCBI:qual4 QUALITY = out_qual4; | |
| 72 // out_qual4 comes from the swapped form via NCBI:SRA:swap, or from the rotated form via NCBI:SRA:rotate | |
| 73 NCBI:qual4 out_qual4 | |
| 74 = < NCBI:qual4 > NCBI:SRA:swap ( out_qual4_swapped, read_unpack ) | |
| 75 | < NCBI:qual4 > NCBI:SRA:rotate < false > ( out_qual4_rotated, read_unpack ); | |
| 76 | |
| 77 | |
| 78 /* single-channel output | |
| 79 * convert 4-channel log-odds to single channel | |
| 80 * must retain n-encoding, which was intended to be the 4-channel pattern | |
| 81 * ( -5, -5, -5, -5 ) and a base of 'A' | |
| 82 */ | |
| 83 | |
| 84 // first, extract quality for called base: element 0 of the swapped or rotated form | |
| 85 INSDC:quality:log_odds out_qual1_ch0 | |
| 86 = < INSDC:quality:log_odds> cut < 0 > ( out_qual4_swapped ) | |
| 87 | < INSDC:quality:log_odds> cut < 0 > ( out_qual4_rotated ); | |
| 88 | |
| 89 // clip it to the range [ -5, 127 ] | |
| 90 INSDC:quality:log_odds out_qual1_clip | |
| 91 = < INSDC:quality:log_odds > clip < -5, 127 > ( out_qual1_ch0 ); | |
| 92 | |
| 93 // reinterpret the 4 channels as a single 32-bit value so a whole pattern can be matched at once | |
| 94 U32 out_qual4_32 | |
| 95 = redimension ( out_qual4_swapped ) | |
| 96 | redimension ( out_qual4_rotated ); | |
| 97 | |
| 98 // detect ( -5, -5, -5, -5 ), matched as the U32 pattern 0xFBFBFBFB, and introduce a -6 value into log-odds | |
| 99 // this is treated as an 'N', but still not ready; all other positions keep out_qual1_clip | |
| 100 INSDC:quality:log_odds out_qual1_fives | |
| 101 = < U32, INSDC:quality:log_odds > map < 0xFBFBFBFB, -6 > ( out_qual4_32, out_qual1_clip ); | |
| 102 | |
| 103 // now slam zeros into anything that doesn't correspond to an A ( read_unpack values 1, 2, 3 ) | |
| 104 // essentially this leaves all of the A qualities. any having -6 are really N. | |
| 105 INSDC:quality:log_odds out_qual1_n | |
| 106 = < U8, INSDC:quality:log_odds > map < [ 1, 2, 3 ], [ 0, 0, 0 ] > ( read_unpack, out_qual1_fives ); | |
| 107 | |
| 108 // finally, produce log-odds with n-encoded as -6: keyed on out_qual1_n, with out_qual1_clip as the source for the rest | |
| 109 INSDC:quality:log_odds out_qual_log_odds | |
| 110 = < INSDC:quality:log_odds, INSDC:quality:log_odds > map < -6, -6 > ( out_qual1_n, out_qual1_clip ); | |
| 111 | |
| 112 | |
| 113 /* NCBI:tbl:n_encoding inherited productions | |
| 114 * read_unpack | |
| 115 */ | |
| 116 | |
| 117 /* NCBI:SRA:Illumina:qual4_nocol productions used above but not defined in this table | |
| 118 * out_qual4_rotated | |
| 119 * out_qual4_swapped | |
| 120 */ | |
| 121 }; | |
| 122 | |
| 123 | |
| 124 /* 4-channel log-odds compression: byte code type plus the extern functions that convert to and from it | |
| 125 */ | |
| 126 | |
| 127 // encoded type - a single byte code for 4-channel pattern | |
| 128 typedef B8 NCBI:SRA:encoded_qual4; | |
| 129 | |
| 130 // decoding function: takes an encoded_qual4 code, returns the swapped_qual4 pattern | |
| 131 extern function | |
| 132 NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_decode #1 ( NCBI:SRA:encoded_qual4 in ); | |
| 133 | |
| 134 // encoding function: takes a swapped_qual4 pattern, returns its encoded_qual4 code | |
| 135 extern function | |
| 136 NCBI:SRA:encoded_qual4 NCBI:SRA:qual4_encode #1 ( NCBI:SRA:swapped_qual4 in ); | |
| 137 | |
| 138 // compression rules for physical storage of NCBI:SRA:swapped_qual4 | |
| 139 physical NCBI:SRA:swapped_qual4 NCBI:SRA:qual4_encoding #1 | |
| 140 { | |
| 141 encode | |
| 142 { | |
| 143 // produce codes: one encoded_qual4 byte per 4-channel pattern | |
| 144 NCBI:SRA:encoded_qual4 encoded = NCBI:SRA:qual4_encode ( @ ); | |
| 145 | |
| 146 // compress the codes with zip < Z_RLE, Z_BEST_SPEED > | |
| 147 return zip < Z_RLE, Z_BEST_SPEED > ( encoded ); | |
| 148 } | |
| 149 | |
| 150 decode | |
| 151 { | |
| 152 // unzip the stored data back into codes | |
| 153 NCBI:SRA:encoded_qual4 unzipped = unzip ( @ ); | |
| 154 | |
| 155 // expand each code back to a swapped 4-channel pattern | |
| 156 return NCBI:SRA:qual4_decode ( unzipped ); | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 /* history: | |
| 161 * 1.0.1 - base upon updated qual4_nocol | |
| 162 */ | |
| 163 table NCBI:SRA:Illumina:qual4 #1.0.1 = NCBI:SRA:Illumina:qual4_nocol #1.0.1 | |
| 164 { | |
| 165 // supplies out_qual4_swapped: read directly from physical .QUALITY as swapped, n-encoded log_odds | |
| 166 NCBI:SRA:swapped_qual4 out_qual4_swapped = .QUALITY; | |
| 167 | |
| 168 /* NCBI:tbl:n_encoding inherited virtual productions | |
| 169 * read_unpack | |
| 170 */ | |
| 171 }; | |
| 172 | |
| 173 /* history: | |
| 174 * 2.0.2 - base upon updated ancestry | |
| 175 * 2.0.3 - base upon updated ancestry | |
| 176 * 2.0.4 - base upon updated ancestry | |
| 177 * 2.1.0 - base upon updated ancestry, added in_qual_log_odds | |
| 178 */ | |
| 179 table NCBI:SRA:Illumina:qual4 #2.1.0 | |
| 180 = NCBI:tbl:base_space #2.0.3 | |
| 181 , NCBI:tbl:log_odds_quality_nocol #2.1.0 | |
| 182 { | |
| 183 /* QUALITY | |
| 184 * 4-channel log-odds, read from out_qual4 | |
| 185 */ | |
| 186 extern column NCBI:qual4 QUALITY = out_qual4; | |
| 187 // write side: swap incoming QUALITY, keyed on in_x2na_bin with in_2na_bin as the alternative | |
| 188 NCBI:SRA:swapped_qual4 in_qual4 | |
| 189 = ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_x2na_bin ) | |
| 190 | ( NCBI:SRA:swapped_qual4 ) < NCBI:qual4 > NCBI:SRA:swap ( QUALITY, in_2na_bin ); | |
| 191 // read side: apply NCBI:SRA:swap to physical .QUALITY with out_x2na_bin | |
| 192 NCBI:qual4 out_qual4 | |
| 193 = < NCBI:SRA:swapped_qual4 > NCBI:SRA:swap ( .QUALITY, out_x2na_bin ); | |
| 194 // physical storage holds the swapped form, through NCBI:SRA:qual4_encoding | |
| 195 physical column NCBI:SRA:qual4_encoding .QUALITY = in_qual4; | |
| 196 | |
| 197 // feed to compressed statistics | |
| 198 NCBI:qual4 in_stats_qual = in_qual4; | |
| 199 | |
| 200 // single channel: element 0 of the swapped 4-channel value, on both the write and read sides | |
| 201 INSDC:quality:log_odds in_qual_log_odds | |
| 202 = < INSDC:quality:log_odds > cut < 0 > ( in_qual4 ); | |
| 203 INSDC:quality:log_odds out_qual_log_odds | |
| 204 = < INSDC:quality:log_odds > cut < 0 > ( .QUALITY ); | |
| 205 }; | |
| 206 | |
| 207 | |
| 208 /*-------------------------------------------------------------------------- | |
| 209 * NCBI:SRA:Illumina | |
| 210 * Illumina SRA Platform | |
| 211 */ | |
| 212 | |
| 213 | |
| 214 /* NCBI:SRA:Illumina:common #1 | |
| 215 * basic table interface based upon Illumina's pipelines | |
| 216 * | |
| 217 * history: | |
| 218 * 1.0.1 - explicitly base upon sra #1.0.1 | |
| 219 * 1.0.2 - base explicitly upon sra #1.0.2 | |
| 220 * 1.0.3 - base explicitly upon sra #1.0.3 | |
| 221 */ | |
| 222 table NCBI:SRA:Illumina:common #1.0.3 = INSDC:SRA:tbl:sra #1.0.3 | |
| 223 { | |
| 224 // platform name is always 'ILLUMINA' | |
| 225 ascii platform_name | |
| 226 = < ascii > echo < "ILLUMINA" > (); | |
| 227 | |
| 228 /* TRIMMED SEQUENCE | |
| 229 * need to find the 0-based trim_start and trim_len: trim_start is bio_start, trim_len is diff ( spot_len, trim_left ) | |
| 230 */ | |
| 231 INSDC:coord:zero bio_start = NCBI:SRA:bio_start ( out_read_start, out_read_type ); | |
| 232 INSDC:coord:zero trim_start = bio_start; | |
| 233 U32 trim_left = ( U32 ) trim_start; | |
| 234 INSDC:coord:len trim_len = (INSDC:coord:len) < U32 > diff ( spot_len, trim_left ); | |
| 235 | |
| 236 /* COORDINATES | |
| 237 * in addition to X and Y, | |
| 238 * Illumina has LANE and TILE ( readonly, served from out_lane_coord and out_tile_coord ) | |
| 239 */ | |
| 240 readonly column INSDC:coord:val LANE = out_lane_coord; | |
| 241 readonly column INSDC:coord:val TILE = out_tile_coord; | |
| 242 }; | |
| 243 | |
| 244 | |
| 245 /*-------------------------------------------------------------------------- | |
| 246 * NCBI:SRA:Illumina:tbl:v2 #1 | |
| 247 * normalized v2 table | |
| 248 * still has variants based upon quality type | |
| 249 * | |
| 250 * history: | |
| 251 * 1.0.1 - explicitly base upon sra #1.0.1 and related tables | |
| 252 * 1.0.2 - updated ancestry | |
| 253 * 1.0.3 - updated ancestry | |
| 254 */ | |
| 255 // SIGNAL storage: NCBI:SRA:fsamp4:encode #2 < 14, 10 > on write, NCBI:SRA:fsamp4:decode #2 on read | |
| 256 physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:SIGNAL #2 | |
| 257 { | |
| 258 decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); } | |
| 259 encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } | |
| 260 } | |
| 261 // NOISE storage: redimension NCBI:fsamp4 to F32 and fzip < 10 > on write; funzip and redimension back on read | |
| 262 physical NCBI:fsamp4 NCBI:SRA:Illumina:encoding:NOISE #2 | |
| 263 { | |
| 264 decode | |
| 265 { | |
| 266 F32 dcmp = funzip ( @ ); | |
| 267 return redimension ( dcmp ); | |
| 268 } | |
| 269 encode | |
| 270 { | |
| 271 F32 ncmp = redimension ( @ ); | |
| 272 return fzip < 10 > ( ncmp ); | |
| 273 } | |
| 274 } | |
| 275 // INTENSITY storage: same rules as the SIGNAL encoding, fsamp4:encode #2 < 14, 10 > and fsamp4:decode #2 | |
| 276 physical NCBI:SRA:swapped_fsamp4 NCBI:SRA:Illumina:encoding:INTENSITY #2 | |
| 277 { | |
| 278 decode { return NCBI:SRA:fsamp4:decode #2 ( @ ); } | |
| 279 encode { return NCBI:SRA:fsamp4:encode #2 < 14, 10 > ( @ ); } | |
| 280 } | |
| 281 | |
| 282 // v2 base table: parent of the q4, q1 and phred v2 tables | |
| 283 table NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
| 284 = NCBI:SRA:tbl:sra #2.1.3 | |
| 285 , NCBI:tbl:base_space #2.0.3 | |
| 286 , NCBI:SRA:Illumina:common #1.0.3 | |
| 287 { | |
| 288 /* NAME tokenizing and coordinates | |
| 289 * most work happens in skeyname table | |
| 290 * we still obtain LANE and TILE from name, via the name_token:L and name_token:T tokens | |
| 291 */ | |
| 292 INSDC:coord:val out_lane_coord = ( INSDC:coord:val ) | |
| 293 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( _out_name, out_spot_name_tok ); | |
| 294 INSDC:coord:val out_tile_coord = ( INSDC:coord:val ) | |
| 295 NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( _out_name, out_spot_name_tok ); | |
| 296 NCBI:SRA:spot_name_token out_spot_name_tok | |
| 297 = NCBI:SRA:Illumina:tokenize_spot_name ( _out_name ); | |
| 298 // write side tokenizes the incoming NAME column | |
| 299 NCBI:SRA:spot_name_token in_spot_name_tok | |
| 300 = NCBI:SRA:Illumina:tokenize_spot_name ( NAME ); | |
| 301 | |
| 302 /* SIGNAL | |
| 303 * optional, no longer archived; validate is no_compare of in_signal and out_signal | |
| 304 */ | |
| 305 extern column NCBI:fsamp4 SIGNAL | |
| 306 { | |
| 307 read = out_signal; | |
| 308 validate = < NCBI:fsamp4 > no_compare #1 ( in_signal, out_signal ); | |
| 309 } | |
| 310 NCBI:fsamp4 in_signal = SIGNAL; | |
| 311 NCBI:fsamp4 out_signal | |
| 312 = < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .SIGNAL, out_x2na_bin ); | |
| 313 // stored as swapped_fsamp4: swap keyed on in_x2na_bin, with in_2na_bin as the alternative | |
| 314 physical column NCBI:SRA:Illumina:encoding:SIGNAL #2 .SIGNAL | |
| 315 = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_x2na_bin ) | |
| 316 | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_signal, in_2na_bin ); | |
| 317 | |
| 318 /* NOISE | |
| 319 * optional, no longer archived; written to and read from .NOISE without any swap | |
| 320 */ | |
| 321 extern column NCBI:fsamp4 NOISE | |
| 322 { | |
| 323 read = out_noise; | |
| 324 validate = < NCBI:fsamp4 > no_compare #1 ( in_noise, out_noise ); | |
| 325 } | |
| 326 NCBI:fsamp4 in_noise = NOISE; | |
| 327 NCBI:fsamp4 out_noise = .NOISE; | |
| 328 | |
| 329 physical column NCBI:SRA:Illumina:encoding:NOISE #2 .NOISE = in_noise; | |
| 330 | |
| 331 /* INTENSITY | |
| 332 * optional, no longer archived; normalized then swapped on write, swapped then denormalized on read | |
| 333 */ | |
| 334 extern column NCBI:fsamp4 INTENSITY | |
| 335 { | |
| 336 read = out_intensity; | |
| 337 validate = < NCBI:fsamp4 > no_compare #1 ( in_intensity, out_intensity ); | |
| 338 } | |
| 339 NCBI:fsamp4 in_intensity = INTENSITY; | |
| 340 NCBI:fsamp4 out_intensity | |
| 341 = < NCBI:fsamp4 > NCBI:SRA:denormalize ( out_norm_intensity, out_x2na_bin ); | |
| 342 NCBI:fsamp4 out_norm_intensity | |
| 343 = ( NCBI:fsamp4 ) < NCBI:SRA:swapped_fsamp4 > NCBI:SRA:swap ( .INTENSITY, out_x2na_bin ); | |
| 344 NCBI:fsamp4 in_norm_intensity | |
| 345 = < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_x2na_bin ) | |
| 346 | < NCBI:fsamp4 > NCBI:SRA:normalize ( in_intensity, in_2na_bin ); | |
| 347 physical column NCBI:SRA:Illumina:encoding:INTENSITY #2 .INTENSITY | |
| 348 = ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_x2na_bin ) | |
| 349 | ( NCBI:SRA:swapped_fsamp4 ) < NCBI:fsamp4 > NCBI:SRA:swap ( in_norm_intensity, in_2na_bin ); | |
| 350 | |
| 351 /* INSDC:tbl:sequence inherited virtual productions | |
| 352 * out_qual_phred | |
| 353 */ | |
| 354 | |
| 355 /* INSDC:SRA:tbl:spotdesc inherited productions | |
| 356 * static_fixed_spot_len | |
| 357 */ | |
| 358 }; | |
| 359 | |
| 360 /* 4-channel log-odds qualities: tbl:v2 combined with NCBI:SRA:Illumina:qual4 | |
| 361 * | |
| 362 * history: | |
| 363 * 1.0.2 - updated ancestry | |
| 364 * 1.0.3 - updated ancestry | |
| 365 * 1.0.4 - updated ancestry | |
| 366 * 1.1.0 - updated ancestry | |
| 367 */ | |
| 368 table NCBI:SRA:Illumina:tbl:q4:v2 #1.1.0 | |
| 369 = NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
| 370 , NCBI:SRA:Illumina:qual4 #2.1.0 | |
| 371 { | |
| 372 /* INSDC:SRA:tbl:spotdesc inherited virtual productions | |
| 373 * static_fixed_spot_len | |
| 374 */ | |
| 375 }; | |
| 376 | |
| 377 /* 1-channel log-odds qualities: tbl:v2 combined with NCBI:tbl:log_odds_quality | |
| 378 * | |
| 379 * history: | |
| 380 * 1.0.2 - updated ancestry | |
| 381 * 1.0.3 - updated ancestry | |
| 382 * 1.0.4 - updated ancestry | |
| 383 * 1.1.0 - updated ancestry ( the declaration below writes this version as #1.1 ) | |
| 384 */ | |
| 385 table NCBI:SRA:Illumina:tbl:q1:v2 #1.1 | |
| 386 = NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
| 387 , NCBI:tbl:log_odds_quality #2.1.0 | |
| 388 { | |
| 389 /* INSDC:SRA:tbl:spotdesc inherited productions | |
| 390 * static_fixed_spot_len | |
| 391 */ | |
| 392 }; | |
| 393 | |
| 394 /* phred qualities: tbl:v2 combined with NCBI:tbl:phred_quality | |
| 395 * | |
| 396 * history: | |
| 397 * 1.0.2 - updated ancestry | |
| 398 * 1.0.3 - updated ancestry | |
| 399 * 1.0.4 - updated ancestry | |
| 400 */ | |
| 401 table NCBI:SRA:Illumina:tbl:phred:v2 #1.0.4 | |
| 402 = NCBI:SRA:Illumina:tbl:v2 #1.0.4 | |
| 403 , NCBI:tbl:phred_quality #2.0.3 | |
| 404 { | |
| 405 /* INSDC:SRA:tbl:spotdesc inherited virtual productions | |
| 406 * static_fixed_spot_len | |
| 407 */ | |
| 408 }; | |
