Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/align/seq.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * Sequence schema | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'vdb/vdb.vschema'; | |
| 33 include 'ncbi/seq.vschema'; | |
| 34 | |
| 35 | |
| 36 /* cmp_base_space | |
| 37 * table representing compressed reads in base space, | |
| 38 * where the bases are only stored for unaligned reads; inherits from INSDC:tbl:sequence and NCBI:tbl:dcmp_base_space | |
| 39 */ | |
| 40 table NCBI:align:tbl:cmp_base_space #1 | |
| 41 = INSDC:tbl:sequence #1.0.1 | |
| 42 , NCBI:tbl:dcmp_base_space #1 | |
| 43 { | |
| 44 /* CMP_READ | |
| 45 * read compressed against a reference sequence, readable as text, 4na, x2na or 2na | |
| 46 */ | |
| 47 | |
| 48 // default is IUPAC character representation; validate compares input text with output text | |
| 49 extern default column INSDC:dna:text CMP_READ | |
| 50 { | |
| 51 read = out_cmp_dna_text; | |
| 52 validate = < INSDC:dna:text > compare ( in_cmp_dna_text, out_cmp_dna_text ); | |
| 53 } | |
| 54 | |
| 55 // 4na representation - unpacked and packed | |
| 56 extern column INSDC:4na:bin CMP_READ = out_cmp_4na_bin; | |
| 57 extern column INSDC:4na:packed CMP_READ = out_cmp_4na_packed; | |
| 58 | |
| 59 // x2na representation - 2na with ambiguity | |
| 60 extern column INSDC:x2na:bin CMP_READ = out_cmp_x2na_bin; | |
| 61 | |
| 62 // 2na representation - 2na with no ambiguity | |
| 63 extern column INSDC:2na:bin CMP_READ = out_cmp_2na_bin; | |
| 64 extern column INSDC:2na:packed CMP_READ = out_cmp_2na_packed; | |
| 65 | |
| 66 | |
| 67 /* input processing rules | |
| 68 */ | |
| 69 | |
| 70 // compressed input text - '.' becomes 'N' and lowercase codes become uppercase | |
| 71 INSDC:dna:text in_cmp_dna_text | |
| 72 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( CMP_READ ); | |
| 73 | |
| 74 // compressed input 4na bin - from CMP_READ directly, or alternatively derived from packed 4na, text or x2na input | |
| 75 INSDC:4na:bin in_cmp_4na_bin | |
| 76 = < INSDC:4na:bin > range_validate < 0, 15 > ( CMP_READ ) | |
| 77 | ( INSDC:4na:bin ) unpack ( in_cmp_4na_packed ) | |
| 78 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_cmp_dna_text ) | |
| 79 | < INSDC:x2na:bin, INSDC:4na:bin > map < INSDC:x2na:map:BINSET, [ 1, 2, 4, 8, 15 ] > ( in_cmp_x2na_bin ); | |
| 80 | |
| 81 // compressed input 4na packed | |
| 82 INSDC:4na:packed in_cmp_4na_packed = CMP_READ; | |
| 83 | |
| 84 // compressed input x2na bin - from CMP_READ directly, or alternatively mapped from 4na input | |
| 85 INSDC:x2na:bin in_cmp_x2na_bin | |
| 86 = < INSDC:x2na:bin > range_validate < 0, 4 > ( CMP_READ ) | |
| 87 | < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( in_cmp_4na_bin ); | |
| 88 | |
| 89 // compressed input 2na bin - from CMP_READ directly, or alternatively unpacked from packed 2na or converted from 4na input | |
| 90 INSDC:2na:bin in_cmp_2na_bin | |
| 91 = < INSDC:2na:bin > range_validate < 0, 3 > ( CMP_READ ) | |
| 92 | ( INSDC:2na:bin ) unpack ( in_cmp_2na_packed ) | |
| 93 | INSDC:SEQ:rand_4na_2na ( in_cmp_4na_bin ); | |
| 94 | |
| 95 // compressed input 2na packed | |
| 96 INSDC:2na:packed in_cmp_2na_packed = CMP_READ; | |
| 97 | |
| 98 // input 4na alt-read ( ambiguities ) - codes 1, 2, 4, 8 map to 0 and code 0 maps to 15, presuming BINSET lists 0..15 in order | |
| 99 INSDC:4na:bin in_cmp_alt_4na_bin | |
| 100 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_cmp_4na_bin ); | |
| 101 | |
| 102 // preparing a feed into stats column | |
| 103 U8 in_cmp_stats_bin = in_cmp_2na_bin; | |
| 104 | |
| 105 | |
| 106 /* physical columns - packed 2na reads plus a separate 4na alt-read column | |
| 107 */ | |
| 108 | |
| 109 physical column INSDC:2na:packed .CMP_READ | |
| 110 = in_cmp_2na_packed | |
| 111 | ( INSDC:2na:packed ) pack ( in_cmp_2na_bin ); | |
| 112 | |
| 113 physical column < INSDC:4na:bin > zip_encoding .CMP_ALTREAD | |
| 114 = < INSDC:4na:bin > trim < 0, 0 > ( in_cmp_alt_4na_bin ); | |
| 115 | |
| 116 | |
| 117 /* output processing rules | |
| 118 */ | |
| 119 | |
| 120 // output 2na packed | |
| 121 INSDC:2na:packed out_cmp_2na_packed = .CMP_READ; | |
| 122 | |
| 123 // unambiguous unpacked 2na | |
| 124 INSDC:2na:bin out_cmp_2na_bin | |
| 125 = ( INSDC:2na:bin ) unpack ( out_cmp_2na_packed ); | |
| 126 | |
| 127 // output x2na bin | |
| 128 INSDC:x2na:bin out_cmp_x2na_bin | |
| 129 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_cmp_4na_bin ); | |
| 130 | |
| 131 // output 2na->4na bin | |
| 132 INSDC:4na:bin out_cmp_2na_4na_bin | |
| 133 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_cmp_2na_bin ); | |
| 134 | |
| 135 // output 4na bin - 2na-derived 4na combined via bit_or with .CMP_ALTREAD, or alternatively the 2na-derived 4na alone | |
| 136 INSDC:4na:bin out_cmp_4na_bin | |
| 137 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_cmp_2na_4na_bin, .CMP_ALTREAD ) | |
| 138 | out_cmp_2na_4na_bin; | |
| 139 | |
| 140 // synthesized packed 4na | |
| 141 INSDC:4na:packed out_cmp_4na_packed | |
| 142 = ( INSDC:4na:packed ) pack ( out_cmp_4na_bin ); | |
| 143 | |
| 144 // output text | |
| 145 INSDC:dna:text out_cmp_dna_text | |
| 146 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_cmp_4na_bin ); | |
| 147 | |
| 148 | |
| 149 /* decompressed sequences | |
| 150 * source is out_dcmp_4na_bin - a virtual production | |
| 151 */ | |
| 152 | |
| 153 // synthesize x2na_bin, 2na_bin and 2na_packed; x2na value 4 is mapped to 0 for 2na | |
| 154 INSDC:x2na:bin out_dcmp_x2na_bin | |
| 155 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_dcmp_4na_bin ); | |
| 156 INSDC:2na:bin out_dcmp_2na_bin | |
| 157 = < INSDC:x2na:bin, INSDC:2na:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2na_bin ); | |
| 158 INSDC:2na:packed out_dcmp_2na_packed | |
| 159 = ( INSDC:2na:packed ) pack ( out_dcmp_2na_bin ); | |
| 160 | |
| 161 | |
| 162 /* INSDC:tbl:sequence inherited productions | |
| 163 * cs_native | |
| 164 * out_cs_key | |
| 165 * out_signal | |
| 166 * out_2cs_bin | |
| 167 * out_2na_bin | |
| 168 * out_4na_bin | |
| 169 * out_dna_text | |
| 170 * out_x2cs_bin | |
| 171 * out_x2na_bin | |
| 172 * out_2cs_packed | |
| 173 * out_2na_packed | |
| 174 * out_4na_packed | |
| 175 * out_color_text | |
| 176 * out_color_matrix | |
| 177 */ | |
| 178 | |
| 179 /* NCBI:tbl:dcmp_base_space inherited productions | |
| 180 * out_dcmp_4na_bin | |
| 181 */ | |
| 182 } | |
| 183 | |
| 184 | |
| 185 /* cmp_color_space | |
| 186 * table representing compressed reads in color space, | |
| 187 * where the colors are only stored for unaligned reads; inherits from INSDC:tbl:sequence and NCBI:tbl:dcmp_color_space | |
| 188 */ | |
| 189 table NCBI:align:tbl:cmp_color_space #1 = | |
| 190 INSDC:tbl:sequence #1.0.1, NCBI:tbl:dcmp_color_space #1 | |
| 191 { | |
| 192 /* CMP_CSREAD | |
| 193 * read compressed against a reference sequence, readable as text, x2cs or 2cs | |
| 194 */ | |
| 195 | |
| 196 // default is color-space text representation | |
| 197 extern default column INSDC:color:text CMP_CSREAD = out_cmp_color_text; | |
| 198 | |
| 199 // x2cs representation - 2cs with ambiguity | |
| 200 extern column INSDC:x2cs:bin CMP_CSREAD = out_cmp_x2cs_bin; | |
| 201 | |
| 202 // 2cs representation - 2cs with no ambiguity | |
| 203 extern column INSDC:2cs:bin CMP_CSREAD = out_cmp_2cs_bin; | |
| 204 extern column INSDC:2cs:packed CMP_CSREAD = out_cmp_2cs_packed; | |
| 205 | |
| 206 | |
| 207 /* input processing rules | |
| 208 */ | |
| 209 | |
| 210 // compressed input text | |
| 211 INSDC:color:text in_cmp_color_text = CMP_CSREAD; | |
| 212 | |
| 213 // compressed input x2cs bin - from CMP_CSREAD directly, or alternatively mapped from text input | |
| 214 INSDC:x2cs:bin in_cmp_x2cs_bin | |
| 215 = < INSDC:x2cs:bin > range_validate < 0, 4 > ( CMP_CSREAD ) | |
| 216 | < INSDC:color:text, INSDC:x2cs:bin > map < INSDC:x2cs:map:CHARSET, INSDC:x2cs:map:BINSET > ( in_cmp_color_text ); | |
| 217 | |
| 218 // compressed input 2cs bin - from CMP_CSREAD directly, or alternatively unpacked from packed 2cs or mapped from x2cs input | |
| 219 INSDC:2cs:bin in_cmp_2cs_bin | |
| 220 = < INSDC:2cs:bin > range_validate < 0, 3 > ( CMP_CSREAD ) | |
| 221 | ( INSDC:2cs:bin ) unpack ( in_cmp_2cs_packed ) | |
| 222 | < INSDC:x2cs:bin, INSDC:2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 1, 2, 3, 0 ] > ( in_cmp_x2cs_bin ); | |
| 223 | |
| 224 // compressed input 2cs packed | |
| 225 INSDC:2cs:packed in_cmp_2cs_packed = CMP_CSREAD; | |
| 226 | |
| 227 // compressed input x2cs alt-read ( ambiguities ) - values 0..3 map to 0 and 4 stays 4, presuming BINSET lists 0..4 in order | |
| 228 INSDC:x2cs:bin in_cmp_alt_x2cs_bin | |
| 229 = < INSDC:x2cs:bin, INSDC:x2cs:bin > map < INSDC:x2cs:map:BINSET, [ 0, 0, 0, 0, 4 ] > ( in_cmp_x2cs_bin ); | |
| 230 | |
| 231 // preparing a feed into stats column | |
| 232 U8 in_cmp_stats_bin = in_cmp_2cs_bin; | |
| 233 | |
| 234 | |
| 235 /* physical columns - packed 2cs reads plus a separate x2cs alt-read column | |
| 236 */ | |
| 237 | |
| 238 physical column INSDC:2cs:packed .CMP_CSREAD | |
| 239 = in_cmp_2cs_packed | |
| 240 | ( INSDC:2cs:packed ) pack ( in_cmp_2cs_bin ); | |
| 241 | |
| 242 physical column < INSDC:x2cs:bin > zip_encoding .CMP_ALTCSREAD | |
| 243 = < INSDC:x2cs:bin > trim < 0, 0 > ( in_cmp_alt_x2cs_bin ); | |
| 244 | |
| 245 | |
| 246 /* output processing rules | |
| 247 */ | |
| 248 | |
| 249 // compressed output 2cs packed | |
| 250 INSDC:2cs:packed out_cmp_2cs_packed = .CMP_CSREAD; | |
| 251 | |
| 252 // unambiguous unpacked 2cs | |
| 253 INSDC:2cs:bin out_cmp_2cs_bin | |
| 254 = ( INSDC:2cs:bin ) unpack ( out_cmp_2cs_packed ); | |
| 255 | |
| 256 // unpacked 2cs with ambiguity - 2cs combined via bit_or with .CMP_ALTCSREAD, or alternatively the 2cs alone | |
| 257 INSDC:x2cs:bin out_cmp_x2cs_bin | |
| 258 = ( INSDC:x2cs:bin ) < U8 > bit_or < ALIGN_RIGHT > ( out_cmp_2cs_bin, .CMP_ALTCSREAD ) | |
| 259 | ( INSDC:x2cs:bin ) out_cmp_2cs_bin; | |
| 260 | |
| 261 // output text | |
| 262 INSDC:color:text out_cmp_color_text | |
| 263 = < INSDC:x2cs:bin, INSDC:color:text > map < INSDC:x2cs:map:BINSET, INSDC:x2cs:map:CHARSET > ( out_cmp_x2cs_bin ); | |
| 264 | |
| 265 | |
| 266 /* decompressed sequences | |
| 267 * source is out_dcmp_x2cs_bin - a virtual production | |
| 268 */ | |
| 269 | |
| 270 // synthesize 2cs_bin and 2cs_packed; x2cs value 4 is mapped to 0 for 2cs | |
| 271 INSDC:2cs:bin out_dcmp_2cs_bin | |
| 272 = < INSDC:x2cs:bin, INSDC:2cs:bin > map < [ 0,1,2,3,4 ], [ 0,1,2,3,0 ] > ( out_dcmp_x2cs_bin ); | |
| 273 INSDC:2cs:packed out_dcmp_2cs_packed | |
| 274 = ( INSDC:2cs:packed ) pack ( out_dcmp_2cs_bin ); | |
| 275 | |
| 276 | |
| 277 /* INSDC:tbl:sequence inherited productions | |
| 278 * cs_native | |
| 279 * out_cs_key | |
| 280 * out_signal | |
| 281 * out_2cs_bin | |
| 282 * out_2na_bin | |
| 283 * out_4na_bin | |
| 284 * out_dna_text | |
| 285 * out_x2cs_bin | |
| 286 * out_x2na_bin | |
| 287 * out_2cs_packed | |
| 288 * out_2na_packed | |
| 289 * out_4na_packed | |
| 290 * out_color_text | |
| 291 * out_qual_phred | |
| 292 * out_color_matrix | |
| 293 * out_qual_text_phred_33 | |
| 294 * out_qual_text_phred_64 | |
| 295 */ | |
| 296 | |
| 297 /* NCBI:tbl:dcmp_color_space inherited productions | |
| 298 * out_dcmp_x2cs_bin | |
| 299 */ | |
| 300 } | |
