Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/spotname.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * NCBI Sequence Read Archive schema | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'vdb/vdb.vschema'; | |
| 33 include 'insdc/sra.vschema'; | |
| 34 | |
| 35 | |
| 36 /*-------------------------------------------------------------------------- | |
| 37 * types | |
| 38 */ | |
| 39 | |
| 40 /* spot_name_token | |
| 41 * a vector describing tokens recognized within a spot name ( declared below as an alias of text:token ) | |
| 42 * | |
| 43 * COMPONENTS: | |
| 44 * 0 - token id | |
| 45 * 1 - token starting coordinate | |
| 46 * 2 - token length | |
| 47 */ | |
| 48 alias text:token NCBI:SRA:spot_name_token; | |
| 49 | |
| 50 | |
| 51 /* token values | |
| 52 * | |
| 53 * tokens are produced by a schema-specific tokenizer function | |
| 54 * this function is purposely abstract because it may rely upon | |
| 55 * whatever information it needs to perform its task. the only | |
| 56 * requirement is that it produce these tokens as its output. | |
| 57 * | |
| 58 * an empty name input must produce no tokens. in this case, | |
| 59 * there is no name to tokenize or data to produce. | |
| 60 * | |
| 61 * a non-empty name must produce 1 or more tokens of output. | |
| 62 * all tokens must be ordered by starting character position. | |
| 63 * | |
| 64 * if a name does not conform to any pattern recognized by the | |
| 65 * tokenizer, then the tokenizer emits a single token of "unrecognized" | |
| 66 * | |
| 67 * if a name conforms to some pattern but does not have any | |
| 68 * substitution tokens, the tokenizer emits a single token of "recognized" | |
| 69 * | |
| 70 * if a name may be tokenized, then the resulting tokens should | |
| 71 * describe only the portions of the string that should be removed | |
| 72 * from the name, e.g. "X" or "Y". | |
| 73 * | |
| 74 * the standard coordinates "X", "Y", "T" and "L" are given in unsigned decimal. | |
| 75 * alternate representations are contained within their respective | |
| 76 * namespaces: "signed", "octal", "hex:upper" and "hex:lower". | |
| 77 * | |
| 78 * the special coordinate "Q" represents the 454-specific encoding | |
| 79 * of X and Y into base-36, where the formula for Q is: | |
| 80 * Q = 4096 * X + Y | |
| 81 * and ASCII encoding: | |
| 82 * 0..25 => "A-Z", 26..35 => "0-9" | |
| 83 */ | |
| 84 const U16 NCBI:SRA:name_token:unrecognized = 1; | |
| 85 const U16 NCBI:SRA:name_token:recognized = 2; | |
| 86 const U16 NCBI:SRA:name_token:Q = 3; | |
| 87 const U16 NCBI:SRA:name_token:X = 4; | |
| 88 const U16 NCBI:SRA:name_token:Y = 5; | |
| 89 const U16 NCBI:SRA:name_token:T = 6; | |
| 90 const U16 NCBI:SRA:name_token:L = 7; | |
| 91 const U16 NCBI:SRA:name_token:signed:X = 8; | |
| 92 const U16 NCBI:SRA:name_token:signed:Y = 9; | |
| 93 const U16 NCBI:SRA:name_token:signed:T = 10; | |
| 94 const U16 NCBI:SRA:name_token:signed:L = 11; | |
| 95 const U16 NCBI:SRA:name_token:octal:X = 12; | |
| 96 const U16 NCBI:SRA:name_token:octal:Y = 13; | |
| 97 const U16 NCBI:SRA:name_token:octal:T = 14; | |
| 98 const U16 NCBI:SRA:name_token:octal:L = 15; | |
| 99 const U16 NCBI:SRA:name_token:hex:upper:X = 16; | |
| 100 const U16 NCBI:SRA:name_token:hex:upper:Y = 17; | |
| 101 const U16 NCBI:SRA:name_token:hex:upper:T = 18; | |
| 102 const U16 NCBI:SRA:name_token:hex:upper:L = 19; | |
| 103 const U16 NCBI:SRA:name_token:hex:lower:X = 20; | |
| 104 const U16 NCBI:SRA:name_token:hex:lower:Y = 21; | |
| 105 const U16 NCBI:SRA:name_token:hex:lower:T = 22; | |
| 106 const U16 NCBI:SRA:name_token:hex:lower:L = 23; | |
| 107 | |
| 108 | |
| 109 /* token symbols | |
| 110 * when a name matches some pattern and tokens are recognized, | |
| 111 * the tokens are extracted from the name and sent to individual | |
| 112 * columns, and replaced with the symbols below to create a | |
| 113 * formatted name. NOTE(review): no symbol is declared below for the NCBI:SRA:name_token:signed:* ids - confirm whether that is intentional. | |
| 114 */ | |
| 115 const ascii NCBI:SRA:name_symbol:Q = '$Q'; | |
| 116 const ascii NCBI:SRA:name_symbol:X = '$X'; | |
| 117 const ascii NCBI:SRA:name_symbol:Y = '$Y'; | |
| 118 const ascii NCBI:SRA:name_symbol:T = '$T'; | |
| 119 const ascii NCBI:SRA:name_symbol:L = '$L'; | |
| 120 const ascii NCBI:SRA:name_symbol:octal:X = '$a'; | |
| 121 const ascii NCBI:SRA:name_symbol:octal:Y = '$b'; | |
| 122 const ascii NCBI:SRA:name_symbol:octal:T = '$c'; | |
| 123 const ascii NCBI:SRA:name_symbol:octal:L = '$d'; | |
| 124 const ascii NCBI:SRA:name_symbol:hex:upper:X = '$e'; | |
| 125 const ascii NCBI:SRA:name_symbol:hex:upper:Y = '$f'; | |
| 126 const ascii NCBI:SRA:name_symbol:hex:upper:T = '$g'; | |
| 127 const ascii NCBI:SRA:name_symbol:hex:upper:L = '$h'; | |
| 128 const ascii NCBI:SRA:name_symbol:hex:lower:X = '$x'; | |
| 129 const ascii NCBI:SRA:name_symbol:hex:lower:Y = '$y'; | |
| 130 const ascii NCBI:SRA:name_symbol:hex:lower:T = '$t'; | |
| 131 const ascii NCBI:SRA:name_symbol:hex:lower:L = '$l'; | |
| 132 | |
| 133 | |
| 134 /*-------------------------------------------------------------------------- | |
| 135 * functions | |
| 136 */ | |
| 137 | |
| 138 /* extract_spot_name | |
| 139 * generates input to .SPOT_NAME column ( used that way by NCBI:SRA:tbl:skeyname #3.0.1 in this file ) | |
| 140 * | |
| 141 * on NCBI:SRA:name_token:unrecognized, produces the entire spot name row | |
| 142 * otherwise, produces an empty row | |
| 143 * | |
| 144 * "name" [ DATA ] - raw spot names from NAME column | |
| 145 * | |
| 146 * "tok" [ DATA ] - delimiting tokens produced by sub-table | |
| 147 */ | |
| 148 function ascii | |
| 149 NCBI:SRA:extract_spot_name #1 ( ascii name, NCBI:SRA:spot_name_token tok ); | |
| 150 | |
| 151 | |
| 152 /* extract_name_fmt | |
| 153 * generates input to .NAME_FMT column and/or updates skey index | |
| 154 * | |
| 155 * on NCBI:SRA:name_token:unrecognized, produces an empty row | |
| 156 * otherwise, it creates a temporary "name_fmt" string from name row | |
| 157 * | |
| 158 * an attempt is made to insert name_fmt into indicated text index | |
| 159 * ( normally 'skey' ). if the insert succeeds, i.e. associates "name_fmt" | |
| 160 * with a row_id, then the output for the row is empty. | |
| 161 * | |
| 162 * if the insert fails due to key duplication, an attempt is made to | |
| 163 * extend the id range of associated rows. depending upon the type of index, | |
| 164 * this may succeed or fail, e.g. if the existing row range for "name_fmt" is | |
| 165 * n..m where m = row_id - 1, the range can be extended to n..row_id and | |
| 166 * the update succeeds. if the index supports discontiguous id ranges, the | |
| 167 * update will also succeed. upon any success updating the index, the output | |
| 168 * row will be empty. | |
| 169 * | |
| 170 * finally, if the temporary "name_fmt" cannot be inserted into the index nor the existing id range updated, the output for the row will be "name_fmt". | |
| 171 * | |
| 172 * "idx" [ CONST ] - name of the text index that receives name_fmt ( normally 'skey' ) | |
| 173 * "name" [ DATA ] - raw spot names from NAME column | |
| 174 * | |
| 175 * "tok" [ DATA ] - delimiting tokens produced by sub-table | |
| 176 */ | |
| 177 function ascii | |
| 178 NCBI:SRA:extract_name_fmt #1 < ascii idx > ( ascii name, NCBI:SRA:spot_name_token tok ); | |
| 179 | |
| 180 | |
| 181 /* extract_name_coord | |
| 182 * generates inputs to .X and .Y and possibly other columns | |
| 183 * | |
| 184 * if no tokens match the "coord" constant, produces an empty row | |
| 185 * otherwise, produces binary coordinate value | |
| 186 * if multiple tokens match criteria, all values must be equivalent | |
| 187 * because only a single value will be output per row | |
| 188 * | |
| 189 * "coord" [ CONST ] - a NCBI:SRA:name_token id; this file passes X, Y, T and L. | |
| 190 * NCBI:SRA:name_token:X and NCBI:SRA:name_token:Y also match the token | |
| 191 * NCBI:SRA:name_token:Q and extract contents appropriately. | |
| 192 * | |
| 193 * "name" [ DATA ] - raw spot names from NAME column | |
| 194 * | |
| 195 * "tok" [ DATA ] - delimiting tokens produced by sub-table | |
| 196 */ | |
| 197 function INSDC:coord:val | |
| 198 NCBI:SRA:extract_name_coord #1 < U16 coord > ( ascii name, NCBI:SRA:spot_name_token tok ); | |
| 199 | |
| 200 | |
| 201 /* lookup - produces INSDC:SRA:spot_ids_found from the constants "index_name", "query_by_name" and "name_fmt_version"; "name_prefix" follows '*' in the signature and callers in this file invoke lookup both with and without it | |
| 202 */ | |
| 203 function INSDC:SRA:spot_ids_found NCBI:SRA:lookup #1.0 | |
| 204 < ascii index_name, ascii query_by_name, U8 name_fmt_version > ( * ascii name_prefix ); | |
| 205 | |
| 206 | |
| 207 /*-------------------------------------------------------------------------- | |
| 208 * spotcoord | |
| 209 * spot coordinate table implementation, built on INSDC:SRA:tbl:spotcoord #1 | |
| 210 */ | |
| 211 table NCBI:SRA:tbl:spotcoord #1 = INSDC:SRA:tbl:spotcoord #1 | |
| 212 { | |
| 213 // X and Y stored as I32; the out_*_coord productions read the physical columns directly | |
| 214 INSDC:coord:val out_x_coord = .X; | |
| 215 INSDC:coord:val out_y_coord = .Y; | |
| 216 | |
| 217 // T and L are usually present but optional | |
| 218 INSDC:coord:val out_t_coord = .T; | |
| 219 INSDC:coord:val out_l_coord = .L; | |
| 220 | |
| 221 // .X, .Y, .T and .L get either empty coordinate or proper coordinate; each column lists in_*_coord first and in_name_*_coord as the alternative ( in_name_*_coord are defined by NCBI:SRA:tbl:skeyname #3.0.1; in_*_coord are not defined in this file ) | |
| 222 physical column < INSDC:coord:val > izip_encoding .X | |
| 223 = in_x_coord | |
| 224 | in_name_x_coord; | |
| 225 physical column < INSDC:coord:val > izip_encoding .Y | |
| 226 = in_y_coord | |
| 227 | in_name_y_coord; | |
| 228 physical column < INSDC:coord:val > izip_encoding .T | |
| 229 = in_t_coord | |
| 230 | in_name_t_coord; | |
| 231 physical column < INSDC:coord:val > izip_encoding .L | |
| 232 = in_l_coord | |
| 233 | in_name_l_coord; | |
| 234 }; | |
| 235 | |
| 236 | |
| 237 /*-------------------------------------------------------------------------- | |
| 238 * skeyname | |
| 239 * spot name table implementation built upon prefix-tree skey index | |
| 240 * | |
| 241 * v1 - maintains a 1->1 key=>spot_id relationship | |
| 242 * with unique constraint on key. it does NOT | |
| 243 * implement name_fmt or x_coord or y_coord. | |
| 244 * | |
| 245 * v2 - maintains a 1->1 key=>spot_id-range relationship | |
| 246 * with unique constraint on key. it does NOT | |
| 247 * implement spot_name. X and Y are stored using | |
| 248 * 16-bit unsigned quantities. | |
| 249 * | |
| 250 * v3 - maintains a flexible naming approach | |
| 251 * retrieves name directly from column if so stored | |
| 252 * synthesizes name from name_fmt, X and Y otherwise | |
| 253 * name_fmt is either retrieved directly from column | |
| 254 * or from skey index. X and Y are stored as 32-bit | |
| 255 * signed quantities. | |
| 256 * | |
| 257 * history: | |
| 258 * 1.0.1 - explicitly account for spotname #1.0.1 ancestry | |
| 259 * 2.0.1 - " " | |
| 260 * 3.0.1 - moved .X and .Y to spotcoord table | |
| 261 */ | |
| 262 table NCBI:SRA:tbl:skeyname #1.0.1 = INSDC:SRA:tbl:spotname #1.0.1 | |
| 263 { | |
| 264 // read the skey entry through the 'skey' text index | |
| 265 ascii out_skey = ( ascii ) idx:text:project #1.0 < 'skey' > (); | |
| 266 | |
| 267 // spot_name: rewritten_spot_name, or else the skey entry read above | |
| 268 ascii out_spot_name | |
| 269 = rewritten_spot_name | |
| 270 | out_skey; | |
| 271 | |
| 272 // search skey entry: the first alternative passes out_slx_prefix with name_fmt_version 1, the second passes no prefix with name_fmt_version 0 | |
| 273 INSDC:SRA:spot_ids_found spot_ids_found | |
| 274 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 1 > ( out_slx_prefix ) | |
| 275 | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 0 > (); | |
| 276 | |
| 277 | |
| 278 /* INSDC:SRA:tbl:spotname inherited productions | |
| 279 * out_x_coord | |
| 280 * out_y_coord | |
| 281 * out_name_fmt | |
| 282 */ | |
| 283 | |
| 284 /* NCBI:SRA:tbl:skeyname virtual productions ( referenced above but not defined in this table ) | |
| 285 * out_slx_prefix | |
| 286 * rewritten_spot_name | |
| 287 */ | |
| 288 }; | |
| 289 | |
| 290 table NCBI:SRA:tbl:skeyname_nocol #2.0.1 = INSDC:SRA:tbl:spotname #1.0.1 | |
| 291 { | |
| 292 // name_fmt | |
| 293 // perform reverse lookup through index to get key | |
| 294 ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > (); | |
| 295 | |
| 296 // search skey entry: both alternatives use name_fmt_version 2; the first passes out_slx_prefix, the second passes no prefix | |
| 297 INSDC:SRA:spot_ids_found spot_ids_found | |
| 298 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ( out_slx_prefix ) | |
| 299 | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > (); | |
| 300 | |
| 301 // X and Y stored as U16 and cast to INSDC:coord:val. NOTE(review): .X and .Y are not declared in this table, and NCBI:SRA:tbl:skeyname #2.0.1 declares them as INSDC:coord:val - confirm where the U16 storage applies | |
| 302 INSDC:coord:val out_x_coord = cast ( .X ); | |
| 303 INSDC:coord:val out_y_coord = cast ( .Y ); | |
| 304 | |
| 305 | |
| 306 /* NCBI:SRA:tbl:skeyname_nocol virtual productions | |
| 307 * out_slx_prefix | |
| 308 */ | |
| 309 }; | |
| 310 | |
| 311 table NCBI:SRA:tbl:skeyname #2.0.1 = NCBI:SRA:tbl:skeyname_nocol #2.0.1 | |
| 312 { | |
| 313 // in_spot_name_tok comes from a platform-specific tokenizer | |
| 314 // and must be of type 'NCBI:SRA:spot_name_token'. NOTE(review): .X names izip_encoding #1 while .Y gives no version - confirm the difference is intended | |
| 315 physical column < INSDC:coord:val > izip_encoding #1 .X | |
| 316 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok ); | |
| 317 physical column < INSDC:coord:val > izip_encoding .Y | |
| 318 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok ); | |
| 319 | |
| 320 /* NCBI:SRA:tbl:skeyname_nocol inherited virtual productions | |
| 321 * out_slx_prefix | |
| 322 */ | |
| 323 | |
| 324 /* NCBI:SRA:tbl:skeyname virtual productions | |
| 325 * in_spot_name_tok | |
| 326 */ | |
| 327 }; | |
| 328 | |
| 329 table NCBI:SRA:tbl:skeyname #3.0.1 = INSDC:SRA:tbl:spotname #1.0.1, NCBI:SRA:tbl:spotcoord #1 | |
| 330 { | |
| 331 // spot_name | |
| 332 // retrieve from hard column | |
| 333 ascii out_spot_name = .SPOT_NAME; | |
| 334 | |
| 335 // name_fmt | |
| 336 // retrieve from hard column or reverse lookup through index | |
| 337 ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ( .NAME_FMT ); | |
| 338 | |
| 339 INSDC:SRA:spot_ids_found spot_ids_found | |
| 340 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > (); | |
| 341 | |
| 342 | |
| 343 /* encoding rules | |
| 344 * the sub-table will provide a platform-specific parser that | |
| 345 * produces as its output a series of NCBI:SRA:spot_name_token | |
| 346 * for each input row in the virtual production "in_spot_name_tok" | |
| 347 * | |
| 348 * the tokenizer will look for X, Y or Q (combined) coordinates | |
| 349 * within the spot name and issue tokens when found, or in the | |
| 350 * case that none are found, an "unrecognized" token is issued. | |
| 351 * | |
| 352 * the tokens are then processed here by common rules | |
| 353 */ | |
| 354 | |
| 355 // .SPOT_NAME gets either empty strings or unrecognized strings | |
| 356 physical column < ascii > zip_encoding .SPOT_NAME | |
| 357 = NCBI:SRA:extract_spot_name ( NAME, in_spot_name_tok ); | |
| 358 | |
| 359 // .NAME_FMT gets either empty strings or unindexed but recognized strings | |
| 360 physical column < ascii > zip_encoding .NAME_FMT | |
| 361 = NCBI:SRA:extract_name_fmt < 'skey' > ( NAME, in_spot_name_tok ); | |
| 362 | |
| 363 // in_name_*_coord are the alternatives listed for the .X, .Y, .T and .L columns declared in NCBI:SRA:tbl:spotcoord #1; each is either an empty coordinate or a proper coordinate | |
| 364 INSDC:coord:val in_name_x_coord | |
| 365 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok ); | |
| 366 INSDC:coord:val in_name_y_coord | |
| 367 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok ); | |
| 368 INSDC:coord:val in_name_t_coord | |
| 369 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( NAME, in_spot_name_tok ); | |
| 370 INSDC:coord:val in_name_l_coord | |
| 371 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( NAME, in_spot_name_tok ); | |
| 372 | |
| 373 | |
| 374 /* NCBI:SRA:tbl:skeyname virtual productions | |
| 375 * in_spot_name_tok | |
| 376 */ | |
| 377 }; | |
