Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/spotname.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * NCBI Sequence Read Archive schema | |
29 */ | |
30 version 1; | |
31 | |
32 include 'vdb/vdb.vschema'; | |
33 include 'insdc/sra.vschema'; | |
34 | |
35 | |
36 /*-------------------------------------------------------------------------- | |
37 * types | |
38 */ | |
39 | |
40 /* spot_name_token | |
41 * a vector of three components describing a token recognized within a spot name | |
42 * | |
43 * COMPONENTS: | |
44 * 0 - token id | |
45 * 1 - token starting coordinate | |
46 * 2 - token length | |
47 */ | |
48 alias text:token NCBI:SRA:spot_name_token; | |
49 | |
50 | |
51 /* token values | |
52 * | |
53 * tokens are produced by a schema-specific tokenizer function | |
54 * this function is purposely abstract because it may rely upon | |
55 * whatever information it needs to perform its task. the only | |
56 * requirement is that it produce these tokens as its output. | |
57 * | |
58 * an empty name input must produce no tokens. in this case, | |
59 * there is no name to tokenize or data to produce. | |
60 * | |
61 * a non-empty name must produce 1 or more tokens of output. | |
62 * all tokens must be ordered by starting character position. | |
63 * | |
64 * if a name does not conform to any pattern recognized by the | |
65 * tokenizer, then the tokenizer emits a single token of "unrecognized" | |
66 * | |
67 * if a name conforms to some pattern but does not have any | |
68 * substitution tokens, the tokenizer emits a single token of "recognized" | |
69 * | |
70 * if a name may be tokenized, then the resulting tokens should | |
71 * describe only the portions of the string that should be removed | |
72 * from the name, e.g. "X" or "Y". | |
73 * | |
74 * the standard coordinates "X", "Y", "T" and "L" are given in unsigned decimal. | |
75 * alternate representations are contained within their respective | |
76 * namespaces: "signed", "octal", "hex:upper" and "hex:lower". | |
77 * | |
78 * the special coordinate "Q" represents the 454-specific encoding | |
79 * of X and Y into base-36, where the formula for Q is: | |
80 * Q = 4096 * X + Y | |
81 * and ASCII encoding: | |
82 * 0..25 => "A-Z", 26..35 => "0-9" | |
83 */ | |
84 const U16 NCBI:SRA:name_token:unrecognized = 1; | |
85 const U16 NCBI:SRA:name_token:recognized = 2; | |
86 const U16 NCBI:SRA:name_token:Q = 3; | |
87 const U16 NCBI:SRA:name_token:X = 4; | |
88 const U16 NCBI:SRA:name_token:Y = 5; | |
89 const U16 NCBI:SRA:name_token:T = 6; | |
90 const U16 NCBI:SRA:name_token:L = 7; | |
91 const U16 NCBI:SRA:name_token:signed:X = 8; | |
92 const U16 NCBI:SRA:name_token:signed:Y = 9; | |
93 const U16 NCBI:SRA:name_token:signed:T = 10; | |
94 const U16 NCBI:SRA:name_token:signed:L = 11; | |
95 const U16 NCBI:SRA:name_token:octal:X = 12; | |
96 const U16 NCBI:SRA:name_token:octal:Y = 13; | |
97 const U16 NCBI:SRA:name_token:octal:T = 14; | |
98 const U16 NCBI:SRA:name_token:octal:L = 15; | |
99 const U16 NCBI:SRA:name_token:hex:upper:X = 16; | |
100 const U16 NCBI:SRA:name_token:hex:upper:Y = 17; | |
101 const U16 NCBI:SRA:name_token:hex:upper:T = 18; | |
102 const U16 NCBI:SRA:name_token:hex:upper:L = 19; | |
103 const U16 NCBI:SRA:name_token:hex:lower:X = 20; | |
104 const U16 NCBI:SRA:name_token:hex:lower:Y = 21; | |
105 const U16 NCBI:SRA:name_token:hex:lower:T = 22; | |
106 const U16 NCBI:SRA:name_token:hex:lower:L = 23; | |
107 | |
108 | |
109 /* token symbols | |
110 * when a name matches some pattern and tokens are recognized, | |
111 * the tokens are extracted from the name and sent to individual | |
112 * columns, and replaced with the symbols below to create a | |
113 * formatted name. NOTE(review): no symbols are defined here for the "signed" tokens. | |
114 */ | |
115 const ascii NCBI:SRA:name_symbol:Q = '$Q'; | |
116 const ascii NCBI:SRA:name_symbol:X = '$X'; | |
117 const ascii NCBI:SRA:name_symbol:Y = '$Y'; | |
118 const ascii NCBI:SRA:name_symbol:T = '$T'; | |
119 const ascii NCBI:SRA:name_symbol:L = '$L'; | |
120 const ascii NCBI:SRA:name_symbol:octal:X = '$a'; | |
121 const ascii NCBI:SRA:name_symbol:octal:Y = '$b'; | |
122 const ascii NCBI:SRA:name_symbol:octal:T = '$c'; | |
123 const ascii NCBI:SRA:name_symbol:octal:L = '$d'; | |
124 const ascii NCBI:SRA:name_symbol:hex:upper:X = '$e'; | |
125 const ascii NCBI:SRA:name_symbol:hex:upper:Y = '$f'; | |
126 const ascii NCBI:SRA:name_symbol:hex:upper:T = '$g'; | |
127 const ascii NCBI:SRA:name_symbol:hex:upper:L = '$h'; | |
128 const ascii NCBI:SRA:name_symbol:hex:lower:X = '$x'; | |
129 const ascii NCBI:SRA:name_symbol:hex:lower:Y = '$y'; | |
130 const ascii NCBI:SRA:name_symbol:hex:lower:T = '$t'; | |
131 const ascii NCBI:SRA:name_symbol:hex:lower:L = '$l'; | |
132 | |
133 | |
134 /*-------------------------------------------------------------------------- | |
135 * functions | |
136 */ | |
137 | |
138 /* extract_spot_name | |
139 * generates input to .SPOT_NAME column | |
140 * | |
141 * on NCBI:SRA:name_token:unrecognized, produces the entire spot name row | |
142 * otherwise, produces an empty row | |
143 * | |
144 * "name" [ DATA ] - raw spot names from NAME column | |
145 * | |
146 * "tok" [ DATA ] - delimiting tokens ( NCBI:SRA:spot_name_token ) produced by sub-table | |
147 */ | |
148 function ascii | |
149 NCBI:SRA:extract_spot_name #1 ( ascii name, NCBI:SRA:spot_name_token tok ); | |
150 | |
151 | |
152 /* extract_name_fmt | |
153 * generates input to .NAME_FMT column and/or updates skey index | |
154 * | |
155 * on NCBI:SRA:name_token:unrecognized, produces an empty row | |
156 * otherwise, it creates a temporary "name_fmt" string from name row | |
157 * | |
158 * an attempt is made to insert name_fmt into the text index named by "idx" | |
159 * ( normally 'skey' ). if the insert succeeds, i.e. associates "name_fmt" | |
160 * with a row_id, then the output for the row is empty. | |
161 * | |
162 * if the insert fails due to key duplication, an attempt is made to | |
163 * extend the id range of associated rows. depending upon the type of index, | |
164 * this may succeed or fail, e.g. if the existing row range for "name_fmt" is | |
165 * n..m where m = row_id - 1, the range can be extended to n..row_id and | |
166 * the update succeeds. if the index supports discontiguous id ranges, the | |
167 * update will also succeed. upon any success updating the index, the output | |
168 * row will be empty. | |
169 * | |
170 * finally, if the temporary "name_fmt" cannot be inserted into the index | |
171 * nor the existing id range updated, the output for the row will be "name_fmt". | |
172 * | |
173 * "name" [ DATA ] - raw spot names from NAME column | |
174 * | |
175 * "tok" [ DATA ] - delimiting tokens produced by sub-table | |
176 */ | |
177 function ascii | |
178 NCBI:SRA:extract_name_fmt #1 < ascii idx > ( ascii name, NCBI:SRA:spot_name_token tok ); | |
179 | |
180 | |
181 /* extract_name_coord | |
182 * generates inputs to .X and .Y and possibly other columns | |
183 * | |
184 * if no tokens match "coord" constant, produces an empty row | |
185 * otherwise, produces binary coordinate value | |
186 * if multiple tokens match criteria, all values must be equivalent | |
187 * because only a single value will be output per row | |
188 * | |
189 * "coord" [ CONST ] - either NCBI:SRA:name_token:X or NCBI:SRA:name_token:Y | |
190 * both of these values also match the token NCBI:SRA:name_token:Q and extract | |
191 * contents appropriately. NOTE(review): skeyname #3.0.1 also passes name_token:T and name_token:L. | |
192 * | |
193 * "name" [ DATA ] - raw spot names from NAME column | |
194 * | |
195 * "tok" [ DATA ] - delimiting tokens produced by sub-table | |
196 */ | |
197 function INSDC:coord:val | |
198 NCBI:SRA:extract_name_coord #1 < U16 coord > ( ascii name, NCBI:SRA:spot_name_token tok ); | |
199 | |
200 | |
201 /* lookup | |
202 * produces "spot_ids_found" in the skeyname tables, which invoke it both with and without "name_prefix" */ | |
203 function INSDC:SRA:spot_ids_found NCBI:SRA:lookup #1.0 | |
204 < ascii index_name, ascii query_by_name, U8 name_fmt_version > ( * ascii name_prefix ); | |
205 | |
206 | |
207 /*-------------------------------------------------------------------------- | |
208 * spotcoord | |
209 * spot coordinate table implementation | |
210 */ | |
211 table NCBI:SRA:tbl:spotcoord #1 = INSDC:SRA:tbl:spotcoord #1 | |
212 { | |
213 // X and Y stored as I32 | |
214 INSDC:coord:val out_x_coord = .X; | |
215 INSDC:coord:val out_y_coord = .Y; | |
216 | |
217 // T and L are usually present but optional | |
218 INSDC:coord:val out_t_coord = .T; | |
219 INSDC:coord:val out_l_coord = .L; | |
220 | |
221 // .X, .Y, .T and .L take in_*_coord or, as the alternative input, in_name_*_coord; either an empty or a proper coordinate | |
222 physical column < INSDC:coord:val > izip_encoding .X | |
223 = in_x_coord | |
224 | in_name_x_coord; | |
225 physical column < INSDC:coord:val > izip_encoding .Y | |
226 = in_y_coord | |
227 | in_name_y_coord; | |
228 physical column < INSDC:coord:val > izip_encoding .T | |
229 = in_t_coord | |
230 | in_name_t_coord; | |
231 physical column < INSDC:coord:val > izip_encoding .L | |
232 = in_l_coord | |
233 | in_name_l_coord; | |
234 }; | |
235 | |
236 | |
237 /*-------------------------------------------------------------------------- | |
238 * skeyname | |
239 * spot name table implementation built upon prefix-tree skey index | |
240 * | |
241 * v1 - maintains a 1->1 key=>spot_id relationship | |
242 * with unique constraint on key. it does NOT | |
243 * implement name_fmt or x_coord or y_coord. | |
244 * | |
245 * v2 - maintains a 1->1 key=>spot_id-range relationship | |
246 * with unique constraint on key. it does NOT | |
247 * implement spot_name. X and Y are stored using | |
248 * 16-bit unsigned quantities. | |
249 * | |
250 * v3 - maintains a flexible naming approach | |
251 * retrieves name directly from column if so stored | |
252 * synthesizes name from name_fmt, X and Y otherwise | |
253 * name_fmt is either retrieved directly from column | |
254 * or from skey index. X and Y are stored as 32-bit | |
255 * signed quantities. | |
256 * | |
257 * history: | |
258 * 1.0.1 - explicitly account for spotname #1.0.1 ancestry | |
259 * 2.0.1 - explicitly account for spotname #1.0.1 ancestry | |
260 * 3.0.1 - moved .X and .Y to spotcoord table | |
261 */ | |
262 table NCBI:SRA:tbl:skeyname #1.0.1 = INSDC:SRA:tbl:spotname #1.0.1 | |
263 { | |
264 // read the skey entry | |
265 ascii out_skey = ( ascii ) idx:text:project #1.0 < 'skey' > (); | |
266 | |
267 // spot_name | |
268 ascii out_spot_name | |
269 = rewritten_spot_name | |
270 | out_skey; | |
271 | |
272 // search skey entry: name_fmt_version 1 with out_slx_prefix, or version 0 with no prefix | |
273 INSDC:SRA:spot_ids_found spot_ids_found | |
274 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 1 > ( out_slx_prefix ) | |
275 | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 0 > (); | |
276 | |
277 | |
278 /* INSDC:SRA:tbl:spotname inherited productions | |
279 * out_x_coord | |
280 * out_y_coord | |
281 * out_name_fmt | |
282 */ | |
283 | |
284 /* NCBI:SRA:tbl:skeyname productions | |
285 * out_slx_prefix | |
286 * rewritten_spot_name | |
287 */ | |
288 }; | |
289 | |
290 table NCBI:SRA:tbl:skeyname_nocol #2.0.1 = INSDC:SRA:tbl:spotname #1.0.1 | |
291 { | |
292 // name_fmt | |
293 // perform reverse lookup through index to get key | |
294 ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > (); | |
295 | |
296 // search skey entry ( name_fmt_version 2 ), with out_slx_prefix or with no prefix | |
297 INSDC:SRA:spot_ids_found spot_ids_found | |
298 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ( out_slx_prefix ) | |
299 | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > (); | |
300 | |
301 // X and Y stored as U16 | |
302 INSDC:coord:val out_x_coord = cast ( .X ); | |
303 INSDC:coord:val out_y_coord = cast ( .Y ); | |
304 | |
305 | |
306 /* NCBI:SRA:tbl:skeyname_nocol virtual productions | |
307 * out_slx_prefix | |
308 */ | |
309 }; | |
310 | |
311 table NCBI:SRA:tbl:skeyname #2.0.1 = NCBI:SRA:tbl:skeyname_nocol #2.0.1 | |
312 { | |
313 // in_spot_name_tok comes from a platform-specific tokenizer | |
314 // and must be of type 'NCBI:SRA:spot_name_token' | |
315 physical column < INSDC:coord:val > izip_encoding #1 .X | |
316 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok ); | |
317 physical column < INSDC:coord:val > izip_encoding .Y | |
318 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok ); | |
319 | |
320 /* NCBI:SRA:tbl:skeyname_nocol inherited virtual productions | |
321 * out_slx_prefix | |
322 */ | |
323 | |
324 /* NCBI:SRA:tbl:skeyname virtual productions | |
325 * in_spot_name_tok | |
326 */ | |
327 }; | |
328 | |
329 table NCBI:SRA:tbl:skeyname #3.0.1 = INSDC:SRA:tbl:spotname #1.0.1, NCBI:SRA:tbl:spotcoord #1 | |
330 { | |
331 // spot_name | |
332 // retrieve from hard column | |
333 ascii out_spot_name = .SPOT_NAME; | |
334 | |
335 // name_fmt | |
336 // retrieve from hard column or reverse lookup through index | |
337 ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ( .NAME_FMT ); | |
338 | |
339 INSDC:SRA:spot_ids_found spot_ids_found | |
340 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > (); | |
341 | |
342 | |
343 /* encoding rules | |
344 * the sub-table will provide a platform-specific parser that | |
345 * produces as its output a series of NCBI:SRA:spot_name_token | |
346 * for each input row in the virtual production "in_spot_name_tok" | |
347 * | |
348 * the tokenizer will look for X, Y or Q (combined) coordinates | |
349 * within the spot name and issue tokens when found, or in the | |
350 * case that none are found, an "unrecognized" token is issued. | |
351 * | |
352 * the tokens are then processed here by common rules | |
353 */ | |
354 | |
355 // .SPOT_NAME gets either empty strings or unrecognized strings | |
356 physical column < ascii > zip_encoding .SPOT_NAME | |
357 = NCBI:SRA:extract_spot_name ( NAME, in_spot_name_tok ); | |
358 | |
359 // .NAME_FMT gets either empty strings or unindexed but recognized strings | |
360 physical column < ascii > zip_encoding .NAME_FMT | |
361 = NCBI:SRA:extract_name_fmt < 'skey' > ( NAME, in_spot_name_tok ); | |
362 | |
363 // name-derived inputs for .X, .Y, .T and .L: either empty coordinate or proper coordinate | |
364 INSDC:coord:val in_name_x_coord | |
365 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok ); | |
366 INSDC:coord:val in_name_y_coord | |
367 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok ); | |
368 INSDC:coord:val in_name_t_coord | |
369 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( NAME, in_spot_name_tok ); | |
370 INSDC:coord:val in_name_l_coord | |
371 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( NAME, in_spot_name_tok ); | |
372 | |
373 | |
374 /* NCBI:SRA:tbl:skeyname virtual productions | |
375 * in_spot_name_tok | |
376 */ | |
377 };