comparison libs/sratoolkit.2.8.0-centos_linux64/schema/ncbi/spotname.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * NCBI Sequence Read Archive schema
29 */
30 version 1;
31
32 include 'vdb/vdb.vschema';
33 include 'insdc/sra.vschema';
34
35
36 /*--------------------------------------------------------------------------
37 * types
38 */
39
40 /* spot_name_token
41 * alias of text:token; a vector describing one token recognized within a spot name
42 *
43 * COMPONENTS:
44 * 0 - token id ( presumably one of the NCBI:SRA:name_token:* constants - confirm against the tokenizer )
45 * 1 - token starting coordinate ( starting character position within the name )
46 * 2 - token length
47 */
48 alias text:token NCBI:SRA:spot_name_token;
49
50
51 /* token values
52 *
53 * tokens are produced by a schema-specific tokenizer function.
54 * this function is purposely abstract because it may rely upon
55 * whatever information it needs to perform its task. the only
56 * requirement is that it produce these tokens as its output.
57 *
58 * an empty name input must produce no tokens. in this case,
59 * there is no name to tokenize or data to produce.
60 *
61 * a non-empty name must produce 1 or more tokens of output.
62 * all tokens must be ordered by starting character position.
63 *
64 * if a name does not conform to any pattern recognized by the
65 * tokenizer, then the tokenizer emits a single token of "unrecognized".
66 *
67 * if a name conforms to some pattern but does not have any
68 * substitution tokens, the tokenizer emits a single token of "recognized".
69 *
70 * if a name may be tokenized, then the resulting tokens should
71 * describe only the portions of the string that should be removed
72 * from the name, e.g. "X" or "Y".
73 *
74 * the standard coordinates "X", "Y", "T" and "L" are given in unsigned decimal.
75 * alternate representations are contained within their respective
76 * namespaces: "signed", "octal", "hex:upper" and "hex:lower".
77 *
78 * the special coordinate "Q" represents the 454-specific encoding
79 * of X and Y into base-36, where the formula for Q is:
80 * Q = 4096 * X + Y
81 * and the ASCII encoding of each base-36 digit value is:
82 * 0..25 => "A-Z", 26..35 => "0-9"
83 */
84 const U16 NCBI:SRA:name_token:unrecognized = 1;
85 const U16 NCBI:SRA:name_token:recognized = 2;
86 const U16 NCBI:SRA:name_token:Q = 3;
87 const U16 NCBI:SRA:name_token:X = 4;
88 const U16 NCBI:SRA:name_token:Y = 5;
89 const U16 NCBI:SRA:name_token:T = 6;
90 const U16 NCBI:SRA:name_token:L = 7;
91 const U16 NCBI:SRA:name_token:signed:X = 8;
92 const U16 NCBI:SRA:name_token:signed:Y = 9;
93 const U16 NCBI:SRA:name_token:signed:T = 10;
94 const U16 NCBI:SRA:name_token:signed:L = 11;
95 const U16 NCBI:SRA:name_token:octal:X = 12;
96 const U16 NCBI:SRA:name_token:octal:Y = 13;
97 const U16 NCBI:SRA:name_token:octal:T = 14;
98 const U16 NCBI:SRA:name_token:octal:L = 15;
99 const U16 NCBI:SRA:name_token:hex:upper:X = 16;
100 const U16 NCBI:SRA:name_token:hex:upper:Y = 17;
101 const U16 NCBI:SRA:name_token:hex:upper:T = 18;
102 const U16 NCBI:SRA:name_token:hex:upper:L = 19;
103 const U16 NCBI:SRA:name_token:hex:lower:X = 20;
104 const U16 NCBI:SRA:name_token:hex:lower:Y = 21;
105 const U16 NCBI:SRA:name_token:hex:lower:T = 22;
106 const U16 NCBI:SRA:name_token:hex:lower:L = 23;
107
108
109 /* token symbols
110 * when a name matches some pattern and tokens are recognized,
111 * the tokens are extracted from the name and sent to individual
112 * columns, and replaced with the symbols below to create a
113 * formatted name. each symbol is a '$' followed by a single letter.
114 * NOTE(review): no symbols are declared here for the "signed" token variants - confirm this is intentional */
115 const ascii NCBI:SRA:name_symbol:Q = '$Q';
116 const ascii NCBI:SRA:name_symbol:X = '$X';
117 const ascii NCBI:SRA:name_symbol:Y = '$Y';
118 const ascii NCBI:SRA:name_symbol:T = '$T';
119 const ascii NCBI:SRA:name_symbol:L = '$L';
120 const ascii NCBI:SRA:name_symbol:octal:X = '$a';
121 const ascii NCBI:SRA:name_symbol:octal:Y = '$b';
122 const ascii NCBI:SRA:name_symbol:octal:T = '$c';
123 const ascii NCBI:SRA:name_symbol:octal:L = '$d';
124 const ascii NCBI:SRA:name_symbol:hex:upper:X = '$e';
125 const ascii NCBI:SRA:name_symbol:hex:upper:Y = '$f';
126 const ascii NCBI:SRA:name_symbol:hex:upper:T = '$g';
127 const ascii NCBI:SRA:name_symbol:hex:upper:L = '$h';
128 const ascii NCBI:SRA:name_symbol:hex:lower:X = '$x';
129 const ascii NCBI:SRA:name_symbol:hex:lower:Y = '$y';
130 const ascii NCBI:SRA:name_symbol:hex:lower:T = '$t';
131 const ascii NCBI:SRA:name_symbol:hex:lower:L = '$l';
132
133
134 /*--------------------------------------------------------------------------
135 * functions
136 */
137
138 /* extract_spot_name
139 * generates input to the .SPOT_NAME physical column ( used by NCBI:SRA:tbl:skeyname #3 )
140 *
141 * on NCBI:SRA:name_token:unrecognized, produces the entire spot name row
142 * otherwise, produces an empty row
143 *
144 * "name" [ DATA ] - raw spot names from NAME column
145 *
146 * "tok" [ DATA ] - delimiting tokens produced by sub-table ( passed as "in_spot_name_tok" at the call site )
147 */
148 function ascii
149 NCBI:SRA:extract_spot_name #1 ( ascii name, NCBI:SRA:spot_name_token tok );
150
151
152 /* extract_name_fmt
153 * generates input to .NAME_FMT column and/or updates skey index
154 *
155 * on NCBI:SRA:name_token:unrecognized, produces an empty row
156 * otherwise, it creates a temporary "name_fmt" string from name row
157 *
158 * an attempt is made to insert name_fmt into the text index named by "idx"
159 * ( normally 'skey' ). if the insert succeeds, i.e. associates "name_fmt"
160 * with a row_id, then the output for the row is empty.
161 *
162 * if the insert fails due to key duplication, an attempt is made to
163 * extend the id range of associated rows. depending upon the type of index,
164 * this may succeed or fail, e.g. if the existing row range for "name_fmt" is
165 * n..m where m = row_id - 1, the range can be extended to n..row_id and
166 * the update succeeds. if the index supports discontiguous id ranges, the
167 * update will also succeed. upon any success updating the index, the output
168 * row will be empty.
169 *
170 * finally, if the temporary "name_fmt" cannot be inserted into the index nor the existing id range updated, the output for the row will be "name_fmt".
171 *
172 * "idx" [ CONST ] - name of the text index to insert into ( 'skey' at the call site in this file )
173 *
174 * "name" [ DATA ] - raw spot names from NAME column
175 * "tok" [ DATA ] - delimiting tokens produced by sub-table
176 */
177 function ascii
178 NCBI:SRA:extract_name_fmt #1 < ascii idx > ( ascii name, NCBI:SRA:spot_name_token tok );
179
180
181 /* extract_name_coord
182 * generates inputs to .X and .Y and possibly other columns ( this file also uses it for .T and .L )
183 *
184 * if no tokens match the "coord" constant, produces an empty row
185 * otherwise, produces binary coordinate value
186 * if multiple tokens match criteria, all values must be equivalent
187 * because only a single value will be output per row
188 *
189 * "coord" [ CONST ] - the token id to extract. NCBI:SRA:name_token:X, :Y, :T and :L are all passed by tables in this file.
190 * NCBI:SRA:name_token:X and NCBI:SRA:name_token:Y also match the token NCBI:SRA:name_token:Q and extract
191 * contents appropriately.
192 *
193 * "name" [ DATA ] - raw spot names from NAME column
194 *
195 * "tok" [ DATA ] - delimiting tokens produced by sub-table
196 */
197 function INSDC:coord:val
198 NCBI:SRA:extract_name_coord #1 < U16 coord > ( ascii name, NCBI:SRA:spot_name_token tok );
199
200
201 /* lookup - produces INSDC:SRA:spot_ids_found; the tables in this file use it to search the 'skey' index, passing 'skey' as "index_name", 'QUERY_BY_NAME' as "query_by_name" and 0, 1 or 2 as "name_fmt_version"
202 * the "name_prefix" input is optional: call sites in this file invoke it both with and without that argument */
203 function INSDC:SRA:spot_ids_found NCBI:SRA:lookup #1.0
204 < ascii index_name, ascii query_by_name, U8 name_fmt_version > ( * ascii name_prefix );
205
206
207 /*--------------------------------------------------------------------------
208 * spotcoord
209 * spot coordinate table implementation: reads out_{x,y,t,l}_coord from, and writes to, the physical columns .X, .Y, .T and .L
210 */
211 table NCBI:SRA:tbl:spotcoord #1 = INSDC:SRA:tbl:spotcoord #1
212 {
213 // read side: X and Y stored as I32, read back directly from the physical columns
214 INSDC:coord:val out_x_coord = .X;
215 INSDC:coord:val out_y_coord = .Y;
216
217 // T and L are usually present but optional
218 INSDC:coord:val out_t_coord = .T;
219 INSDC:coord:val out_l_coord = .L;
220
221 // write side: .X, .Y, .T and .L get either empty coordinate or proper coordinate, taken from in_*_coord or, as the alternative, from the name-derived in_name_*_coord ( defined by NCBI:SRA:tbl:skeyname #3 in this file )
222 physical column < INSDC:coord:val > izip_encoding .X
223 = in_x_coord
224 | in_name_x_coord;
225 physical column < INSDC:coord:val > izip_encoding .Y
226 = in_y_coord
227 | in_name_y_coord;
228 physical column < INSDC:coord:val > izip_encoding .T
229 = in_t_coord
230 | in_name_t_coord;
231 physical column < INSDC:coord:val > izip_encoding .L
232 = in_l_coord
233 | in_name_l_coord;
234 };
235
236
237 /*--------------------------------------------------------------------------
238 * skeyname
239 * spot name table implementation built upon prefix-tree skey index
240 *
241 * v1 - maintains a 1->1 key=>spot_id relationship
242 * with unique constraint on key. it does NOT
243 * implement name_fmt or x_coord or y_coord.
244 *
245 * v2 - maintains a 1->1 key=>spot_id-range relationship
246 * with unique constraint on key. it does NOT
247 * implement spot_name. X and Y are stored using
248 * 16-bit unsigned quantities.
249 *
250 * v3 - maintains a flexible naming approach:
251 * retrieves name directly from column if so stored,
252 * synthesizes name from name_fmt, X and Y otherwise.
253 * name_fmt is either retrieved directly from column
254 * or from skey index. X and Y are stored as 32-bit
255 * signed quantities.
256 *
257 * history:
258 * 1.0.1 - explicitly account for spotname #1.0.1 ancestry
259 * 2.0.1 - " "
260 * 3.0.1 - moved .X and .Y to spotcoord table
261 */
262 table NCBI:SRA:tbl:skeyname #1.0.1 = INSDC:SRA:tbl:spotname #1.0.1
263 {
264 // read the skey entry ( projection of the 'skey' text index; takes no input here )
265 ascii out_skey = ( ascii ) idx:text:project #1.0 < 'skey' > ();
266
267 // spot_name: rewritten_spot_name, with the raw skey entry as the alternative
268 ascii out_spot_name
269 = rewritten_spot_name
270 | out_skey;
271
272 // search skey entry, with out_slx_prefix or, as the alternative, with no prefix. NOTE(review): the first alternative passes name_fmt_version 1 and the second passes 0 - confirm this difference is intended
273 INSDC:SRA:spot_ids_found spot_ids_found
274 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 1 > ( out_slx_prefix )
275 | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 0 > ();
276
277
278 /* INSDC:SRA:tbl:spotname inherited productions ( listed but not defined in this table )
279 * out_x_coord
280 * out_y_coord
281 * out_name_fmt
282 */
283
284 /* NCBI:SRA:tbl:skeyname productions ( referenced above but not defined in this table )
285 * out_slx_prefix
286 * rewritten_spot_name
287 */
288 };
289
290 table NCBI:SRA:tbl:skeyname_nocol #2.0.1 = INSDC:SRA:tbl:spotname #1.0.1
291 {
292 // name_fmt ( this table defines no physical columns; .X and .Y are only read )
293 // perform reverse lookup through index to get key
294 ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ();
295
296 // search skey entry; both alternatives pass name_fmt_version 2 and differ only in whether out_slx_prefix is supplied
297 INSDC:SRA:spot_ids_found spot_ids_found
298 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ( out_slx_prefix )
299 | ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ();
300
301 // X and Y stored as U16; cast to INSDC:coord:val on read
302 INSDC:coord:val out_x_coord = cast ( .X );
303 INSDC:coord:val out_y_coord = cast ( .Y );
304
305
306 /* NCBI:SRA:tbl:skeyname_nocol virtual productions
307 * out_slx_prefix
308 */
309 };
310
311 table NCBI:SRA:tbl:skeyname #2.0.1 = NCBI:SRA:tbl:skeyname_nocol #2.0.1
312 {
313 // in_spot_name_tok comes from a platform-specific tokenizer
314 // and must be of type 'NCBI:SRA:spot_name_token'. NOTE(review): .X pins 'izip_encoding #1' while .Y uses unversioned 'izip_encoding' - confirm whether the difference is intended
315 physical column < INSDC:coord:val > izip_encoding #1 .X
316 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok );
317 physical column < INSDC:coord:val > izip_encoding .Y
318 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok );
319
320 /* NCBI:SRA:tbl:skeyname_nocol inherited virtual productions
321 * out_slx_prefix
322 */
323
324 /* NCBI:SRA:tbl:skeyname virtual productions ( to be supplied by the sub-table )
325 * in_spot_name_tok
326 */
327 };
328
329 table NCBI:SRA:tbl:skeyname #3.0.1 = INSDC:SRA:tbl:spotname #1.0.1, NCBI:SRA:tbl:spotcoord #1
330 {
331 // spot_name
332 // retrieve from hard column ( .SPOT_NAME, written below )
333 ascii out_spot_name = .SPOT_NAME;
334
335 // name_fmt
336 // retrieve from hard column or reverse lookup through index ( .NAME_FMT is passed as input to the 'skey' projection )
337 ascii out_name_fmt = ( ascii ) idx:text:project #1.0 < 'skey' > ( .NAME_FMT );
338
339 INSDC:SRA:spot_ids_found spot_ids_found
340 = ( INSDC:SRA:spot_ids_found ) NCBI:SRA:lookup #1 < 'skey' , 'QUERY_BY_NAME', 2 > ();
341
342
343 /* encoding rules
344 * the sub-table will provide a platform-specific parser that
345 * produces as its output a series of NCBI:SRA:spot_name_token
346 * for each input row in the virtual production "in_spot_name_tok"
347 *
348 * the tokenizer will look for X, Y or Q (combined) coordinates
349 * within the spot name and issue tokens when found, or in the
350 * case that none are found, an "unrecognized" token is issued.
351 * ( T and L tokens are also extracted by the productions below. )
352 * the tokens are then processed here by common rules
353 */
354
355 // .SPOT_NAME gets either empty strings or unrecognized strings
356 physical column < ascii > zip_encoding .SPOT_NAME
357 = NCBI:SRA:extract_spot_name ( NAME, in_spot_name_tok );
358
359 // .NAME_FMT gets either empty strings or unindexed but recognized strings
360 physical column < ascii > zip_encoding .NAME_FMT
361 = NCBI:SRA:extract_name_fmt < 'skey' > ( NAME, in_spot_name_tok );
362
363 // in_name_{x,y,t,l}_coord are the name-derived alternatives for .X, .Y, .T and .L in NCBI:SRA:tbl:spotcoord; each gets either empty coordinate or proper coordinate
364 INSDC:coord:val in_name_x_coord
365 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:X > ( NAME, in_spot_name_tok );
366 INSDC:coord:val in_name_y_coord
367 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:Y > ( NAME, in_spot_name_tok );
368 INSDC:coord:val in_name_t_coord
369 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:T > ( NAME, in_spot_name_tok );
370 INSDC:coord:val in_name_l_coord
371 = NCBI:SRA:extract_name_coord < NCBI:SRA:name_token:L > ( NAME, in_spot_name_tok );
372
373
374 /* NCBI:SRA:tbl:skeyname virtual productions ( to be supplied by the sub-table )
375 * in_spot_name_tok
376 */
377 };