comparison libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/reference.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * VDB cSRA2 reference table and view
29 */
30 version 1;
31
32 include 'vdb/vdb.vschema';
33 include 'csra2/stats.vschema';
34
35
36 /*--------------------------------------------------------------------------
37 * tables - NCBI:csra2:tbl:reference holds reference bases in cells of at most CHUNK_SIZE bases, plus per-chunk alignment ids, overlap and coverage columns
38 */
39 table NCBI:csra2:tbl:reference #1.0
40 = NCBI:csra2:tbl:read_stats #1
41 {
42 /* CHUNK_SIZE
43 * describes the maximum number of bases in any cell
44 */
45 extern column INSDC:coord:len CHUNK_SIZE;
46
47 /* CIRCULAR
48 * true if the reference is circular
49 */
50 extern column bool CIRCULAR;
51
52 /* CANONICAL_NAME
53 * this should be an accessioned proper name
54 */
55 extern column utf8 CANONICAL_NAME;
56
57 /* COMMON_NAME
58 * this name may be ambiguous or missing entirely
59 */
60 extern column utf8 COMMON_NAME;
61
62 /* LOCAL_SEQUENCE
63 * supports name overloading by type: INSDC:dna:text (default) and INSDC:4na:bin
64 */
65 extern default column INSDC:dna:text LOCAL_SEQUENCE
66 {
67 read = out_local_dna_text;
68 validate = < INSDC:dna:text > compare ( in_local_dna_text, out_local_dna_text );
69 }
70 extern column INSDC:4na:bin LOCAL_SEQUENCE = out_local_4na_bin;
71
72 /* PRIMARY_ALIGNMENT_IDS
73 * SECONDARY_ALIGNMENT_IDS
74 * an index to rows in the PRIMARY_ALIGNMENT and
75 * SECONDARY_ALIGNMENT tables having alignments
76 * STARTING within this chunk
77 *
78 * the indices MUST be sorted in clustered order,
79 * meaning that they are in ascending numeric order
80 */
81 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS;
82 extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS;
83
84 /* OVERLAP_REF_POS
85 * min ( REF_POS ) for all alignments intersecting this chunk
86 * but starting in a previous chunk, where the stored position
87 * is in reference coordinates.
88 *
89 * a value of 0 indicates that no alignments starting to
90 * the left of this chunk also intersect with it.
91 */
92 extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS;
93
94 /* OVERLAP_REF_LEN
95 * max ( REF_POS + REF_LEN - CHUNK_START ) % CHUNK_SIZE
96 * for all alignments intersecting this chunk but starting
97 * in a previous chunk.
98 *
99 * indicates the amount of this chunk that is needed by
100 * alignments not starting within this chunk. so if a slice on
101 * this reference were to start at 100 bases into this chunk,
102 * for example, and the OVERLAP_REF_LEN were 100 or less, then
103 * there are no alignments from prior chunks that need to be
104 * considered.
105 */
106 extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN;
107
108 /* COVERAGE
109 * graphing statistics for the chunk
110 */
111
112 // clipped at 255
113 extern column < U8 > izip_encoding CGRAPH_HIGH;
114 extern column < U8 > izip_encoding CGRAPH_LOW;
115
116 // count of the number of mismatches in the chunk
117 extern column < U32 > izip_encoding CGRAPH_MISMATCHES;
118
119 // count of the number of inserts and deletes in the chunk
120 extern column < U32 > izip_encoding CGRAPH_INDELS;
121
122
123 /* writing rules: text input is normalized ( '.' -> 'N', lower -> upper case ) and mapped to 4na; 2na and ambiguity productions are derived from 4na */
124 INSDC:dna:text in_local_dna_text
125 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( LOCAL_SEQUENCE )
126 ;
127 INSDC:4na:bin in_local_4na_bin
128 = < INSDC:4na:bin > range_validate < 0, 15 > ( LOCAL_SEQUENCE )
129 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_local_dna_text )
130 ;
131 INSDC:2na:bin in_local_2na_bin
132 = INSDC:SEQ:rand_4na_2na ( in_local_4na_bin )
133 ;
134 INSDC:4na:bin in_ambig_4na_bin
135 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_local_4na_bin )
136 ;
137
138 INSDC:4na:bin in_stats_seq = in_local_4na_bin;
139
140 /* physical columns for sequence: packed 2na bases plus a separate 4na ambiguity column */
141 physical column INSDC:2na:packed .LOCAL_SEQUENCE
142 = ( INSDC:2na:packed ) pack ( in_local_2na_bin )
143 ;
144 physical column < INSDC:4na:bin > zip_encoding .LOCAL_AMBIGUITY
145 = < INSDC:4na:bin > trim < 0, 0 > ( in_ambig_4na_bin )
146 ;
147
148 /* reading rules: unpack 2na, map it to 4na bits [ 1, 2, 4, 8 ], OR in .LOCAL_AMBIGUITY, then map 4na to text */
149 INSDC:2na:packed out_local_2na_packed
150 = .LOCAL_SEQUENCE
151 ;
152 INSDC:2na:bin out_local_2na_bin
153 = ( INSDC:2na:bin ) unpack ( out_local_2na_packed )
154 ;
155 INSDC:4na:bin out_local_2na_4na_bin
156 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_local_2na_bin )
157 ;
158 INSDC:4na:bin out_local_4na_bin
159 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_local_2na_4na_bin, .LOCAL_AMBIGUITY )
160 ;
161 INSDC:dna:text out_local_dna_text
162 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_local_4na_bin )
163 ;
164
165
166 INSDC:coord:len in_local_read_len
167 = ( INSDC:coord:len ) row_len ( in_local_2na_bin )
168 ;
169 INSDC:SRA:xread_type in_local_read_type
170 = < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > ()
171 ;
172 }
173
174
175 /*--------------------------------------------------------------------------
176 * "views" - NCBI:csra2:view:reference is declared on top of NCBI:csra2:tbl:reference and adds read-only columns
177 */
178 table NCBI:csra2:view:reference #1.0
179 = NCBI:csra2:tbl:reference #1.0
180 {
181 /* EXTERNAL
182 * may need to be a function
183 * it can test the CANONICAL_NAME as in cSRA.v1,
184 * but if internal it can also check row_length of bases. NOTE(review): as written this appears to yield false when .LOCAL_SEQUENCE exists and fall back to true otherwise - confirm `exists` semantics
185 */
186 readonly column bool EXTERNAL
187 = < bool > exists < false > ( .LOCAL_SEQUENCE )
188 | < bool > echo < true > ()
189 ;
190
191 /* SEQUENCE
192 * available as text (default), 4na (bin and packed), x2na, 2na (bin and packed)
193 */
194 default readonly column INSDC:dna:text SEQUENCE
195 = out_dna_text
196 ;
197 readonly column INSDC:4na:bin SEQUENCE
198 = out_4na_bin
199 ;
200 readonly column INSDC:4na:packed SEQUENCE
201 = ( INSDC:4na:packed ) pack ( out_4na_bin )
202 ;
203 readonly column INSDC:x2na:bin SEQUENCE
204 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin )
205 ;
206 readonly column INSDC:2na:bin SEQUENCE
207 = out_2na_bin
208 ;
209 readonly column INSDC:2na:packed SEQUENCE
210 = pack ( out_2na_bin )
211 ;
212
213 /* QUALITY
214 * This is a fake column for compatibility: out_qual_phred is the constant 30 produced by echo < 30 > ( SEQUENCE )
215 */
216 readonly column INSDC:quality:phred QUALITY
217 = out_qual_phred
218 ;
219
220 /* column aliases: MAX_SEQ_LEN reads .CHUNK_SIZE, SEQ_ID is .CANONICAL_NAME cast to ascii */
221 readonly column INSDC:coord:len MAX_SEQ_LEN = .CHUNK_SIZE;
222 readonly column ascii SEQ_ID = cast ( .CANONICAL_NAME );
223
224 /* sequence productions: currently sourced only from the local sequence of the parent table */
225 INSDC:4na:bin out_4na_bin
226 = out_local_4na_bin
227 // TODO: | sub-select from external table
228 ;
229
230 INSDC:dna:text out_dna_text
231 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin )
232 ;
233
234 INSDC:2na:bin out_2na_bin
235 = INSDC:SEQ:rand_4na_2na ( out_4na_bin )
236 ;
237
238 /* quality productions */
239 INSDC:quality:phred out_qual_phred
240 = < INSDC:quality:phred > echo < 30 > ( SEQUENCE )
241 ;
242
243 INSDC:quality:phred in_stats_qual_phred = out_qual_phred;
244 }
245