Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/reference.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * VDB Alignment types, functions and tables | |
29 */ | |
30 version 1; | |
31 | |
32 include 'vdb/vdb.vschema'; | |
33 include 'csra2/stats.vschema'; | |
34 | |
35 | |
36 /*-------------------------------------------------------------------------- | |
37 * tables | |
38 */ | |
39 table NCBI:csra2:tbl:reference #1.0 | |
40 = NCBI:csra2:tbl:read_stats #1 | |
41 { /* per-chunk reference sequence storage plus alignment-id indices, overlap hints and coverage statistics */ | |
42 /* CHUNK_SIZE | |
43 * describes the maximum number of bases in any cell | |
44 */ | |
45 extern column INSDC:coord:len CHUNK_SIZE; | |
46 | |
47 /* CIRCULAR | |
48 * true if the reference is circular | |
49 */ | |
50 extern column bool CIRCULAR; | |
51 | |
52 /* CANONICAL_NAME | |
53 * this should be an accessioned proper name | |
54 */ | |
55 extern column utf8 CANONICAL_NAME; | |
56 | |
57 /* COMMON_NAME | |
58 * this name may be ambiguous or missing entirely | |
59 */ | |
60 extern column utf8 COMMON_NAME; | |
61 | |
62 /* LOCAL_SEQUENCE | |
63 * supports name overloading by type: INSDC:dna:text (default) or INSDC:4na:bin | |
64 */ | |
65 extern default column INSDC:dna:text LOCAL_SEQUENCE | |
66 { | |
67 read = out_local_dna_text; | |
68 validate = < INSDC:dna:text > compare ( in_local_dna_text, out_local_dna_text ); | |
69 } | |
70 extern column INSDC:4na:bin LOCAL_SEQUENCE = out_local_4na_bin; | |
71 | |
72 /* PRIMARY_ALIGNMENT_IDS | |
73 * SECONDARY_ALIGNMENT_IDS | |
74 * an index to rows in the PRIMARY_ALIGNMENT and | |
75 * SECONDARY_ALIGNMENT tables having alignments | |
76 * STARTING within this chunk | |
77 * | |
78 * the indices MUST be sorted in clustered order, | |
79 * meaning that they are in ascending numeric order | |
80 */ | |
81 extern column < I64 > izip_encoding PRIMARY_ALIGNMENT_IDS; | |
82 extern column < I64 > izip_encoding SECONDARY_ALIGNMENT_IDS; | |
83 | |
84 /* OVERLAP_REF_POS | |
85 * min ( REF_POS ) for all alignments intersecting this chunk | |
86 * but starting in a previous chunk, where the stored position | |
87 * is in reference coordinates. | |
88 * | |
89 * a value of 0 indicates that no alignments starting to | |
90 * the left of this chunk also intersect with it. | |
91 */ | |
92 extern column < INSDC:coord:zero > izip_encoding OVERLAP_REF_POS; | |
93 | |
94 /* OVERLAP_REF_LEN | |
95 * max ( REF_POS + REF_LEN - CHUNK_START ) % CHUNK_SIZE | |
96 * for all alignments intersecting this chunk but starting | |
97 * in a previous chunk. | |
98 * | |
99 * indicates the amount of this chunk that is needed by | |
100 * alignments not starting within this chunk. so if a slice on | |
101 * this reference were to start at 100 bases into this chunk, | |
102 * for example, and the OVERLAP_REF_LEN were 100 or less, then | |
103 * there are no alignments from prior chunks that need to be | |
104 * considered. | |
105 */ | |
106 extern column < INSDC:coord:len > izip_encoding OVERLAP_REF_LEN; | |
107 | |
108 /* COVERAGE | |
109 * graphing statistics for the chunk | |
110 */ | |
111 | |
112 // clipped at 255 | |
113 extern column < U8 > izip_encoding CGRAPH_HIGH; | |
114 extern column < U8 > izip_encoding CGRAPH_LOW; | |
115 | |
116 // count of the number of mismatches in the chunk | |
117 extern column < U32 > izip_encoding CGRAPH_MISMATCHES; | |
118 | |
119 // count of the number of inserts and deletes in the chunk | |
120 extern column < U32 > izip_encoding CGRAPH_INDELS; | |
121 | |
122 | |
123 /* writing rules */ | |
124 INSDC:dna:text in_local_dna_text | |
125 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( LOCAL_SEQUENCE ); | |
126 ; /* NOTE(review): redundant empty statement - the production above is already terminated */ | |
127 INSDC:4na:bin in_local_4na_bin | |
128 = < INSDC:4na:bin > range_validate < 0, 15 > ( LOCAL_SEQUENCE ) | |
129 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_local_dna_text ) | |
130 ; | |
131 INSDC:2na:bin in_local_2na_bin | |
132 = INSDC:SEQ:rand_4na_2na ( in_local_4na_bin ) | |
133 ; | |
134 INSDC:4na:bin in_ambig_4na_bin | |
135 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_local_4na_bin ); | |
136 ; /* NOTE(review): redundant empty statement - the production above is already terminated */ | |
137 | |
138 INSDC:4na:bin in_stats_seq = in_local_4na_bin; | |
139 | |
140 /* physical columns for sequence */ | |
141 physical column INSDC:2na:packed .LOCAL_SEQUENCE | |
142 = ( INSDC:2na:packed ) pack ( in_local_2na_bin ) | |
143 ; | |
144 physical column < INSDC:4na:bin > zip_encoding .LOCAL_AMBIGUITY | |
145 = < INSDC:4na:bin > trim < 0, 0 > ( in_ambig_4na_bin ) | |
146 ; | |
147 | |
148 /* reading rules */ | |
149 INSDC:2na:packed out_local_2na_packed | |
150 = .LOCAL_SEQUENCE | |
151 ; | |
152 INSDC:2na:bin out_local_2na_bin | |
153 = ( INSDC:2na:bin ) unpack ( out_local_2na_packed ) | |
154 ; | |
155 INSDC:4na:bin out_local_2na_4na_bin | |
156 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_local_2na_bin ); | |
157 ; /* NOTE(review): redundant empty statement - the production above is already terminated */ | |
158 INSDC:4na:bin out_local_4na_bin | |
159 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_local_2na_4na_bin, .LOCAL_AMBIGUITY ) | |
160 ; | |
161 INSDC:dna:text out_local_dna_text | |
162 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_local_4na_bin ) | |
163 ; | |
164 | |
165 | |
166 INSDC:coord:len in_local_read_len | |
167 = ( INSDC:coord:len ) row_len ( in_local_2na_bin ) | |
168 ; | |
169 INSDC:SRA:xread_type in_local_read_type | |
170 = < INSDC:SRA:xread_type > echo < SRA_READ_TYPE_BIOLOGICAL > () | |
171 ; | |
172 } | |
173 | |
174 | |
175 /*-------------------------------------------------------------------------- | |
176 * "views" | |
177 */ | |
178 table NCBI:csra2:view:reference #1.0 | |
179 = NCBI:csra2:tbl:reference #1.0 | |
180 { /* read-only presentation of the reference table: SEQUENCE in several encodings, a synthetic QUALITY column, and column aliases */ | |
181 /* EXTERNAL | |
182 * may need to be a function | |
183 * it can test the CANONICAL_NAME as in cSRA.v1, | |
184 * but if internal it can also check row_length of bases | |
185 */ | |
186 readonly column bool EXTERNAL | |
187 = < bool > exists < false > ( .LOCAL_SEQUENCE ) | |
188 | < bool > echo < true > () | |
189 ; | |
190 | |
191 /* SEQUENCE | |
192 * available as text (default), 4na (bin and packed), x2na, 2na (bin and packed) | |
193 */ | |
194 default readonly column INSDC:dna:text SEQUENCE | |
195 = out_dna_text | |
196 ; | |
197 readonly column INSDC:4na:bin SEQUENCE | |
198 = out_4na_bin | |
199 ; | |
200 readonly column INSDC:4na:packed SEQUENCE | |
201 = ( INSDC:4na:packed ) pack ( out_4na_bin ) | |
202 ; | |
203 readonly column INSDC:x2na:bin SEQUENCE | |
204 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ) | |
205 ; | |
206 readonly column INSDC:2na:bin SEQUENCE | |
207 = out_2na_bin | |
208 ; | |
209 readonly column INSDC:2na:packed SEQUENCE | |
210 = pack ( out_2na_bin ) | |
211 ; | |
212 | |
213 /* QUALITY | |
214 * This is a fake column for compatibility; it is produced by echoing the constant 30 | |
215 */ | |
216 readonly column INSDC:quality:phred QUALITY | |
217 = out_qual_phred | |
218 ; | |
219 | |
220 /* column aliases */ | |
221 readonly column INSDC:coord:len MAX_SEQ_LEN = .CHUNK_SIZE; | |
222 readonly column ascii SEQ_ID = cast ( .CANONICAL_NAME ); | |
223 | |
224 /* sequence productions */ | |
225 INSDC:4na:bin out_4na_bin | |
226 = out_local_4na_bin | |
227 // TODO: | sub-select from external table | |
228 ; | |
229 | |
230 INSDC:dna:text out_dna_text | |
231 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) | |
232 ; | |
233 | |
234 INSDC:2na:bin out_2na_bin | |
235 = INSDC:SEQ:rand_4na_2na ( out_4na_bin ) | |
236 ; | |
237 | |
238 /* quality productions */ | |
239 INSDC:quality:phred out_qual_phred | |
240 = < INSDC:quality:phred > echo < 30 > ( SEQUENCE ) | |
241 ; | |
242 | |
243 INSDC:quality:phred in_stats_qual_phred = out_qual_phred; | |
244 } | |
245 |