Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/pacbio.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author | charles_s_test |
---|---|
date | Mon, 27 Nov 2017 11:21:07 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:0d65b71ff8df | 3:38ad1130d077 |
---|---|
1 /*=========================================================================== | |
2 * | |
3 * PUBLIC DOMAIN NOTICE | |
4 * National Center for Biotechnology Information | |
5 * | |
6 * This software/database is a "United States Government Work" under the | |
7 * terms of the United States Copyright Act. It was written as part of | |
8 * the author's official duties as a United States Government employee and | |
9 * thus cannot be copyrighted. This software/database is freely available | |
10 * to the public for use. The National Library of Medicine and the U.S. | |
11 * Government have not placed any restriction on its use or reproduction. | |
12 * | |
13 * Although all reasonable efforts have been taken to ensure the accuracy | |
14 * and reliability of the software and data, the NLM and the U.S. | |
15 * Government do not and cannot warrant the performance or results that | |
16 * may be obtained by using this software or data. The NLM and the U.S. | |
17 * Government disclaim all warranties, express or implied, including | |
18 * warranties of performance, merchantability or fitness for any particular | |
19 * purpose. | |
20 * | |
21 * Please cite the author in any work or product based on this material. | |
22 * | |
23 * =========================================================================== | |
24 * | |
25 */ | |
26 | |
27 /*========================================================================== | |
28 * NCBI PacBio Fastq Sequence Read Archive schema | |
29 */ | |
30 version 1; | |
31 | |
32 include 'insdc/sra.vschema'; | |
33 include 'ncbi/sra.vschema'; | |
34 | |
35 | |
36 /*-------------------------------------------------------------------------- | |
37 * NCBI:SRA:PacBio | |
38 * Pacific Biosciences (PacBio) SRA platform; the common table only inherits NCBI:SRA:tbl:sra and declares no members of its own | |
39 * | |
40 * history: | |
41 * 1.0.2 - updated ancestry | |
42 * 1.0.3 - updated ancestry | |
43 */ | |
44 table NCBI:SRA:PacBio:common #1.0.3 = NCBI:SRA:tbl:sra #2.1.3 | |
45 { | |
46 } | |
47 | |
48 /* history: | |
49 * 1.0.2 - updated ancestry | |
50 * 1.0.3 - updated ancestry | |
51 */ | |
52 table NCBI:SRA:PacBio:smrt:fastq #1.0.3 | |
53 = NCBI:SRA:PacBio:common #1.0.3 | |
54 , NCBI:tbl:base_space #2.0.3 | |
55 , NCBI:tbl:phred_quality #2.0.3 | |
56 { | |
57 /* PLATFORM | |
58 * platform name is always "PACBIO_SMRT" | |
59 */ | |
60 ascii platform_name | |
61 = < ascii > echo < "PACBIO_SMRT" > (); | |
62 | |
63 /* TRIMMED SEQUENCE | |
64 * need to find the 0-based trim_start and trim_len | |
65 */ | |
66 INSDC:coord:zero bio_start | |
67 = NCBI:SRA:bio_start ( out_read_start, out_read_type ); | |
68 /* trimming starts at bio_start as computed above */ | |
69 INSDC:coord:zero trim_start = bio_start; | |
70 /* trim_len is diff ( spot_len, trim_left ) - presumably the span from trim_left to the end of the spot */ | |
71 U32 trim_left = ( U32 ) trim_start; | |
72 INSDC:coord:len trim_len = ( INSDC:coord:len ) | |
73 < U32 > diff ( spot_len, trim_left ); | |
74 } | |
75 | |
76 /*-------------------------------------------------------------------------- | |
77 * NCBI:SRA:PacBio:smrt:db | |
78 * Pacific Biosciences (PacBio) SRA platform - SMRT tables and database | |
79 */ | |
80 table NCBI:SRA:PacBio:smrt:indelsubst #1 | |
81 { | |
82 // probability that the current base is an insertion | |
83 column < U8 > zip_encoding INSERTION_QV; | |
84 | |
85 // probability of a deletion error following current base | |
86 // and identity of deleted base, if it exists | |
87 column < U8 > zip_encoding DELETION_QV; | |
88 column < INSDC:dna:text > zip_encoding DELETION_TAG; | |
89 | |
90 // probability of a substitution error | |
91 // and most likely alternative base call | |
92 column < U8 > zip_encoding SUBSTITUTION_QV; | |
93 column < INSDC:dna:text > zip_encoding SUBSTITUTION_TAG; | |
94 }; | |
95 /* U8 status codes; PacBio:hole:status is the element type of the HOLE_STATUS column in the basecalls table */ | |
96 typedef U8 PacBio:hole:status; | |
97 const PacBio:hole:status PacBio:hole:SEQUENCING = 0; | |
98 const PacBio:hole:status PacBio:hole:ANTIHOLE = 1; | |
99 const PacBio:hole:status PacBio:hole:FIDUCIAL = 2; | |
100 const PacBio:hole:status PacBio:hole:SUSPECT = 3; | |
101 const PacBio:hole:status PacBio:hole:ANTIMIRROR = 4; | |
102 const PacBio:hole:status PacBio:hole:FDZMW = 5; | |
103 const PacBio:hole:status PacBio:hole:FBZMW = 6; | |
104 const PacBio:hole:status PacBio:hole:ANTIBEAMLET = 7; | |
105 const PacBio:hole:status PacBio:hole:OUTSIDEFOV = 8; | |
106 | |
107 /* history: | |
108 * 1.0.1 - updated ancestry | |
109 * 1.0.2 - updated ancestry | |
110 */ | |
111 table NCBI:SRA:PacBio:smrt:basecalls #1.0.2 | |
112 = INSDC:SRA:tbl:spotcoord #1 | |
113 , NCBI:tbl:base_space #2.0.3 | |
114 , NCBI:tbl:phred_quality #2.0.3 | |
115 , NCBI:SRA:PacBio:smrt:indelsubst #1 | |
116 { | |
117 /* PLATFORM | |
118 * platform name is always "PACBIO_SMRT" | |
119 */ | |
120 ascii platform_name | |
121 = < ascii > echo < "PACBIO_SMRT" > (); | |
122 | |
123 // basecalls will be routed to READ column | |
124 readonly column INSDC:dna:text BASECALL | |
125 = out_dna_text; | |
126 | |
127 // quality value for each base | |
128 readonly column INSDC:quality:phred QUALITY_VALUE | |
129 = out_qual_phred; | |
130 | |
131 // zero-based hole number | |
132 column < U32 > izip_encoding HOLE_NUMBER; | |
133 | |
134 // hole status | |
135 column < PacBio:hole:status > zip_encoding HOLE_STATUS; | |
136 | |
137 // optional column pair to describe hole status | |
138 // when/if it does not line up with our constants above | |
139 column < ascii > zip_encoding HOLE_STATUS_VALUE; | |
140 column < INSDC:coord:len > izip_encoding HOLE_STATUS_VALUE_LEN; | |
141 | |
142 // hole ( X,Y ) pair will be split and sent to X and Y columns | |
143 column I16 [ 2 ] HOLE_XY | |
144 = < I16 > paste ( x_clip_I16, y_clip_I16 ); | |
145 I16 x_clip_I16 = cast ( out_x_coord ); | |
146 I16 y_clip_I16 = cast ( out_y_coord ); | |
147 /* cut HOLE_XY back into its two I16 elements: index 0 feeds x, index 1 feeds y */ | |
148 I16 in_x16_coord = < I16 > cut < 0 > ( HOLE_XY ); | |
149 I16 in_y16_coord = < I16 > cut < 1 > ( HOLE_XY ); | |
150 /* cast the I16 elements to INSDC:coord:val */ | |
151 INSDC:coord:val in_x_coord = cast ( in_x16_coord ); | |
152 INSDC:coord:val in_y_coord = cast ( in_y16_coord ); | |
153 | |
154 // the number of bases in ZMW | |
155 readonly column INSDC:coord:len NUM_EVENT | |
156 = base_space_spot_len; | |
157 }; | |
158 | |
159 /* history: | |
160 * 1.0.1 - updated ancestry | |
161 * 1.0.2 - updated ancestry | |
162 */ | |
163 table NCBI:SRA:PacBio:smrt:sequence #1.0.2 | |
164 = NCBI:SRA:PacBio:smrt:basecalls #1.0.2 | |
165 , NCBI:SRA:tbl:sra_nopos #2.1.3 | |
166 { | |
167 // pulse information | |
168 column < U16 > izip_encoding PRE_BASE_FRAMES; | |
169 column < U16 > izip_encoding WIDTH_IN_FRAMES; | |
170 | |
171 // spot to pulse map: the default view is position:zero; the readonly position:one view is sum < 1 > of the stored values | |
172 default column INSDC:position:zero PULSE_INDEX | |
173 = .PULSE_INDEX; | |
174 readonly column INSDC:position:one PULSE_INDEX | |
175 = out_position; | |
176 INSDC:position:one out_position | |
177 = ( INSDC:position:one ) < INSDC:position:zero > sum < 1 > ( .PULSE_INDEX ); | |
178 /* NCBI:SRA:pos16 view of PULSE_INDEX, cast from the physical column */ | |
179 column NCBI:SRA:pos16 PULSE_INDEX | |
180 = cast ( .PULSE_INDEX ); | |
181 NCBI:SRA:pos16 in_pulse_index16 | |
182 = PULSE_INDEX; | |
183 /* value stored in .PULSE_INDEX: PULSE_INDEX as position:zero, or else the pos16 input cast ( alternatives presumably tried in order ) */ | |
184 INSDC:position:zero in_pulse_index32 | |
185 = PULSE_INDEX | |
186 | cast ( in_pulse_index16 ); | |
187 | |
188 physical column < INSDC:position:zero > izip_encoding .PULSE_INDEX | |
189 = in_pulse_index32; | |
190 | |
191 /* clip quality */ | |
192 extern column < INSDC:coord:zero > izip_encoding CLIP_QUALITY_LEFT; | |
193 extern column < INSDC:coord:one > izip_encoding CLIP_QUALITY_RIGHT; | |
194 | |
195 /* TRIMMED SEQUENCE | |
196 * need to find the 0-based trim_start and trim_len; trim_start is .CLIP_QUALITY_LEFT, with the computed bio_start as the alternative | |
197 */ | |
198 INSDC:coord:zero trim_start | |
199 = .CLIP_QUALITY_LEFT | |
200 | NCBI:SRA:bio_start ( out_read_start, out_read_type ); | |
201 /* right edge is .CLIP_QUALITY_RIGHT, with spot_len as the alternative */ | |
202 U32 trim_right | |
203 = ( U32 ) .CLIP_QUALITY_RIGHT | |
204 | spot_len; | |
205 | |
206 U32 trim_left = ( U32 ) trim_start; | |
207 INSDC:coord:len trim_len = ( INSDC:coord:len ) | |
208 < U32 > diff ( trim_right, trim_left ); | |
209 }; | |
210 | |
211 /* history: | |
212 * 1.0.1 - updated ancestry | |
213 * 1.0.2 - updated ancestry | |
214 */ | |
215 table NCBI:SRA:PacBio:smrt:cons #1.0.2 | |
216 = NCBI:SRA:PacBio:smrt:basecalls #1.0.2 | |
217 , NCBI:SRA:tbl:sra #2.1.3 | |
218 { | |
219 // documented in both hdf5 and xsd as signed... | |
220 column < I32 > izip_encoding NUM_PASSES; | |
221 | |
222 /* TRIMMED SEQUENCE | |
223 * need to find the 0-based trim_start and trim_len | |
224 */ | |
225 INSDC:coord:zero trim_start | |
226 = NCBI:SRA:bio_start ( out_read_start, out_read_type ); | |
227 /* no clip columns are consulted in this table: trim_len is taken against the full spot_len */ | |
228 U32 trim_left = ( U32 ) trim_start; | |
229 INSDC:coord:len trim_len = ( INSDC:coord:len ) | |
230 < U32 > diff ( spot_len, trim_left ); | |
231 }; | |
232 | |
233 /* these encoding rules attempt to compress the channels individually, | |
234 although they may compress fine interleaved as they are... */ | |
235 physical | |
236 F32 [ 4 ] NCBI:SRA:PacBio:smrt:F32_4ch_encoding #1.0 < U32 mantissa > | |
237 { | |
238 decode | |
239 { | |
240 fzip_fmt cmp0 = split < 0 > ( @ ); | |
241 fzip_fmt cmp1 = split < 1 > ( @ ); | |
242 fzip_fmt cmp2 = split < 2 > ( @ ); | |
243 fzip_fmt cmp3 = split < 3 > ( @ ); | |
244 /* funzip each of the four split streams separately */ | |
245 F32 ch0 = funzip ( cmp0 ); | |
246 F32 ch1 = funzip ( cmp1 ); | |
247 F32 ch2 = funzip ( cmp2 ); | |
248 F32 ch3 = funzip ( cmp3 ); | |
249 /* paste the four channels back together as the decoded result */ | |
250 return < F32 > paste ( ch0, ch1, ch2, ch3 ); | |
251 } | |
252 | |
253 encode | |
254 { | |
255 F32 ch0 = < F32 > cut < 0 > ( @ ); | |
256 F32 ch1 = < F32 > cut < 1 > ( @ ); | |
257 F32 ch2 = < F32 > cut < 2 > ( @ ); | |
258 F32 ch3 = < F32 > cut < 3 > ( @ ); | |
259 /* fzip each channel separately, passing through the "mantissa" schema parameter */ | |
260 fzip_fmt cmp0 = fzip < mantissa > ( ch0 ); | |
261 fzip_fmt cmp1 = fzip < mantissa > ( ch1 ); | |
262 fzip_fmt cmp2 = fzip < mantissa > ( ch2 ); | |
263 fzip_fmt cmp3 = fzip < mantissa > ( ch3 ); | |
264 /* merge the four compressed channels; decode reverses this with split < 0 > .. split < 3 > */ | |
265 return merge ( cmp0, cmp1, cmp2, cmp3 ); | |
266 } | |
267 } | |
268 /* ZMW metrics table: F32 columns use fzip_encoding or the 4-channel encoding, all with parameter 24; PRODUCTIVITY is I8 with zip_encoding */ | |
269 table NCBI:SRA:PacBio:smrt:zmw_metrics #1 | |
270 { | |
271 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > BASE_FRACTION; | |
272 column < F32 > fzip_encoding < 24 > BASE_IPD; | |
273 column < F32 > fzip_encoding < 24 > BASE_RATE; | |
274 column < F32 > fzip_encoding < 24 > BASE_WIDTH; | |
275 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_BASE_QV; | |
276 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_DEL_QV; | |
277 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_INS_QV; | |
278 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_SUB_QV; | |
279 column < F32 > fzip_encoding < 24 > LOCAL_BASE_RATE; | |
280 column < F32 > fzip_encoding < 24 > DARK_BASE_RATE; | |
281 column < F32 > fzip_encoding < 24 > HQ_RGN_START_TIME; | |
282 column < F32 > fzip_encoding < 24 > HQ_RGN_END_TIME; | |
283 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > HQ_RGN_SNR; | |
284 column < I8 > zip_encoding PRODUCTIVITY; | |
285 column < F32 > fzip_encoding < 24 > READ_SCORE; | |
286 column < F32 > fzip_encoding < 24 > READ_BASE_QV; | |
287 column < F32 > fzip_encoding < 24 > READ_DEL_QV; | |
288 column < F32 > fzip_encoding < 24 > READ_INS_QV; | |
289 column < F32 > fzip_encoding < 24 > READ_SUB_QV; | |
290 }; | |
291 /* passes table: adapter-hit and direction columns are U8 with zip_encoding; base count and start base are I32 with izip_encoding */ | |
292 table NCBI:SRA:PacBio:smrt:passes #1 | |
293 { | |
294 column < U8 > zip_encoding ADAPTER_HIT_BEFORE; | |
295 column < U8 > zip_encoding ADAPTER_HIT_AFTER; | |
296 column < U8 > zip_encoding PASS_DIRECTION; | |
297 column < I32 > izip_encoding PASS_NUM_BASES; | |
298 column < I32 > izip_encoding PASS_START_BASE; | |
299 }; | |
300 /* database with four member tables: SEQUENCE, CONSENSUS, PASSES and ZMW_METRICS */ | |
301 database NCBI:SRA:PacBio:smrt:db #1.0.1 | |
302 { | |
303 table NCBI:SRA:PacBio:smrt:sequence #1.0 SEQUENCE; | |
304 table NCBI:SRA:PacBio:smrt:cons #1.0 CONSENSUS; | |
305 table NCBI:SRA:PacBio:smrt:passes #1.0 PASSES; | |
306 table NCBI:SRA:PacBio:smrt:zmw_metrics #1.0 ZMW_METRICS; | |
307 }; | |