comparison libs/sratoolkit.2.8.0-centos_linux64/schema/sra/pacbio.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * NCBI PacBio Fastq Sequence Read Archive schema
29 */
30 version 1;
31
32 include 'insdc/sra.vschema';
33 include 'ncbi/sra.vschema';
34
35
36 /*--------------------------------------------------------------------------
37 * NCBI:SRA:PacBio
38 * Pacific Biosciences SRA Platform
39 *
40 * history:
41 * 1.0.2 - updated ancestry
42 * 1.0.3 - updated ancestry
43 */
// Base table for the PacBio platform: it declares no members of its own and
// only derives from NCBI:SRA:tbl:sra #2.1.3. The smrt:fastq table in this
// file lists it as a parent.
44 table NCBI:SRA:PacBio:common #1.0.3 = NCBI:SRA:tbl:sra #2.1.3
45 {
46 }
47
48 /* history:
49 * 1.0.2 - updated ancestry
50 * 1.0.3 - updated ancestry
51 */
// PacBio SMRT table for fastq-style data. Derives from the PacBio common
// table plus the base_space and phred_quality tables, and supplies the
// platform name and the trim productions below.
52 table NCBI:SRA:PacBio:smrt:fastq #1.0.3
53 = NCBI:SRA:PacBio:common #1.0.3
54 , NCBI:tbl:base_space #2.0.3
55 , NCBI:tbl:phred_quality #2.0.3
56 {
57 /* PLATFORM
58 * platform name is always "PACBIO_SMRT"
59 */
60 ascii platform_name
61 = < ascii > echo < "PACBIO_SMRT" > ();
62
63 /* TRIMMED SEQUENCE
64 * need to find the 0-based trim_start and trim_len
65 */
// bio_start comes from NCBI:SRA:bio_start applied to out_read_start and
// out_read_type; none of those three names is defined in this file
// (presumably they come from the included schemas - TODO confirm).
66 INSDC:coord:zero bio_start
67 = NCBI:SRA:bio_start ( out_read_start, out_read_type );
68
// no separate quality clip in this table: trimming starts at bio_start
69 INSDC:coord:zero trim_start = bio_start;
70
// trim_left is trim_start re-typed as U32 so it can be fed to < U32 > diff;
// trim_len is diff ( spot_len, trim_left ) - presumably spot_len minus
// trim_left, i.e. trimming runs to the end of the spot (verify diff's
// operand order against the VDB built-ins).
71 U32 trim_left = ( U32 ) trim_start;
72 INSDC:coord:len trim_len = ( INSDC:coord:len )
73 < U32 > diff ( spot_len, trim_left );
74 }
75
76 /*--------------------------------------------------------------------------
77 * NCBI:SRA:PacBio:smrt:db
78 * Pacific Biosciences SRA Platform
79 */
// Mix-in table holding the insertion / deletion / substitution columns.
// It has no parent; smrt:basecalls in this file inherits it. The three
// *_QV columns are U8 and the two *_TAG columns are INSDC:dna:text; all
// five use zip_encoding.
80 table NCBI:SRA:PacBio:smrt:indelsubst #1
81 {
82 // probability that the current base is an insertion
83 column < U8 > zip_encoding INSERTION_QV;
84
85 // probability of a deletion error following current base
86 // and identity of deleted base, if it exists
87 column < U8 > zip_encoding DELETION_QV;
88 column < INSDC:dna:text > zip_encoding DELETION_TAG;
89
90 // probability of a substitution error
91 // and most likely alternative base call
92 column < U8 > zip_encoding SUBSTITUTION_QV;
93 column < INSDC:dna:text > zip_encoding SUBSTITUTION_TAG;
94 };
95
// U8-backed status code for a hole, with nine named values 0 through 8.
// It is the element type of the HOLE_STATUS column in smrt:basecalls.
// That table also declares HOLE_STATUS_VALUE / HOLE_STATUS_VALUE_LEN for
// statuses that do not line up with these constants.
96 typedef U8 PacBio:hole:status;
97 const PacBio:hole:status PacBio:hole:SEQUENCING = 0;
98 const PacBio:hole:status PacBio:hole:ANTIHOLE = 1;
99 const PacBio:hole:status PacBio:hole:FIDUCIAL = 2;
100 const PacBio:hole:status PacBio:hole:SUSPECT = 3;
101 const PacBio:hole:status PacBio:hole:ANTIMIRROR = 4;
102 const PacBio:hole:status PacBio:hole:FDZMW = 5;
103 const PacBio:hole:status PacBio:hole:FBZMW = 6;
104 const PacBio:hole:status PacBio:hole:ANTIBEAMLET = 7;
105 const PacBio:hole:status PacBio:hole:OUTSIDEFOV = 8;
106
107 /* history:
108 * 1.0.1 - updated ancestry
109 * 1.0.2 - updated ancestry
110 */
// Shared base-call table: both smrt:sequence and smrt:cons in this file
// derive from it. It inherits spot coordinates, base space, phred quality
// and the indel/subst columns, then adds hole bookkeeping plus PacBio-named
// aliases (BASECALL, QUALITY_VALUE, NUM_EVENT) over inherited productions.
111 table NCBI:SRA:PacBio:smrt:basecalls #1.0.2
112 = INSDC:SRA:tbl:spotcoord #1
113 , NCBI:tbl:base_space #2.0.3
114 , NCBI:tbl:phred_quality #2.0.3
115 , NCBI:SRA:PacBio:smrt:indelsubst #1
116 {
117 /* PLATFORM
118 * platform name is always "PACBIO_SMRT"
119 */
120 ascii platform_name
121 = < ascii > echo < "PACBIO_SMRT" > ();
122
123 // basecalls will be routed to READ column
// readonly alias of out_dna_text, which is not defined in this file
// (presumably supplied by an inherited table - TODO confirm)
124 readonly column INSDC:dna:text BASECALL
125 = out_dna_text;
126
127 // quality value for each base
// readonly alias of out_qual_phred, likewise defined outside this file
128 readonly column INSDC:quality:phred QUALITY_VALUE
129 = out_qual_phred;
130
131 // zero-based hole number
132 column < U32 > izip_encoding HOLE_NUMBER;
133
134 // hole status
135 column < PacBio:hole:status > zip_encoding HOLE_STATUS;
136
137 // optional column pair to describe hole status
138 // when/if it does not line up with our constants above
139 column < ascii > zip_encoding HOLE_STATUS_VALUE;
140 column < INSDC:coord:len > izip_encoding HOLE_STATUS_VALUE_LEN;
141
142 // hole ( X,Y ) pair will be split and sent to X and Y columns
// HOLE_XY is a 2-element I16 vector built by pasting x_clip_I16 and
// y_clip_I16, which are casts of out_x_coord / out_y_coord down to I16.
// NOTE(review): the "clip" in the names suggests the cast clamps to the I16
// range, but nothing here shows that - confirm cast's overflow behaviour.
143 column I16 [ 2 ] HOLE_XY
144 = < I16 > paste ( x_clip_I16, y_clip_I16 );
145 I16 x_clip_I16 = cast ( out_x_coord );
146 I16 y_clip_I16 = cast ( out_y_coord );
147
// opposite direction: element 0 and element 1 are cut back out of HOLE_XY
148 I16 in_x16_coord = < I16 > cut < 0 > ( HOLE_XY );
149 I16 in_y16_coord = < I16 > cut < 1 > ( HOLE_XY );
150
// ...and re-typed as INSDC:coord:val. in_x_coord / in_y_coord are not used
// again in this file; presumably the inherited spotcoord table consumes
// them (TODO confirm).
151 INSDC:coord:val in_x_coord = cast ( in_x16_coord );
152 INSDC:coord:val in_y_coord = cast ( in_y16_coord );
153
154 // the number of bases in ZMW
// readonly alias of base_space_spot_len (defined outside this file)
155 readonly column INSDC:coord:len NUM_EVENT
156 = base_space_spot_len;
157 };
158
159 /* history:
160 * 1.0.1 - updated ancestry
161 * 1.0.2 - updated ancestry
162 */
// Sequence table: extends smrt:basecalls with sra_nopos, per-base pulse
// timing columns, a PULSE_INDEX column exposed under three types, quality
// clip columns, and trim productions that use those clips.
163 table NCBI:SRA:PacBio:smrt:sequence #1.0.2
164 = NCBI:SRA:PacBio:smrt:basecalls #1.0.2
165 , NCBI:SRA:tbl:sra_nopos #2.1.3
166 {
167 // pulse information
168 column < U16 > izip_encoding PRE_BASE_FRAMES;
169 column < U16 > izip_encoding WIDTH_IN_FRAMES;
170
171 // spot to pulse map
// PULSE_INDEX is declared three times, once per type, all backed by the
// single physical column .PULSE_INDEX declared further down:
//   INSDC:position:zero (default) - the physical value as stored
//   INSDC:position:one (readonly) - out_position, i.e. sum < 1 > of the
//     physical value (presumably value + 1, zero-based to one-based)
//   NCBI:SRA:pos16                - a cast of the physical value
172 default column INSDC:position:zero PULSE_INDEX
173 = .PULSE_INDEX;
174 readonly column INSDC:position:one PULSE_INDEX
175 = out_position;
176 INSDC:position:one out_position
177 = ( INSDC:position:one ) < INSDC:position:zero > sum < 1 > ( .PULSE_INDEX );
178
179 column NCBI:SRA:pos16 PULSE_INDEX
180 = cast ( .PULSE_INDEX );
181 NCBI:SRA:pos16 in_pulse_index16
182 = PULSE_INDEX;
183
// value to store: PULSE_INDEX taken directly as position:zero, with
// "| cast ( in_pulse_index16 )" as the alternative. Presumably the first
// alternative that resolves is used, so a pos16 input is widened only when
// no position:zero input is available (TODO confirm '|' semantics).
184 INSDC:position:zero in_pulse_index32
185 = PULSE_INDEX
186 | cast ( in_pulse_index16 );
187
// the one physical column behind all PULSE_INDEX variants
188 physical column < INSDC:position:zero > izip_encoding .PULSE_INDEX
189 = in_pulse_index32;
190
191 /* clip quality */
// left clip is typed coord:zero, right clip coord:one
192 extern column < INSDC:coord:zero > izip_encoding CLIP_QUALITY_LEFT;
193 extern column < INSDC:coord:one > izip_encoding CLIP_QUALITY_RIGHT;
194
195 /* TRIMMED SEQUENCE
196 * need to find the 0-based trim_start and trim_len
197 */
// trim_start: the stored left clip, with NCBI:SRA:bio_start ( ... ) as the
// alternative (presumably used when .CLIP_QUALITY_LEFT is absent)
198 INSDC:coord:zero trim_start
199 = .CLIP_QUALITY_LEFT
200 | NCBI:SRA:bio_start ( out_read_start, out_read_type );
201
// trim_right: the stored right clip re-typed as U32, with spot_len as the
// alternative
202 U32 trim_right
203 = ( U32 ) .CLIP_QUALITY_RIGHT
204 | spot_len;
205
// trim_len is diff ( trim_right, trim_left ) - presumably right minus left
206 U32 trim_left = ( U32 ) trim_start;
207 INSDC:coord:len trim_len = ( INSDC:coord:len )
208 < U32 > diff ( trim_right, trim_left );
209 };
210
211 /* history:
212 * 1.0.1 - updated ancestry
213 * 1.0.2 - updated ancestry
214 */
// Consensus table: extends smrt:basecalls with NCBI:SRA:tbl:sra and adds a
// NUM_PASSES column. Unlike smrt:sequence it has no clip-quality columns,
// so the trim productions are derived from bio_start and spot_len only.
215 table NCBI:SRA:PacBio:smrt:cons #1.0.2
216 = NCBI:SRA:PacBio:smrt:basecalls #1.0.2
217 , NCBI:SRA:tbl:sra #2.1.3
218 {
219 // documented in both hdf5 and xsd as signed...
220 column < I32 > izip_encoding NUM_PASSES;
221
222 /* TRIMMED SEQUENCE
223 * need to find the 0-based trim_start and trim_len
224 */
// trim_start comes straight from NCBI:SRA:bio_start; its two arguments are
// defined outside this file
225 INSDC:coord:zero trim_start
226 = NCBI:SRA:bio_start ( out_read_start, out_read_type );
227
// trim_len is diff ( spot_len, trim_left ) - presumably spot_len minus
// trim_left, i.e. trimming runs to the end of the spot
228 U32 trim_left = ( U32 ) trim_start;
229 INSDC:coord:len trim_len = ( INSDC:coord:len )
230 < U32 > diff ( spot_len, trim_left );
231 };
232
233 /* these encoding rules attempt to compress the channels individually,
234 although they may compress fine interleaved as they are... */
// Physical encoding for a 4-element F32 vector (one element per channel).
// Takes one U32 parameter, mantissa, which is handed to fzip on encode.
// The zmw_metrics table in this file instantiates it with < 24 >.
235 physical
236 F32 [ 4 ] NCBI:SRA:PacBio:smrt:F32_4ch_encoding #1.0 < U32 mantissa >
237 {
238 decode
239 {
// pull the four compressed components out of the stored value ( @ )
240 fzip_fmt cmp0 = split < 0 > ( @ );
241 fzip_fmt cmp1 = split < 1 > ( @ );
242 fzip_fmt cmp2 = split < 2 > ( @ );
243 fzip_fmt cmp3 = split < 3 > ( @ );
244
// decompress each component back to a single-channel F32 series
245 F32 ch0 = funzip ( cmp0 );
246 F32 ch1 = funzip ( cmp1 );
247 F32 ch2 = funzip ( cmp2 );
248 F32 ch3 = funzip ( cmp3 );
249
// reassemble the four channels in index order 0..3
250 return < F32 > paste ( ch0, ch1, ch2, ch3 );
251 }
252
253 encode
254 {
// mirror of decode: cut each channel out of the input vector ( @ )
255 F32 ch0 = < F32 > cut < 0 > ( @ );
256 F32 ch1 = < F32 > cut < 1 > ( @ );
257 F32 ch2 = < F32 > cut < 2 > ( @ );
258 F32 ch3 = < F32 > cut < 3 > ( @ );
259
// compress every channel separately with the same mantissa parameter
260 fzip_fmt cmp0 = fzip < mantissa > ( ch0 );
261 fzip_fmt cmp1 = fzip < mantissa > ( ch1 );
262 fzip_fmt cmp2 = fzip < mantissa > ( ch2 );
263 fzip_fmt cmp3 = fzip < mantissa > ( ch3 );
264
// combine the four components, in the same 0..3 order decode's split uses
265 return merge ( cmp0, cmp1, cmp2, cmp3 );
266 }
267 }
268
// Metrics table with no parent. Columns come in three forms:
//   - F32_4ch_encoding < 24 >: four-channel F32 vectors, where 24 is that
//     encoding's mantissa parameter (BASE_FRACTION, CHAN_*_QV, HQ_RGN_SNR)
//   - < F32 > fzip_encoding < 24 >: scalar F32 values; fzip_encoding is
//     defined outside this file, 24 is presumably also a mantissa width
//   - < I8 > zip_encoding: PRODUCTIVITY only
269 table NCBI:SRA:PacBio:smrt:zmw_metrics #1
270 {
271 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > BASE_FRACTION;
272 column < F32 > fzip_encoding < 24 > BASE_IPD;
273 column < F32 > fzip_encoding < 24 > BASE_RATE;
274 column < F32 > fzip_encoding < 24 > BASE_WIDTH;
275 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_BASE_QV;
276 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_DEL_QV;
277 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_INS_QV;
278 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > CHAN_SUB_QV;
279 column < F32 > fzip_encoding < 24 > LOCAL_BASE_RATE;
280 column < F32 > fzip_encoding < 24 > DARK_BASE_RATE;
281 column < F32 > fzip_encoding < 24 > HQ_RGN_START_TIME;
282 column < F32 > fzip_encoding < 24 > HQ_RGN_END_TIME;
283 column NCBI:SRA:PacBio:smrt:F32_4ch_encoding < 24 > HQ_RGN_SNR;
284 column < I8 > zip_encoding PRODUCTIVITY;
285 column < F32 > fzip_encoding < 24 > READ_SCORE;
286 column < F32 > fzip_encoding < 24 > READ_BASE_QV;
287 column < F32 > fzip_encoding < 24 > READ_DEL_QV;
288 column < F32 > fzip_encoding < 24 > READ_INS_QV;
289 column < F32 > fzip_encoding < 24 > READ_SUB_QV;
290 };
291
// Passes table with no parent: three U8 columns stored with zip_encoding
// and two I32 columns stored with izip_encoding. It is bound as the PASSES
// member of the smrt:db database in this file.
292 table NCBI:SRA:PacBio:smrt:passes #1
293 {
294 column < U8 > zip_encoding ADAPTER_HIT_BEFORE;
295 column < U8 > zip_encoding ADAPTER_HIT_AFTER;
296 column < U8 > zip_encoding PASS_DIRECTION;
297 column < I32 > izip_encoding PASS_NUM_BASES;
298 column < I32 > izip_encoding PASS_START_BASE;
299 };
300
// Database tying the four tables in this file together under the member
// names SEQUENCE, CONSENSUS, PASSES and ZMW_METRICS.
// NOTE(review): members are requested as #1.0, while the tables are declared
// as #1.0.2 (sequence, cons) and #1 (passes, zmw_metrics). Presumably the
// schema resolver picks the matching declared release - confirm before
// bumping any table version.
301 database NCBI:SRA:PacBio:smrt:db #1.0.1
302 {
303 table NCBI:SRA:PacBio:smrt:sequence #1.0 SEQUENCE;
304 table NCBI:SRA:PacBio:smrt:cons #1.0 CONSENSUS;
305 table NCBI:SRA:PacBio:smrt:passes #1.0 PASSES;
306 table NCBI:SRA:PacBio:smrt:zmw_metrics #1.0 ZMW_METRICS;
307 };