comparison libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/read.vschema @ 3:38ad1130d077 draft

planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
author charles_s_test
date Mon, 27 Nov 2017 11:21:07 -0500
parents
children
comparison
equal deleted inserted replaced
2:0d65b71ff8df 3:38ad1130d077
1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 /*==========================================================================
28 * General read table which will be inherited by others
29 */
30 version 1;
31
32 include 'vdb/vdb.vschema';
33 include 'insdc/insdc.vschema';
34 include 'csra2/stats.vschema';
35
36
37 /*--------------------------------------------------------------------------
38 * tables - NCBI:csra2:tbl:read #1.0 extends NCBI:csra2:tbl:read_stats #1 with READ, QUALITY and optional RD_* columns
39 */
40 table NCBI:csra2:tbl:read #1.0 = NCBI:csra2:tbl:read_stats #1
41 {
42 /* CHUNK_SZ
43 * describes the maximum number of bases in any row
44 *
45 * if present, allows a single sequence to be broken into multiple rows
46 * where this value gives the limit on the number of bases in any row.
47 *
48 * the sequence will be split across some number of rows, depending upon
49 * the value of CHUNK_SZ. if length ( seq ) > CHUNK_SZ, then there will
50 * be multiple rows, where all but the last will have a length of CHUNK_SZ.
51 * the last ( or only ) row will have a length of length(seq)%CHUNK_SZ ( presumably a full CHUNK_SZ when that remainder is 0 - TODO confirm ).
52 */
53 extern column INSDC:coord:len CHUNK_SZ;
54
55
56 /* READ
57 * base calls
58 */
59
60 // textual representation - default READ column: reads out_dna_text, validates by comparing in_dna_text with out_dna_text
61 extern default column INSDC:dna:text READ
62 {
63 read = out_dna_text;
64 validate = < INSDC:dna:text > compare ( in_dna_text, out_dna_text );
65 }
66
67 // 4na representation - unpacked
68 extern column INSDC:4na:bin READ
69 = out_4na_bin
70 ;
71
72
73 /* QUALITY
74 * phred-score quality values ( binary default; phred_33 / phred_64 text forms add 33 / 64 to out_qual_phred )
75 */
76 extern default column INSDC:quality:phred QUALITY
77 = out_qual_phred
78 ;
79 extern column INSDC:quality:text:phred_33 QUALITY
80 = ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( out_qual_phred )
81 ;
82 extern column INSDC:quality:text:phred_64 QUALITY
83 = ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( out_qual_phred )
84 ;
85
86 /* ---------------------------- optional columns ---------------------------- */
87
88 /* RD_ID
89 * RD_GROUP
90 * reports group and id of current row
91 */
92 extern column I64 RD_ID;
93 extern column ascii RD_GROUP;
94
95 /* RD_FILTER
96 * records filter value if used
97 */
98 extern column INSDC:SRA:read_filter RD_FILTER;
99
100
101 /* ---------------------------- input rules ---------------------------- */
102
103 // input text - maps '.' to 'N' and the lower-case base codes to their upper-case forms
104 INSDC:dna:text in_dna_text
105 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ )
106 ;
107
108 // input 4na bin - READ range-validated to 0..15, with the alternative of mapping in_dna_text through CHARSET -> BINSET
109 INSDC:4na:bin in_4na_bin
110 = < INSDC:4na:bin > range_validate < 0, 15 > ( READ )
111 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text )
112 ;
113
114 // input 2na bin - derived from in_4na_bin by INSDC:SEQ:rand_4na_2na
115 INSDC:2na:bin in_2na_bin
116 = INSDC:SEQ:rand_4na_2na ( in_4na_bin )
117 ;
118
119 // input 4na alt-read ( ambiguities ) - NOTE(review): assuming BINSET is 0..15 in order, codes 1,2,4,8 map to 0, code 0 maps to 15, all others are kept - confirm
120 INSDC:4na:bin in_alt_4na_bin
121 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin )
122 ;
123
124 // feed the statistics ( bases, as 4na bin )
125 INSDC:4na:bin in_stats_seq = in_4na_bin;
126
127 // quality - text inputs are converted to binary phred by subtracting their 33 / 64 offset
128 INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY;
129 INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY;
130
131 INSDC:quality:phred in_qual_phred
132 = QUALITY
133 | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 )
134 | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 )
135 ;
136
137 // feed the statistics ( binary phred quality, and the group name from in_stats_spot_group or RD_GROUP )
138 INSDC:quality:phred in_stats_qual_phred = in_qual_phred;
139
140 ascii in_stats_read_group
141 = in_stats_spot_group
142 | RD_GROUP
143 ;
144
145
146 /* ---------------------------- physical columns ---------------------------- */
147
148 physical column INSDC:2na:packed .READ
149 = ( INSDC:2na:packed ) pack ( in_2na_bin )
150 ;
151
152 physical column < INSDC:4na:bin > zip_encoding .ALTREAD
153 = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin )
154 ;
155
156 physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY
157 = in_qual_phred
158 ;
159
160
161 /* ---------------------------- output rules ---------------------------- */
162
163 // output 2na packed - the stored .READ column as is
164 INSDC:2na:packed out_2na_packed
165 = .READ
166 ;
167
168 // output 2na bin - unpacked form of out_2na_packed
169 INSDC:2na:bin out_2na_bin
170 = ( INSDC:2na:bin ) unpack ( out_2na_packed )
171 ;
172
173 // output 2na->4na bin - maps the 2na BINSET values onto the 4na codes 1, 2, 4, 8
174 INSDC:4na:bin out_2na_4na_bin
175 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin )
176 ;
177
178 // output 4na bin - out_2na_4na_bin bit-OR'ed with .ALTREAD, with the alternative of out_2na_4na_bin alone
179 INSDC:4na:bin out_4na_bin
180 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD )
181 | out_2na_4na_bin
182 ;
183
184 // output text - out_4na_bin mapped through BINSET -> CHARSET
185 INSDC:dna:text out_dna_text
186 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin )
187 ;
188
189 // output quality - stored .QUALITY, with the alternative of echoing the constant 30 over out_4na_bin ( presumably one value per base - confirm )
190 INSDC:quality:phred out_qual_phred
191 = .QUALITY
192 | < INSDC:quality:phred > echo < 30 > ( out_4na_bin )
193 ;
194 }
195
196
197 /*--------------------------------------------------------------------------
198 * views - NCBI:csra2:view:read #1.0 adds readonly columns on top of NCBI:csra2:tbl:read #1.0
199 */
200 table NCBI:csra2:view:read #1.0 =
201 NCBI:csra2:tbl:read #1.0
202 {
203 /* CHUNK_SIZE
204 * describes the maximum number of bases in any row ( readonly; taken from .CHUNK_SZ, with the alternative of 0xFFFFFFFF )
205 *
206 * if present, allows a single sequence to be broken into multiple rows
207 * where this value gives the limit on the number of bases in any row.
208 *
209 * the sequence will be split across some number of rows, depending upon
210 * the value of CHUNK_SIZE. if length ( seq ) > CHUNK_SIZE, then there will
211 * be multiple rows, where all but the last will have a length of CHUNK_SIZE.
212 * the last ( or only ) row will have a length of length(seq)%CHUNK_SIZE ( presumably a full CHUNK_SIZE when that remainder is 0 - TODO confirm ).
213 */
214 readonly column INSDC:coord:len CHUNK_SIZE
215 = .CHUNK_SZ
216 | < INSDC:coord:len > echo < 0xFFFFFFFF > ()
217 ;
218
219 /* READ
220 * generate remaining 4 types: 4na packed, x2na bin, 2na bin and 2na packed
221 */
222 readonly column INSDC:4na:packed READ
223 = ( INSDC:4na:packed ) pack ( out_4na_bin )
224 ;
225 readonly column INSDC:x2na:bin READ
226 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin )
227 ;
228 readonly column INSDC:2na:bin READ
229 = out_2na_bin
230 ;
231 readonly column INSDC:2na:packed READ
232 = out_2na_packed
233 ;
234
235 /* READ_ID
236 * READ_GROUP
237 * reports group and id of current row; alternatives are row_id () for READ_ID and an empty string for READ_GROUP
238 */
239 readonly column I64 READ_ID
240 = .RD_ID
241 | row_id ()
242 ;
243 readonly column ascii READ_GROUP
244 = .RD_GROUP
245 | < ascii > echo < '' > ()
246 ;
247
248 /* READ_FILTER
249 * records filter value if used; alternative is SRA_READ_FILTER_PASS
250 */
251 readonly column INSDC:SRA:read_filter READ_FILTER
252 = .RD_FILTER
253 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > ()
254 ;
255 }