Mercurial > repos > charles_s_test > seqsero2
comparison libs/sratoolkit.2.8.0-centos_linux64/schema/csra2/read.vschema @ 3:38ad1130d077 draft
planemo upload commit a4fb57231f274270afbfebd47f67df05babffa4a-dirty
| author | charles_s_test |
|---|---|
| date | Mon, 27 Nov 2017 11:21:07 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:0d65b71ff8df | 3:38ad1130d077 |
|---|---|
| 1 /*=========================================================================== | |
| 2 * | |
| 3 * PUBLIC DOMAIN NOTICE | |
| 4 * National Center for Biotechnology Information | |
| 5 * | |
| 6 * This software/database is a "United States Government Work" under the | |
| 7 * terms of the United States Copyright Act. It was written as part of | |
| 8 * the author's official duties as a United States Government employee and | |
| 9 * thus cannot be copyrighted. This software/database is freely available | |
| 10 * to the public for use. The National Library of Medicine and the U.S. | |
| 11 * Government have not placed any restriction on its use or reproduction. | |
| 12 * | |
| 13 * Although all reasonable efforts have been taken to ensure the accuracy | |
| 14 * and reliability of the software and data, the NLM and the U.S. | |
| 15 * Government do not and cannot warrant the performance or results that | |
| 16 * may be obtained by using this software or data. The NLM and the U.S. | |
| 17 * Government disclaim all warranties, express or implied, including | |
| 18 * warranties of performance, merchantability or fitness for any particular | |
| 19 * purpose. | |
| 20 * | |
| 21 * Please cite the author in any work or product based on this material. | |
| 22 * | |
| 23 * =========================================================================== | |
| 24 * | |
| 25 */ | |
| 26 | |
| 27 /*========================================================================== | |
| 28 * General read table which will be inherited by others | |
| 29 */ | |
| 30 version 1; | |
| 31 | |
| 32 include 'vdb/vdb.vschema'; | |
| 33 include 'insdc/insdc.vschema'; | |
| 34 include 'csra2/stats.vschema'; | |
| 35 | |
| 36 | |
| 37 /*-------------------------------------------------------------------------- | |
| 38 * tables | |
| 39 */ | |
| 40 table NCBI:csra2:tbl:read #1.0 = NCBI:csra2:tbl:read_stats #1 | |
| 41 { | |
| 42 /* CHUNK_SZ | |
| 43 * describes the maximum number of bases in any row | |
| 44 * | |
| 45 * if present, allows a single sequence to be broken into multiple rows | |
| 46 * where this value gives the limit on the number of bases in any row. | |
| 47 * | |
| 48 * the sequence will be split across some number of rows, depending upon | |
| 49 * the value of CHUNK_SZ. if length ( seq ) > CHUNK_SZ, then there will | |
| 50 * be multiple rows, where all but the last will have a length of CHUNK_SZ. | |
| 51 * the last ( or only ) row will have a length of length(seq)%CHUNK_SZ. | |
| 52 */ | |
| 53 extern column INSDC:coord:len CHUNK_SZ; | |
| 54 | |
| 55 | |
| 56 /* READ | |
| 57 * base calls | |
| 58 */ | |
| 59 | |
| 60 // textual representation | |
| 61 extern default column INSDC:dna:text READ | |
| 62 { | |
| 63 read = out_dna_text; | |
| 64 validate = < INSDC:dna:text > compare ( in_dna_text, out_dna_text ); | |
| 65 } | |
| 66 | |
| 67 // 4na representation - unpacked | |
| 68 extern column INSDC:4na:bin READ | |
| 69 = out_4na_bin | |
| 70 ; | |
| 71 | |
| 72 | |
| 73 /* QUALITY | |
| 74 * phred-score quality values | |
| 75 */ | |
| 76 extern default column INSDC:quality:phred QUALITY | |
| 77 = out_qual_phred | |
| 78 ; | |
| 79 extern column INSDC:quality:text:phred_33 QUALITY | |
| 80 = ( INSDC:quality:text:phred_33 ) < B8 > sum < 33 > ( out_qual_phred ) | |
| 81 ; | |
| 82 extern column INSDC:quality:text:phred_64 QUALITY | |
| 83 = ( INSDC:quality:text:phred_64 ) < B8 > sum < 64 > ( out_qual_phred ) | |
| 84 ; | |
| 85 | |
| 86 /* ---------------------------- optional columns ---------------------------- */ | |
| 87 | |
| 88 /* RD_ID | |
| 89 * RD_GROUP | |
| 90 * reports group and id of current row | |
| 91 */ | |
| 92 extern column I64 RD_ID; | |
| 93 extern column ascii RD_GROUP; | |
| 94 | |
| 95 /* RD_FILTER | |
| 96 * records filter value if used | |
| 97 */ | |
| 98 extern column INSDC:SRA:read_filter RD_FILTER; | |
| 99 | |
| 100 | |
| 101 /* ---------------------------- input rules ---------------------------- */ | |
| 102 | |
| 103 // input text | |
| 104 INSDC:dna:text in_dna_text | |
| 105 = < INSDC:dna:text, INSDC:dna:text > map < '.acmgrsvtwyhkdbn','NACMGRSVTWYHKDBN' > ( READ ) | |
| 106 ; | |
| 107 | |
| 108 // input 4na bin | |
| 109 INSDC:4na:bin in_4na_bin | |
| 110 = < INSDC:4na:bin > range_validate < 0, 15 > ( READ ) | |
| 111 | < INSDC:dna:text, INSDC:4na:bin > map < INSDC:4na:map:CHARSET, INSDC:4na:map:BINSET > ( in_dna_text ) | |
| 112 ; | |
| 113 | |
| 114 // input 2na bin | |
| 115 INSDC:2na:bin in_2na_bin | |
| 116 = INSDC:SEQ:rand_4na_2na ( in_4na_bin ) | |
| 117 ; | |
| 118 | |
| 119 // input 4na alt-read ( ambiguities ) | |
| 120 INSDC:4na:bin in_alt_4na_bin | |
| 121 = < INSDC:4na:bin, INSDC:4na:bin > map < INSDC:4na:map:BINSET, [ 15,0,0,3,0,5,6,7,0,9,10,11,12,13,14,15 ] > ( in_4na_bin ) | |
| 122 ; | |
| 123 | |
| 124 // feed the statistics | |
| 125 INSDC:4na:bin in_stats_seq = in_4na_bin; | |
| 126 | |
| 127 // quality | |
| 128 INSDC:quality:text:phred_33 in_qual_text_phred_33 = QUALITY; | |
| 129 INSDC:quality:text:phred_64 in_qual_text_phred_64 = QUALITY; | |
| 130 | |
| 131 INSDC:quality:phred in_qual_phred | |
| 132 = QUALITY | |
| 133 | ( INSDC:quality:phred ) < B8 > diff < 33 > ( in_qual_text_phred_33 ) | |
| 134 | ( INSDC:quality:phred ) < B8 > diff < 64 > ( in_qual_text_phred_64 ) | |
| 135 ; | |
| 136 | |
| 137 // feed the statistics | |
| 138 INSDC:quality:phred in_stats_qual_phred = in_qual_phred; | |
| 139 | |
| 140 ascii in_stats_read_group | |
| 141 = in_stats_spot_group | |
| 142 | RD_GROUP | |
| 143 ; | |
| 144 | |
| 145 | |
| 146 /* ---------------------------- physical columns ---------------------------- */ | |
| 147 | |
| 148 physical column INSDC:2na:packed .READ | |
| 149 = ( INSDC:2na:packed ) pack ( in_2na_bin ) | |
| 150 ; | |
| 151 | |
| 152 physical column < INSDC:4na:bin > zip_encoding .ALTREAD | |
| 153 = < INSDC:4na:bin > trim < 0, 0 > ( in_alt_4na_bin ) | |
| 154 ; | |
| 155 | |
| 156 physical column < INSDC:quality:phred > delta_average_zip_encoding .QUALITY | |
| 157 = in_qual_phred | |
| 158 ; | |
| 159 | |
| 160 | |
| 161 /* ---------------------------- output rules ---------------------------- */ | |
| 162 | |
| 163 // output 2na packed | |
| 164 INSDC:2na:packed out_2na_packed | |
| 165 = .READ | |
| 166 ; | |
| 167 | |
| 168 // output 2na bin | |
| 169 INSDC:2na:bin out_2na_bin | |
| 170 = ( INSDC:2na:bin ) unpack ( out_2na_packed ) | |
| 171 ; | |
| 172 | |
| 173 // output 2na->4na bin | |
| 174 INSDC:4na:bin out_2na_4na_bin | |
| 175 = < INSDC:2na:bin, INSDC:4na:bin > map < INSDC:2na:map:BINSET, [ 1, 2, 4, 8 ] > ( out_2na_bin ) | |
| 176 ; | |
| 177 | |
| 178 // output 4na bin | |
| 179 INSDC:4na:bin out_4na_bin | |
| 180 = < INSDC:4na:bin > bit_or < ALIGN_RIGHT > ( out_2na_4na_bin, .ALTREAD ) | |
| 181 | out_2na_4na_bin | |
| 182 ; | |
| 183 | |
| 184 // output text | |
| 185 INSDC:dna:text out_dna_text | |
| 186 = < INSDC:4na:bin, INSDC:dna:text > map < INSDC:4na:map:BINSET, INSDC:4na:map:CHARSET > ( out_4na_bin ) | |
| 187 ; | |
| 188 | |
| 189 // output quality | |
| 190 INSDC:quality:phred out_qual_phred | |
| 191 = .QUALITY | |
| 192 | < INSDC:quality:phred > echo < 30 > ( out_4na_bin ) | |
| 193 ; | |
| 194 } | |
| 195 | |
| 196 | |
| 197 /*-------------------------------------------------------------------------- | |
| 198 * views | |
| 199 */ | |
| 200 table NCBI:csra2:view:read #1.0 = | |
| 201 NCBI:csra2:tbl:read #1.0 | |
| 202 { | |
| 203 /* CHUNK_SIZE | |
| 204 * describes the maximum number of bases in any row | |
| 205 * | |
| 206 * if present, allows a single sequence to be broken into multiple rows | |
| 207 * where this value gives the limit on the number of bases in any row. | |
| 208 * | |
| 209 * the sequence will be split across some number of rows, depending upon | |
| 210 * the value of CHUNK_SIZE. if length ( seq ) > CHUNK_SIZE, then there will | |
| 211 * be multiple rows, where all but the last will have a length of CHUNK_SIZE. | |
| 212 * the last ( or only ) row will have a length of length(seq)%CHUNK_SIZE. | |
| 213 */ | |
| 214 readonly column INSDC:coord:len CHUNK_SIZE | |
| 215 = .CHUNK_SZ | |
| 216 | < INSDC:coord:len > echo < 0xFFFFFFFF > () | |
| 217 ; | |
| 218 | |
| 219 /* READ | |
| 220 * generate remaining 4 types | |
| 221 */ | |
| 222 readonly column INSDC:4na:packed READ | |
| 223 = ( INSDC:4na:packed ) pack ( out_4na_bin ) | |
| 224 ; | |
| 225 readonly column INSDC:x2na:bin READ | |
| 226 = < INSDC:4na:bin, INSDC:x2na:bin > map < INSDC:4na:map:BINSET, [ 4,0,1,4,2,4,4,4,3,4,4,4,4,4,4,4 ] > ( out_4na_bin ) | |
| 227 ; | |
| 228 readonly column INSDC:2na:bin READ | |
| 229 = out_2na_bin | |
| 230 ; | |
| 231 readonly column INSDC:2na:packed READ | |
| 232 = out_2na_packed | |
| 233 ; | |
| 234 | |
| 235 /* READ_ID | |
| 236 * READ_GROUP | |
| 237 * reports group and id of current row | |
| 238 */ | |
| 239 readonly column I64 READ_ID | |
| 240 = .RD_ID | |
| 241 | row_id () | |
| 242 ; | |
| 243 readonly column ascii READ_GROUP | |
| 244 = .RD_GROUP | |
| 245 | < ascii > echo < '' > () | |
| 246 ; | |
| 247 | |
| 248 /* READ_FILTER | |
| 249 * records filter value if used | |
| 250 */ | |
| 251 readonly column INSDC:SRA:read_filter READ_FILTER | |
| 252 = .RD_FILTER | |
| 253 | < INSDC:SRA:read_filter > echo < SRA_READ_FILTER_PASS > () | |
| 254 ; | |
| 255 } |
