Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/ClusterIO/dbsnp.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: dbsnp.pm,v 1.7.2.1 2003/08/21 21:07:06 allenday Exp $ | |
2 # BioPerl module for Bio::ClusterIO::dbsnp | |
3 # | |
4 # Copyright Allen Day <allenday@ucla.edu>, Stan Nelson <snelson@ucla.edu> | |
5 # Human Genetics, UCLA Medical School, University of California, Los Angeles | |
6 | |
7 # POD documentation - main docs before the code | |
8 | |
9 =head1 NAME | |
10 | |
11 Bio::ClusterIO::dbsnp - dbSNP input stream | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 Do not use this module directly. Use it via the Bio::ClusterIO class. | |
16 | |
17 =head1 DESCRIPTION | |
18 | |
19 Parse dbSNP XML files, one refSNP entry at a time. | |
20 | |
21 =head1 FEEDBACK | |
22 | |
23 =head2 Mailing Lists | |
24 | |
25 User feedback is an integral part of the evolution of this and other | |
26 Bioperl modules. Send your comments and suggestions preferably to one | |
27 of the Bioperl mailing lists. Your participation is much appreciated. | |
28 | |
29 bioperl-l@bioperl.org - General discussion | |
30 http://bioperl.org/MailList.shtml - About the mailing lists | |
31 | |
32 =head2 Reporting Bugs | |
33 | |
34 Report bugs to the Bioperl bug tracking system to help us keep track | |
35 the bugs and their resolution. | |
36 Bug reports can be submitted via email or the web: | |
37 | |
38 bioperl-bugs@bio.perl.org | |
39 http://bugzilla.bioperl.org/ | |
40 | |
41 =head1 AUTHOR | |
42 | |
43 Allen Day E<lt>allenday@ucla.eduE<gt> | |
44 | |
45 =head1 APPENDIX | |
46 | |
47 The rest of the documentation details each of the object | |
48 methods. Internal methods are usually preceded with a _ | |
49 | |
50 =cut | |
51 | |
52 # Let the code begin... | |
53 package Bio::ClusterIO::dbsnp; | |
54 | |
55 use strict; | |
56 use Bio::Root::Root; | |
57 use Bio::ClusterIO; | |
58 use Bio::Variation::SNP; | |
59 use XML::Parser::PerlSAX; | |
60 use XML::Handler::Subs; | |
61 use Data::Dumper; | |
62 use IO::File; | |
63 | |
64 use vars qw(@ISA $DTD $DEBUG %MODEMAP %MAPPING); | |
65 $DTD = 'ftp://ftp.ncbi.nih.gov/snp/specs/NSE.dtd'; | |
66 @ISA = qw(Bio::ClusterIO); | |
67 | |
68 BEGIN { | |
69 %MAPPING = ( | |
70 #the ones commented out i haven't written methods for yet... -Allen | |
71 'NSE-rs_refsnp-id' => 'id', | |
72 # 'NSE-rs_taxid' => 'tax_id', | |
73 # 'NSE-rs_organism' => 'organism', | |
74 'NSE-rs_snp-type' => {'type' => 'value'}, | |
75 'NSE-rs_observed' => 'observed', | |
76 'NSE-rs_seq-5_E' => 'seq_5', | |
77 'NSE-rs_seq-3_E' => 'seq_3', | |
78 # 'NSE-rs_seq-ss-exemplar' => 'exemplar_subsnp', | |
79 'NSE-rs_ncbi-build-id' => 'ncbi_build', | |
80 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits', | |
81 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits', | |
82 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc', | |
83 # 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight', | |
84 'NSE-rs_ucsc-build-id' => 'ucsc_build', | |
85 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits', | |
86 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits', | |
87 # 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight', | |
88 'NSE-rs_het' => 'heterozygous', | |
89 'NSE-rs_het-SE' => 'heterozygous_SE', | |
90 'NSE-rs_validated' => {'validated' => 'value'}, | |
91 'NSE-rs_genotype' => {'genotype' => 'value'}, | |
92 | |
93 'NSE-ss_handle' => 'handle', | |
94 'NSE-ss_batch-id' => 'batch_id', | |
95 'NSE-ss_subsnp-id' => 'id', | |
96 # 'NSE-ss_loc-snp-id' => 'loc_id', | |
97 # 'NSE-ss_orient' => {'orient' => 'value'}, | |
98 # 'NSE-ss_build-id' => 'build', | |
99 'NSE-ss_method-class' => {'method' => 'value'}, | |
100 # 'NSE-ss_accession_E' => 'accession', | |
101 # 'NSE-ss_comment_E' => 'comment', | |
102 # 'NSE-ss_genename' => 'gene_name', | |
103 # 'NSE-ss_assay-5_E' => 'seq_5', | |
104 # 'NSE-ss_assay-3_E' => 'seq_3', | |
105 # 'NSE-ss_observed' => 'observed', | |
106 | |
107 # 'NSE-ss-popinfo_type' => 'pop_type', | |
108 # 'NSE-ss-popinfo_batch-id' => 'pop_batch_id', | |
109 # 'NSE-ss-popinfo_pop-name' => 'pop_name', | |
110 # 'NSE-ss-popinfo_samplesize' => 'pop_samplesize', | |
111 # 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous', | |
112 # 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq', | |
113 | |
114 # 'NSE-ss-alleleinfo_type' => 'allele_type', | |
115 # 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id', | |
116 # 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id', | |
117 # 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp', | |
118 # 'NSE-ss-alleleinfo_other-allele' => 'allele_other', | |
119 # 'NSE-ss-alleleinfo_freq' => 'allele_freq', | |
120 # 'NSE-ss-alleleinfo_count' => 'allele_count', | |
121 | |
122 # 'NSE-rsContigHit_contig-id' => 'contig_hit', | |
123 # 'NSE-rsContigHit_accession' => 'accession_hit', | |
124 # 'NSE-rsContigHit_version' => 'version', | |
125 # 'NSE-rsContigHit_chromosome' => 'chromosome_hit', | |
126 | |
127 # 'NSE-rsMaploc_asn-from' => 'asn_from', | |
128 # 'NSE-rsMaploc_asn-to' => 'asn_to', | |
129 # 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'}, | |
130 # 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'}, | |
131 # 'NSE-rsMaploc_orient' => {'orient' => 'value'}, | |
132 # 'NSE-rsMaploc_physmap-str' => 'phys_from', | |
133 # 'NSE-rsMaploc_physmap-int' => 'phys_to', | |
134 | |
135 'NSE-FxnSet_locusid' => 'locus_id', | |
136 'NSE-FxnSet_symbol' => 'symbol', | |
137 'NSE-FxnSet_mrna-acc' => 'mrna', | |
138 'NSE-FxnSet_prot-acc' => 'protein', | |
139 'NSE-FxnSet_fxn-class-contig' => {'functional_class' => 'value'}, | |
140 | |
141 #... | |
142 #... | |
143 #there are lots more, but i don't need them at the moment... -Allen | |
144 ); | |
145 } | |
146 | |
147 sub _initialize{ | |
148 my ($self,@args) = @_; | |
149 $self->SUPER::_initialize(@args); | |
150 my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args); | |
151 defined $usetempfile && $self->use_tempfile($usetempfile); | |
152 $self->{'_xmlparser'} = new XML::Parser::PerlSAX(); | |
153 $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0); | |
154 } | |
155 | |
156 =head2 next_cluster | |
157 | |
158 Title : next_cluster | |
159 Usage : $dbsnp = $stream->next_cluster() | |
160 Function: returns the next refSNP in the stream | |
161 Returns : Bio::Variation::SNP object representing composite refSNP | |
162 and its component subSNP(s). | |
163 Args : NONE | |
164 | |
165 =cut | |
166 | |
167 ### | |
168 #Adapted from Jason's blastxml.pm | |
169 ### | |
170 sub next_cluster { | |
171 my $self = shift; | |
172 my $data = ''; | |
173 my($tfh); | |
174 | |
175 if( $self->use_tempfile ) { | |
176 $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!"); | |
177 $tfh->autoflush(1); | |
178 } | |
179 | |
180 my $start = 1; | |
181 while( defined( $_ = $self->_readline ) ){ | |
182 #skip to beginning of refSNP entry | |
183 if($_ !~ m!<NSE-rs>! && $start){ | |
184 next; | |
185 } elsif($_ =~ m!<NSE-rs>! && $start){ | |
186 $start = 0; | |
187 } | |
188 | |
189 #slurp up the data | |
190 if( defined $tfh ) { | |
191 print $tfh $_; | |
192 } else { | |
193 $data .= $_; | |
194 } | |
195 | |
196 #and stop at the end of the refSNP entry | |
197 last if $_ =~ m!</NSE-rs>!; | |
198 } | |
199 | |
200 #if we didn't find a start tag | |
201 return undef if $start; | |
202 | |
203 my %parser_args; | |
204 if( defined $tfh ) { | |
205 seek($tfh,0,0); | |
206 %parser_args = ('Source' => { 'ByteStream' => $tfh }, | |
207 'Handler' => $self); | |
208 } else { | |
209 %parser_args = ('Source' => { 'String' => $data }, | |
210 'Handler' => $self); | |
211 } | |
212 | |
213 my $starttime; | |
214 my $result; | |
215 | |
216 if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; } | |
217 | |
218 eval { | |
219 $result = $self->{'_xmlparser'}->parse(%parser_args); | |
220 }; | |
221 | |
222 if( $@ ) { | |
223 $self->warn("error in parsing a report:\n $@"); | |
224 $result = undef; | |
225 } | |
226 | |
227 if( $DEBUG ) { | |
228 $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime))); | |
229 } | |
230 | |
231 return $self->refsnp; | |
232 } | |
233 | |
234 =head2 SAX methods | |
235 | |
236 =cut | |
237 | |
238 =head2 start_document | |
239 | |
240 Title : start_document | |
241 Usage : $parser->start_document; | |
242 Function: SAX method to indicate starting to parse a new document. | |
243 Creates a Bio::Variation::SNP | |
244 Returns : none | |
245 Args : none | |
246 | |
247 =cut | |
248 | |
249 sub start_document{ | |
250 my ($self) = @_; | |
251 $self->{refsnp} = Bio::Variation::SNP->new; | |
252 } | |
253 | |
254 sub refsnp { | |
255 return shift->{refsnp}; | |
256 } | |
257 | |
258 =head2 end_document | |
259 | |
260 Title : end_document | |
261 Usage : $parser->end_document; | |
262 Function: SAX method to indicate finishing parsing a new document | |
263 Returns : none | |
264 Args : none | |
265 | |
266 =cut | |
267 | |
268 sub end_document{ | |
269 my ($self,@args) = @_; | |
270 } | |
271 | |
272 =head2 start_element | |
273 | |
274 Title : start_element | |
275 Usage : $parser->start_element($data) | |
276 Function: SAX method to indicate starting a new element | |
277 Returns : none | |
278 Args : hash ref for data | |
279 | |
280 =cut | |
281 | |
282 sub start_element{ | |
283 my ($self,$data) = @_; | |
284 my $nm = $data->{'Name'}; | |
285 my $at = $data->{'Attributes'}; | |
286 | |
287 if($nm eq 'NSE-ss'){ | |
288 $self->refsnp->add_subsnp; | |
289 return; | |
290 } | |
291 if(my $type = $MAPPING{$nm}){ | |
292 if(ref $type eq 'HASH'){ | |
293 #okay, this is nasty. what can you do? | |
294 $self->{will_handle} = (keys %$type)[0]; | |
295 my $valkey = (values %$type)[0]; | |
296 $self->{last_data} = $at->{$valkey}; | |
297 } else { | |
298 $self->{will_handle} = $type; | |
299 $self->{last_data} = undef; | |
300 } | |
301 } else { | |
302 undef $self->{will_handle}; | |
303 } | |
304 } | |
305 | |
306 =head2 end_element | |
307 | |
308 Title : end_element | |
309 Usage : $parser->end_element($data) | |
310 Function: Signals finishing an element | |
311 Returns : none | |
312 Args : hash ref for data | |
313 | |
314 =cut | |
315 | |
316 sub end_element { | |
317 my ($self,$data) = @_; | |
318 my $nm = $data->{'Name'}; | |
319 my $at = $data->{'Attributes'}; | |
320 | |
321 my $method = $self->{will_handle}; | |
322 if($method){ | |
323 if($nm =~ /^NSE-rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^NSE-FxnSet/){ | |
324 $self->refsnp->$method($self->{last_data}); | |
325 } elsif ($nm =~ /^NSE-ss/){ | |
326 $self->refsnp->subsnp->$method($self->{last_data}); | |
327 } | |
328 } | |
329 } | |
330 | |
331 =head2 characters | |
332 | |
333 Title : characters | |
334 Usage : $parser->characters($data) | |
335 Function: Signals new characters to be processed | |
336 Returns : characters read | |
337 Args : hash ref with the key 'Data' | |
338 | |
339 =cut | |
340 | |
341 sub characters{ | |
342 my ($self,$data) = @_; | |
343 $self->{last_data} = $data->{Data} | |
344 if $data->{Data} =~ /\S/; #whitespace is meaningless -ad | |
345 } | |
346 | |
347 =head2 use_tempfile | |
348 | |
349 Title : use_tempfile | |
350 Usage : $obj->use_tempfile($newval) | |
351 Function: Get/Set boolean flag on whether or not use a tempfile | |
352 Example : | |
353 Returns : value of use_tempfile | |
354 Args : newvalue (optional) | |
355 | |
356 =cut | |
357 | |
358 sub use_tempfile{ | |
359 my ($self,$value) = @_; | |
360 if( defined $value) { | |
361 $self->{'_use_tempfile'} = $value; | |
362 } | |
363 return $self->{'_use_tempfile'}; | |
364 } | |
365 | |
366 1; |