comparison variant_effect_predictor/Bio/ClusterIO/dbsnp.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: dbsnp.pm,v 1.7.2.1 2003/08/21 21:07:06 allenday Exp $
2 # BioPerl module for Bio::ClusterIO::dbsnp
3 #
4 # Copyright Allen Day <allenday@ucla.edu>, Stan Nelson <snelson@ucla.edu>
5 # Human Genetics, UCLA Medical School, University of California, Los Angeles
6
7 # POD documentation - main docs before the code
8
9 =head1 NAME
10
11 Bio::ClusterIO::dbsnp - dbSNP input stream
12
13 =head1 SYNOPSIS
14
15 Do not use this module directly. Use it via the Bio::ClusterIO class.
16
17 =head1 DESCRIPTION
18
19 Parse dbSNP XML files, one refSNP entry at a time.
20
21 =head1 FEEDBACK
22
23 =head2 Mailing Lists
24
25 User feedback is an integral part of the evolution of this and other
26 Bioperl modules. Send your comments and suggestions preferably to one
27 of the Bioperl mailing lists. Your participation is much appreciated.
28
29 bioperl-l@bioperl.org - General discussion
30 http://bioperl.org/MailList.shtml - About the mailing lists
31
32 =head2 Reporting Bugs
33
34 Report bugs to the Bioperl bug tracking system to help us keep track
35 the bugs and their resolution.
36 Bug reports can be submitted via email or the web:
37
38 bioperl-bugs@bio.perl.org
39 http://bugzilla.bioperl.org/
40
41 =head1 AUTHOR
42
43 Allen Day E<lt>allenday@ucla.eduE<gt>
44
45 =head1 APPENDIX
46
47 The rest of the documentation details each of the object
48 methods. Internal methods are usually preceded with a _
49
50 =cut
51
52 # Let the code begin...
53 package Bio::ClusterIO::dbsnp;
54
55 use strict;
56 use Bio::Root::Root;
57 use Bio::ClusterIO;
58 use Bio::Variation::SNP;
59 use XML::Parser::PerlSAX;
60 use XML::Handler::Subs;
61 use Data::Dumper;
62 use IO::File;
63
64 use vars qw(@ISA $DTD $DEBUG %MODEMAP %MAPPING);
65 $DTD = 'ftp://ftp.ncbi.nih.gov/snp/specs/NSE.dtd';
66 @ISA = qw(Bio::ClusterIO);
67
68 BEGIN {
69 %MAPPING = (
70 #the ones commented out i haven't written methods for yet... -Allen
71 'NSE-rs_refsnp-id' => 'id',
72 # 'NSE-rs_taxid' => 'tax_id',
73 # 'NSE-rs_organism' => 'organism',
74 'NSE-rs_snp-type' => {'type' => 'value'},
75 'NSE-rs_observed' => 'observed',
76 'NSE-rs_seq-5_E' => 'seq_5',
77 'NSE-rs_seq-3_E' => 'seq_3',
78 # 'NSE-rs_seq-ss-exemplar' => 'exemplar_subsnp',
79 'NSE-rs_ncbi-build-id' => 'ncbi_build',
80 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits',
81 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits',
82 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc',
83 # 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight',
84 'NSE-rs_ucsc-build-id' => 'ucsc_build',
85 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits',
86 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits',
87 # 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight',
88 'NSE-rs_het' => 'heterozygous',
89 'NSE-rs_het-SE' => 'heterozygous_SE',
90 'NSE-rs_validated' => {'validated' => 'value'},
91 'NSE-rs_genotype' => {'genotype' => 'value'},
92
93 'NSE-ss_handle' => 'handle',
94 'NSE-ss_batch-id' => 'batch_id',
95 'NSE-ss_subsnp-id' => 'id',
96 # 'NSE-ss_loc-snp-id' => 'loc_id',
97 # 'NSE-ss_orient' => {'orient' => 'value'},
98 # 'NSE-ss_build-id' => 'build',
99 'NSE-ss_method-class' => {'method' => 'value'},
100 # 'NSE-ss_accession_E' => 'accession',
101 # 'NSE-ss_comment_E' => 'comment',
102 # 'NSE-ss_genename' => 'gene_name',
103 # 'NSE-ss_assay-5_E' => 'seq_5',
104 # 'NSE-ss_assay-3_E' => 'seq_3',
105 # 'NSE-ss_observed' => 'observed',
106
107 # 'NSE-ss-popinfo_type' => 'pop_type',
108 # 'NSE-ss-popinfo_batch-id' => 'pop_batch_id',
109 # 'NSE-ss-popinfo_pop-name' => 'pop_name',
110 # 'NSE-ss-popinfo_samplesize' => 'pop_samplesize',
111 # 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous',
112 # 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq',
113
114 # 'NSE-ss-alleleinfo_type' => 'allele_type',
115 # 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id',
116 # 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id',
117 # 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp',
118 # 'NSE-ss-alleleinfo_other-allele' => 'allele_other',
119 # 'NSE-ss-alleleinfo_freq' => 'allele_freq',
120 # 'NSE-ss-alleleinfo_count' => 'allele_count',
121
122 # 'NSE-rsContigHit_contig-id' => 'contig_hit',
123 # 'NSE-rsContigHit_accession' => 'accession_hit',
124 # 'NSE-rsContigHit_version' => 'version',
125 # 'NSE-rsContigHit_chromosome' => 'chromosome_hit',
126
127 # 'NSE-rsMaploc_asn-from' => 'asn_from',
128 # 'NSE-rsMaploc_asn-to' => 'asn_to',
129 # 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'},
130 # 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'},
131 # 'NSE-rsMaploc_orient' => {'orient' => 'value'},
132 # 'NSE-rsMaploc_physmap-str' => 'phys_from',
133 # 'NSE-rsMaploc_physmap-int' => 'phys_to',
134
135 'NSE-FxnSet_locusid' => 'locus_id',
136 'NSE-FxnSet_symbol' => 'symbol',
137 'NSE-FxnSet_mrna-acc' => 'mrna',
138 'NSE-FxnSet_prot-acc' => 'protein',
139 'NSE-FxnSet_fxn-class-contig' => {'functional_class' => 'value'},
140
141 #...
142 #...
143 #there are lots more, but i don't need them at the moment... -Allen
144 );
145 }
146
147 sub _initialize{
148 my ($self,@args) = @_;
149 $self->SUPER::_initialize(@args);
150 my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args);
151 defined $usetempfile && $self->use_tempfile($usetempfile);
152 $self->{'_xmlparser'} = new XML::Parser::PerlSAX();
153 $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
154 }
155
156 =head2 next_cluster
157
158 Title : next_cluster
159 Usage : $dbsnp = $stream->next_cluster()
160 Function: returns the next refSNP in the stream
161 Returns : Bio::Variation::SNP object representing composite refSNP
162 and its component subSNP(s).
163 Args : NONE
164
165 =cut
166
167 ###
168 #Adapted from Jason's blastxml.pm
169 ###
170 sub next_cluster {
171 my $self = shift;
172 my $data = '';
173 my($tfh);
174
175 if( $self->use_tempfile ) {
176 $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!");
177 $tfh->autoflush(1);
178 }
179
180 my $start = 1;
181 while( defined( $_ = $self->_readline ) ){
182 #skip to beginning of refSNP entry
183 if($_ !~ m!<NSE-rs>! && $start){
184 next;
185 } elsif($_ =~ m!<NSE-rs>! && $start){
186 $start = 0;
187 }
188
189 #slurp up the data
190 if( defined $tfh ) {
191 print $tfh $_;
192 } else {
193 $data .= $_;
194 }
195
196 #and stop at the end of the refSNP entry
197 last if $_ =~ m!</NSE-rs>!;
198 }
199
200 #if we didn't find a start tag
201 return undef if $start;
202
203 my %parser_args;
204 if( defined $tfh ) {
205 seek($tfh,0,0);
206 %parser_args = ('Source' => { 'ByteStream' => $tfh },
207 'Handler' => $self);
208 } else {
209 %parser_args = ('Source' => { 'String' => $data },
210 'Handler' => $self);
211 }
212
213 my $starttime;
214 my $result;
215
216 if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; }
217
218 eval {
219 $result = $self->{'_xmlparser'}->parse(%parser_args);
220 };
221
222 if( $@ ) {
223 $self->warn("error in parsing a report:\n $@");
224 $result = undef;
225 }
226
227 if( $DEBUG ) {
228 $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime)));
229 }
230
231 return $self->refsnp;
232 }
233
234 =head2 SAX methods
235
236 =cut
237
238 =head2 start_document
239
240 Title : start_document
241 Usage : $parser->start_document;
242 Function: SAX method to indicate starting to parse a new document.
243 Creates a Bio::Variation::SNP
244 Returns : none
245 Args : none
246
247 =cut
248
249 sub start_document{
250 my ($self) = @_;
251 $self->{refsnp} = Bio::Variation::SNP->new;
252 }
253
254 sub refsnp {
255 return shift->{refsnp};
256 }
257
258 =head2 end_document
259
260 Title : end_document
261 Usage : $parser->end_document;
262 Function: SAX method to indicate finishing parsing a new document
263 Returns : none
264 Args : none
265
266 =cut
267
268 sub end_document{
269 my ($self,@args) = @_;
270 }
271
272 =head2 start_element
273
274 Title : start_element
275 Usage : $parser->start_element($data)
276 Function: SAX method to indicate starting a new element
277 Returns : none
278 Args : hash ref for data
279
280 =cut
281
282 sub start_element{
283 my ($self,$data) = @_;
284 my $nm = $data->{'Name'};
285 my $at = $data->{'Attributes'};
286
287 if($nm eq 'NSE-ss'){
288 $self->refsnp->add_subsnp;
289 return;
290 }
291 if(my $type = $MAPPING{$nm}){
292 if(ref $type eq 'HASH'){
293 #okay, this is nasty. what can you do?
294 $self->{will_handle} = (keys %$type)[0];
295 my $valkey = (values %$type)[0];
296 $self->{last_data} = $at->{$valkey};
297 } else {
298 $self->{will_handle} = $type;
299 $self->{last_data} = undef;
300 }
301 } else {
302 undef $self->{will_handle};
303 }
304 }
305
306 =head2 end_element
307
308 Title : end_element
309 Usage : $parser->end_element($data)
310 Function: Signals finishing an element
311 Returns : none
312 Args : hash ref for data
313
314 =cut
315
316 sub end_element {
317 my ($self,$data) = @_;
318 my $nm = $data->{'Name'};
319 my $at = $data->{'Attributes'};
320
321 my $method = $self->{will_handle};
322 if($method){
323 if($nm =~ /^NSE-rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^NSE-FxnSet/){
324 $self->refsnp->$method($self->{last_data});
325 } elsif ($nm =~ /^NSE-ss/){
326 $self->refsnp->subsnp->$method($self->{last_data});
327 }
328 }
329 }
330
331 =head2 characters
332
333 Title : characters
334 Usage : $parser->characters($data)
335 Function: Signals new characters to be processed
336 Returns : characters read
337 Args : hash ref with the key 'Data'
338
339 =cut
340
341 sub characters{
342 my ($self,$data) = @_;
343 $self->{last_data} = $data->{Data}
344 if $data->{Data} =~ /\S/; #whitespace is meaningless -ad
345 }
346
347 =head2 use_tempfile
348
349 Title : use_tempfile
350 Usage : $obj->use_tempfile($newval)
351 Function: Get/Set boolean flag on whether or not use a tempfile
352 Example :
353 Returns : value of use_tempfile
354 Args : newvalue (optional)
355
356 =cut
357
358 sub use_tempfile{
359 my ($self,$value) = @_;
360 if( defined $value) {
361 $self->{'_use_tempfile'} = $value;
362 }
363 return $self->{'_use_tempfile'};
364 }
365
366 1;