0
|
1 # $Id: dbsnp.pm,v 1.7.2.1 2003/08/21 21:07:06 allenday Exp $
|
|
2 # BioPerl module for Bio::ClusterIO::dbsnp
|
|
3 #
|
|
4 # Copyright Allen Day <allenday@ucla.edu>, Stan Nelson <snelson@ucla.edu>
|
|
5 # Human Genetics, UCLA Medical School, University of California, Los Angeles
|
|
6
|
|
7 # POD documentation - main docs before the code
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 Bio::ClusterIO::dbsnp - dbSNP input stream
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 Do not use this module directly. Use it via the Bio::ClusterIO class.
|
|
16
|
|
17 =head1 DESCRIPTION
|
|
18
|
|
19 Parse dbSNP XML files, one refSNP entry at a time.
|
|
20
|
|
21 =head1 FEEDBACK
|
|
22
|
|
23 =head2 Mailing Lists
|
|
24
|
|
25 User feedback is an integral part of the evolution of this and other
|
|
26 Bioperl modules. Send your comments and suggestions preferably to one
|
|
27 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
28
|
|
29 bioperl-l@bioperl.org - General discussion
|
|
30 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
31
|
|
32 =head2 Reporting Bugs
|
|
33
|
|
34 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
35 the bugs and their resolution.
|
|
36 Bug reports can be submitted via email or the web:
|
|
37
|
|
38 bioperl-bugs@bio.perl.org
|
|
39 http://bugzilla.bioperl.org/
|
|
40
|
|
41 =head1 AUTHOR
|
|
42
|
|
43 Allen Day E<lt>allenday@ucla.eduE<gt>
|
|
44
|
|
45 =head1 APPENDIX
|
|
46
|
|
47 The rest of the documentation details each of the object
|
|
48 methods. Internal methods are usually preceded with a _
|
|
49
|
|
50 =cut
|
|
51
|
|
52 # Let the code begin...
|
|
53 package Bio::ClusterIO::dbsnp;
|
|
54
|
|
55 use strict;
|
|
56 use Bio::Root::Root;
|
|
57 use Bio::ClusterIO;
|
|
58 use Bio::Variation::SNP;
|
|
59 use XML::Parser::PerlSAX;
|
|
60 use XML::Handler::Subs;
|
|
61 use Data::Dumper;
|
|
62 use IO::File;
|
|
63
|
|
64 use vars qw(@ISA $DTD $DEBUG %MODEMAP %MAPPING);
|
|
65 $DTD = 'ftp://ftp.ncbi.nih.gov/snp/specs/NSE.dtd';
|
|
66 @ISA = qw(Bio::ClusterIO);
|
|
67
|
|
68 BEGIN {
|
|
69 %MAPPING = (
|
|
70 #the ones commented out i haven't written methods for yet... -Allen
|
|
71 'NSE-rs_refsnp-id' => 'id',
|
|
72 # 'NSE-rs_taxid' => 'tax_id',
|
|
73 # 'NSE-rs_organism' => 'organism',
|
|
74 'NSE-rs_snp-type' => {'type' => 'value'},
|
|
75 'NSE-rs_observed' => 'observed',
|
|
76 'NSE-rs_seq-5_E' => 'seq_5',
|
|
77 'NSE-rs_seq-3_E' => 'seq_3',
|
|
78 # 'NSE-rs_seq-ss-exemplar' => 'exemplar_subsnp',
|
|
79 'NSE-rs_ncbi-build-id' => 'ncbi_build',
|
|
80 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits',
|
|
81 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits',
|
|
82 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc',
|
|
83 # 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight',
|
|
84 'NSE-rs_ucsc-build-id' => 'ucsc_build',
|
|
85 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits',
|
|
86 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits',
|
|
87 # 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight',
|
|
88 'NSE-rs_het' => 'heterozygous',
|
|
89 'NSE-rs_het-SE' => 'heterozygous_SE',
|
|
90 'NSE-rs_validated' => {'validated' => 'value'},
|
|
91 'NSE-rs_genotype' => {'genotype' => 'value'},
|
|
92
|
|
93 'NSE-ss_handle' => 'handle',
|
|
94 'NSE-ss_batch-id' => 'batch_id',
|
|
95 'NSE-ss_subsnp-id' => 'id',
|
|
96 # 'NSE-ss_loc-snp-id' => 'loc_id',
|
|
97 # 'NSE-ss_orient' => {'orient' => 'value'},
|
|
98 # 'NSE-ss_build-id' => 'build',
|
|
99 'NSE-ss_method-class' => {'method' => 'value'},
|
|
100 # 'NSE-ss_accession_E' => 'accession',
|
|
101 # 'NSE-ss_comment_E' => 'comment',
|
|
102 # 'NSE-ss_genename' => 'gene_name',
|
|
103 # 'NSE-ss_assay-5_E' => 'seq_5',
|
|
104 # 'NSE-ss_assay-3_E' => 'seq_3',
|
|
105 # 'NSE-ss_observed' => 'observed',
|
|
106
|
|
107 # 'NSE-ss-popinfo_type' => 'pop_type',
|
|
108 # 'NSE-ss-popinfo_batch-id' => 'pop_batch_id',
|
|
109 # 'NSE-ss-popinfo_pop-name' => 'pop_name',
|
|
110 # 'NSE-ss-popinfo_samplesize' => 'pop_samplesize',
|
|
111 # 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous',
|
|
112 # 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq',
|
|
113
|
|
114 # 'NSE-ss-alleleinfo_type' => 'allele_type',
|
|
115 # 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id',
|
|
116 # 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id',
|
|
117 # 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp',
|
|
118 # 'NSE-ss-alleleinfo_other-allele' => 'allele_other',
|
|
119 # 'NSE-ss-alleleinfo_freq' => 'allele_freq',
|
|
120 # 'NSE-ss-alleleinfo_count' => 'allele_count',
|
|
121
|
|
122 # 'NSE-rsContigHit_contig-id' => 'contig_hit',
|
|
123 # 'NSE-rsContigHit_accession' => 'accession_hit',
|
|
124 # 'NSE-rsContigHit_version' => 'version',
|
|
125 # 'NSE-rsContigHit_chromosome' => 'chromosome_hit',
|
|
126
|
|
127 # 'NSE-rsMaploc_asn-from' => 'asn_from',
|
|
128 # 'NSE-rsMaploc_asn-to' => 'asn_to',
|
|
129 # 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'},
|
|
130 # 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'},
|
|
131 # 'NSE-rsMaploc_orient' => {'orient' => 'value'},
|
|
132 # 'NSE-rsMaploc_physmap-str' => 'phys_from',
|
|
133 # 'NSE-rsMaploc_physmap-int' => 'phys_to',
|
|
134
|
|
135 'NSE-FxnSet_locusid' => 'locus_id',
|
|
136 'NSE-FxnSet_symbol' => 'symbol',
|
|
137 'NSE-FxnSet_mrna-acc' => 'mrna',
|
|
138 'NSE-FxnSet_prot-acc' => 'protein',
|
|
139 'NSE-FxnSet_fxn-class-contig' => {'functional_class' => 'value'},
|
|
140
|
|
141 #...
|
|
142 #...
|
|
143 #there are lots more, but i don't need them at the moment... -Allen
|
|
144 );
|
|
145 }
|
|
146
|
|
147 sub _initialize{
|
|
148 my ($self,@args) = @_;
|
|
149 $self->SUPER::_initialize(@args);
|
|
150 my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args);
|
|
151 defined $usetempfile && $self->use_tempfile($usetempfile);
|
|
152 $self->{'_xmlparser'} = new XML::Parser::PerlSAX();
|
|
153 $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
|
|
154 }
|
|
155
|
|
156 =head2 next_cluster
|
|
157
|
|
158 Title : next_cluster
|
|
159 Usage : $dbsnp = $stream->next_cluster()
|
|
160 Function: returns the next refSNP in the stream
|
|
161 Returns : Bio::Variation::SNP object representing composite refSNP
|
|
162 and its component subSNP(s).
|
|
163 Args : NONE
|
|
164
|
|
165 =cut
|
|
166
|
|
167 ###
|
|
168 #Adapted from Jason's blastxml.pm
|
|
169 ###
|
|
170 sub next_cluster {
|
|
171 my $self = shift;
|
|
172 my $data = '';
|
|
173 my($tfh);
|
|
174
|
|
175 if( $self->use_tempfile ) {
|
|
176 $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!");
|
|
177 $tfh->autoflush(1);
|
|
178 }
|
|
179
|
|
180 my $start = 1;
|
|
181 while( defined( $_ = $self->_readline ) ){
|
|
182 #skip to beginning of refSNP entry
|
|
183 if($_ !~ m!<NSE-rs>! && $start){
|
|
184 next;
|
|
185 } elsif($_ =~ m!<NSE-rs>! && $start){
|
|
186 $start = 0;
|
|
187 }
|
|
188
|
|
189 #slurp up the data
|
|
190 if( defined $tfh ) {
|
|
191 print $tfh $_;
|
|
192 } else {
|
|
193 $data .= $_;
|
|
194 }
|
|
195
|
|
196 #and stop at the end of the refSNP entry
|
|
197 last if $_ =~ m!</NSE-rs>!;
|
|
198 }
|
|
199
|
|
200 #if we didn't find a start tag
|
|
201 return undef if $start;
|
|
202
|
|
203 my %parser_args;
|
|
204 if( defined $tfh ) {
|
|
205 seek($tfh,0,0);
|
|
206 %parser_args = ('Source' => { 'ByteStream' => $tfh },
|
|
207 'Handler' => $self);
|
|
208 } else {
|
|
209 %parser_args = ('Source' => { 'String' => $data },
|
|
210 'Handler' => $self);
|
|
211 }
|
|
212
|
|
213 my $starttime;
|
|
214 my $result;
|
|
215
|
|
216 if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; }
|
|
217
|
|
218 eval {
|
|
219 $result = $self->{'_xmlparser'}->parse(%parser_args);
|
|
220 };
|
|
221
|
|
222 if( $@ ) {
|
|
223 $self->warn("error in parsing a report:\n $@");
|
|
224 $result = undef;
|
|
225 }
|
|
226
|
|
227 if( $DEBUG ) {
|
|
228 $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime)));
|
|
229 }
|
|
230
|
|
231 return $self->refsnp;
|
|
232 }
|
|
233
|
|
234 =head2 SAX methods
|
|
235
|
|
236 =cut
|
|
237
|
|
238 =head2 start_document
|
|
239
|
|
240 Title : start_document
|
|
241 Usage : $parser->start_document;
|
|
242 Function: SAX method to indicate starting to parse a new document.
|
|
243 Creates a Bio::Variation::SNP
|
|
244 Returns : none
|
|
245 Args : none
|
|
246
|
|
247 =cut
|
|
248
|
|
249 sub start_document{
|
|
250 my ($self) = @_;
|
|
251 $self->{refsnp} = Bio::Variation::SNP->new;
|
|
252 }
|
|
253
|
|
254 sub refsnp {
|
|
255 return shift->{refsnp};
|
|
256 }
|
|
257
|
|
258 =head2 end_document
|
|
259
|
|
260 Title : end_document
|
|
261 Usage : $parser->end_document;
|
|
262 Function: SAX method to indicate finishing parsing a new document
|
|
263 Returns : none
|
|
264 Args : none
|
|
265
|
|
266 =cut
|
|
267
|
|
268 sub end_document{
|
|
269 my ($self,@args) = @_;
|
|
270 }
|
|
271
|
|
272 =head2 start_element
|
|
273
|
|
274 Title : start_element
|
|
275 Usage : $parser->start_element($data)
|
|
276 Function: SAX method to indicate starting a new element
|
|
277 Returns : none
|
|
278 Args : hash ref for data
|
|
279
|
|
280 =cut
|
|
281
|
|
282 sub start_element{
|
|
283 my ($self,$data) = @_;
|
|
284 my $nm = $data->{'Name'};
|
|
285 my $at = $data->{'Attributes'};
|
|
286
|
|
287 if($nm eq 'NSE-ss'){
|
|
288 $self->refsnp->add_subsnp;
|
|
289 return;
|
|
290 }
|
|
291 if(my $type = $MAPPING{$nm}){
|
|
292 if(ref $type eq 'HASH'){
|
|
293 #okay, this is nasty. what can you do?
|
|
294 $self->{will_handle} = (keys %$type)[0];
|
|
295 my $valkey = (values %$type)[0];
|
|
296 $self->{last_data} = $at->{$valkey};
|
|
297 } else {
|
|
298 $self->{will_handle} = $type;
|
|
299 $self->{last_data} = undef;
|
|
300 }
|
|
301 } else {
|
|
302 undef $self->{will_handle};
|
|
303 }
|
|
304 }
|
|
305
|
|
306 =head2 end_element
|
|
307
|
|
308 Title : end_element
|
|
309 Usage : $parser->end_element($data)
|
|
310 Function: Signals finishing an element
|
|
311 Returns : none
|
|
312 Args : hash ref for data
|
|
313
|
|
314 =cut
|
|
315
|
|
316 sub end_element {
|
|
317 my ($self,$data) = @_;
|
|
318 my $nm = $data->{'Name'};
|
|
319 my $at = $data->{'Attributes'};
|
|
320
|
|
321 my $method = $self->{will_handle};
|
|
322 if($method){
|
|
323 if($nm =~ /^NSE-rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^NSE-FxnSet/){
|
|
324 $self->refsnp->$method($self->{last_data});
|
|
325 } elsif ($nm =~ /^NSE-ss/){
|
|
326 $self->refsnp->subsnp->$method($self->{last_data});
|
|
327 }
|
|
328 }
|
|
329 }
|
|
330
|
|
331 =head2 characters
|
|
332
|
|
333 Title : characters
|
|
334 Usage : $parser->characters($data)
|
|
335 Function: Signals new characters to be processed
|
|
336 Returns : characters read
|
|
337 Args : hash ref with the key 'Data'
|
|
338
|
|
339 =cut
|
|
340
|
|
341 sub characters{
|
|
342 my ($self,$data) = @_;
|
|
343 $self->{last_data} = $data->{Data}
|
|
344 if $data->{Data} =~ /\S/; #whitespace is meaningless -ad
|
|
345 }
|
|
346
|
|
347 =head2 use_tempfile
|
|
348
|
|
349 Title : use_tempfile
|
|
350 Usage : $obj->use_tempfile($newval)
|
|
351 Function: Get/Set boolean flag on whether or not use a tempfile
|
|
352 Example :
|
|
353 Returns : value of use_tempfile
|
|
354 Args : newvalue (optional)
|
|
355
|
|
356 =cut
|
|
357
|
|
358 sub use_tempfile{
|
|
359 my ($self,$value) = @_;
|
|
360 if( defined $value) {
|
|
361 $self->{'_use_tempfile'} = $value;
|
|
362 }
|
|
363 return $self->{'_use_tempfile'};
|
|
364 }
|
|
365
|
|
366 1;
|