Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/ClusterIO/dbsnp.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/ClusterIO/dbsnp.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,366 @@ +# $Id: dbsnp.pm,v 1.7.2.1 2003/08/21 21:07:06 allenday Exp $ +# BioPerl module for Bio::ClusterIO::dbsnp +# +# Copyright Allen Day <allenday@ucla.edu>, Stan Nelson <snelson@ucla.edu> +# Human Genetics, UCLA Medical School, University of California, Los Angeles + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::ClusterIO::dbsnp - dbSNP input stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::ClusterIO class. + +=head1 DESCRIPTION + +Parse dbSNP XML files, one refSNP entry at a time. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Allen Day E<lt>allenday@ucla.eduE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... +package Bio::ClusterIO::dbsnp; + +use strict; +use Bio::Root::Root; +use Bio::ClusterIO; +use Bio::Variation::SNP; +use XML::Parser::PerlSAX; +use XML::Handler::Subs; +use Data::Dumper; +use IO::File; + +use vars qw(@ISA $DTD $DEBUG %MODEMAP %MAPPING); +$DTD = 'ftp://ftp.ncbi.nih.gov/snp/specs/NSE.dtd'; +@ISA = qw(Bio::ClusterIO); + +BEGIN { + %MAPPING = ( +#the ones commented out i haven't written methods for yet... -Allen + 'NSE-rs_refsnp-id' => 'id', +# 'NSE-rs_taxid' => 'tax_id', +# 'NSE-rs_organism' => 'organism', + 'NSE-rs_snp-type' => {'type' => 'value'}, + 'NSE-rs_observed' => 'observed', + 'NSE-rs_seq-5_E' => 'seq_5', + 'NSE-rs_seq-3_E' => 'seq_3', +# 'NSE-rs_seq-ss-exemplar' => 'exemplar_subsnp', + 'NSE-rs_ncbi-build-id' => 'ncbi_build', + 'NSE-rs_ncbi-num-chr-hits' => 'ncbi_chr_hits', + 'NSE-rs_ncbi-num-ctg-hits' => 'ncbi_ctg_hits', + 'NSE-rs_ncbi-num-seq-loc' => 'ncbi_seq_loc', +# 'NSE-rs_ncbi-mapweight' => 'ncbi_mapweight', + 'NSE-rs_ucsc-build-id' => 'ucsc_build', + 'NSE-rs_ucsc-num-chr-hits' => 'ucsc_chr_hits', + 'NSE-rs_ucsc-num-seq-loc' => 'ucsc_ctg_hits', +# 'NSE-rs_ucsc-mapweight' => 'ucsc_mapweight', + 'NSE-rs_het' => 'heterozygous', + 'NSE-rs_het-SE' => 'heterozygous_SE', + 'NSE-rs_validated' => {'validated' => 'value'}, + 'NSE-rs_genotype' => {'genotype' => 'value'}, + + 'NSE-ss_handle' => 'handle', + 'NSE-ss_batch-id' => 'batch_id', + 'NSE-ss_subsnp-id' => 'id', +# 'NSE-ss_loc-snp-id' => 'loc_id', +# 'NSE-ss_orient' => {'orient' => 'value'}, +# 'NSE-ss_build-id' => 'build', + 'NSE-ss_method-class' => {'method' => 'value'}, +# 'NSE-ss_accession_E' => 'accession', +# 'NSE-ss_comment_E' => 'comment', +# 'NSE-ss_genename' => 'gene_name', +# 'NSE-ss_assay-5_E' => 'seq_5', +# 'NSE-ss_assay-3_E' => 'seq_3', +# 'NSE-ss_observed' => 'observed', + +# 'NSE-ss-popinfo_type' => 'pop_type', +# 'NSE-ss-popinfo_batch-id' => 'pop_batch_id', +# 'NSE-ss-popinfo_pop-name' => 'pop_name', +# 'NSE-ss-popinfo_samplesize' => 'pop_samplesize', +# 'NSE-ss_popinfo_est-het' => 'pop_est_heterozygous', +# 'NSE-ss_popinfo_est-het-se-sq' => 'pop_est_heterozygous_se_sq', + +# 'NSE-ss-alleleinfo_type' => 'allele_type', +# 'NSE-ss-alleleinfo_batch-id' => 'allele_batch_id', +# 'NSE-ss-alleleinfo_pop-id' => 'allele_pop_id', +# 'NSE-ss-alleleinfo_snp-allele' => 'allele_snp', +# 'NSE-ss-alleleinfo_other-allele' => 'allele_other', +# 'NSE-ss-alleleinfo_freq' => 'allele_freq', +# 'NSE-ss-alleleinfo_count' => 'allele_count', + +# 'NSE-rsContigHit_contig-id' => 'contig_hit', +# 'NSE-rsContigHit_accession' => 'accession_hit', +# 'NSE-rsContigHit_version' => 'version', +# 'NSE-rsContigHit_chromosome' => 'chromosome_hit', + +# 'NSE-rsMaploc_asn-from' => 'asn_from', +# 'NSE-rsMaploc_asn-to' => 'asn_to', +# 'NSE-rsMaploc_loc-type' => {'loc_type' => 'value'}, +# 'NSE-rsMaploc_hit-quality' => {'hit_quality' => 'value'}, +# 'NSE-rsMaploc_orient' => {'orient' => 'value'}, +# 'NSE-rsMaploc_physmap-str' => 'phys_from', +# 'NSE-rsMaploc_physmap-int' => 'phys_to', + + 'NSE-FxnSet_locusid' => 'locus_id', + 'NSE-FxnSet_symbol' => 'symbol', + 'NSE-FxnSet_mrna-acc' => 'mrna', + 'NSE-FxnSet_prot-acc' => 'protein', + 'NSE-FxnSet_fxn-class-contig' => {'functional_class' => 'value'}, + + #... + #... + #there are lots more, but i don't need them at the moment... -Allen + ); +} + +sub _initialize{ + my ($self,@args) = @_; + $self->SUPER::_initialize(@args); + my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args); + defined $usetempfile && $self->use_tempfile($usetempfile); + $self->{'_xmlparser'} = new XML::Parser::PerlSAX(); + $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0); +} + +=head2 next_cluster + + Title : next_cluster + Usage : $dbsnp = $stream->next_cluster() + Function: returns the next refSNP in the stream + Returns : Bio::Variation::SNP object representing composite refSNP + and its component subSNP(s). + Args : NONE + +=cut + +### +#Adapted from Jason's blastxml.pm +### +sub next_cluster { + my $self = shift; + my $data = ''; + my($tfh); + + if( $self->use_tempfile ) { + $tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!"); + $tfh->autoflush(1); + } + + my $start = 1; + while( defined( $_ = $self->_readline ) ){ + #skip to beginning of refSNP entry + if($_ !~ m!<NSE-rs>! && $start){ + next; + } elsif($_ =~ m!<NSE-rs>! && $start){ + $start = 0; + } + + #slurp up the data + if( defined $tfh ) { + print $tfh $_; + } else { + $data .= $_; + } + + #and stop at the end of the refSNP entry + last if $_ =~ m!</NSE-rs>!; + } + + #if we didn't find a start tag + return undef if $start; + + my %parser_args; + if( defined $tfh ) { + seek($tfh,0,0); + %parser_args = ('Source' => { 'ByteStream' => $tfh }, + 'Handler' => $self); + } else { + %parser_args = ('Source' => { 'String' => $data }, + 'Handler' => $self); + } + + my $starttime; + my $result; + + if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; } + + eval { + $result = $self->{'_xmlparser'}->parse(%parser_args); + }; + + if( $@ ) { + $self->warn("error in parsing a report:\n $@"); + $result = undef; + } + + if( $DEBUG ) { + $self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime))); + } + + return $self->refsnp; +} + +=head2 SAX methods + +=cut + +=head2 start_document + + Title : start_document + Usage : $parser->start_document; + Function: SAX method to indicate starting to parse a new document. + Creates a Bio::Variation::SNP + Returns : none + Args : none + +=cut + +sub start_document{ + my ($self) = @_; + $self->{refsnp} = Bio::Variation::SNP->new; +} + +sub refsnp { + return shift->{refsnp}; +} + +=head2 end_document + + Title : end_document + Usage : $parser->end_document; + Function: SAX method to indicate finishing parsing a new document + Returns : none + Args : none + +=cut + +sub end_document{ + my ($self,@args) = @_; +} + +=head2 start_element + + Title : start_element + Usage : $parser->start_element($data) + Function: SAX method to indicate starting a new element + Returns : none + Args : hash ref for data + +=cut + +sub start_element{ + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $at = $data->{'Attributes'}; + + if($nm eq 'NSE-ss'){ + $self->refsnp->add_subsnp; + return; + } + if(my $type = $MAPPING{$nm}){ + if(ref $type eq 'HASH'){ + #okay, this is nasty. what can you do? + $self->{will_handle} = (keys %$type)[0]; + my $valkey = (values %$type)[0]; + $self->{last_data} = $at->{$valkey}; + } else { + $self->{will_handle} = $type; + $self->{last_data} = undef; + } + } else { + undef $self->{will_handle}; + } +} + +=head2 end_element + + Title : end_element + Usage : $parser->end_element($data) + Function: Signals finishing an element + Returns : none + Args : hash ref for data + +=cut + +sub end_element { + my ($self,$data) = @_; + my $nm = $data->{'Name'}; + my $at = $data->{'Attributes'}; + + my $method = $self->{will_handle}; + if($method){ + if($nm =~ /^NSE-rs/ or $nm =~ /^NSE-SeqLoc/ or $nm =~ /^NSE-FxnSet/){ + $self->refsnp->$method($self->{last_data}); + } elsif ($nm =~ /^NSE-ss/){ + $self->refsnp->subsnp->$method($self->{last_data}); + } + } +} + +=head2 characters + + Title : characters + Usage : $parser->characters($data) + Function: Signals new characters to be processed + Returns : characters read + Args : hash ref with the key 'Data' + +=cut + +sub characters{ + my ($self,$data) = @_; + $self->{last_data} = $data->{Data} + if $data->{Data} =~ /\S/; #whitespace is meaningless -ad +} + +=head2 use_tempfile + + Title : use_tempfile + Usage : $obj->use_tempfile($newval) + Function: Get/Set boolean flag on whether or not use a tempfile + Example : + Returns : value of use_tempfile + Args : newvalue (optional) + +=cut + +sub use_tempfile{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_use_tempfile'} = $value; + } + return $self->{'_use_tempfile'}; +} + +1;