Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO/game.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/game.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,672 @@ +# $Id: game.pm,v 1.26.2.1 2003/06/28 22:23:15 jason Exp $ +# +# BioPerl module for Bio::SeqIO::game +# +# Cared for by Brad Marshall <bradmars@yahoo.com> +# +# Copyright Ewan Birney & Lincoln Stein & Brad Marshall +# +# You may distribute this module under the same terms as perl itself +# _history +# June 25, 2000 written by Brad Marshall +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::game - Parses GAME XML 0.1 and higher into and out of Bio::Seq objects. + +=head1 SYNOPSIS + +To use this module you need XML::Parser, XML::Parser::PerlSAX +and XML::Writer. + +Do not use this module directly. Use it via the Bio::SeqIO class. + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from bioxml seq, +computation, feature and annotation dtds,versions 0.1 and higher. +These can be found at http://www.bioxml.org/dtds/current. It does +this using the idHandler, seqHandler and featureHandler modules you +should have gotten with this one. + +The idea is that any bioxml features can be turned into bioperl +annotations. When Annotations and computations are parsed in, they +gain additional info in the bioperl SeqFeature tag attribute. These +can be used to reconstitute a computation or annotation by the bioxml +with the bx-handler module when write_seq is called. + +If you use this to write SeqFeatures that were not generated from +computations or annotations, it will output a list of bioxml features. +Some data may be lost in this step, since bioxml features just have a +span, type and description - nothing about the anlysis performed. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - Technical bioperl discussion + bioxml-dev@bioxml.org - Technical discussion - Moderate volume + bioxml-announce@bioxml.org - General Announcements - Pretty dead + http://www.bioxml.org/MailingLists/ - About the mailing lists + +=head1 AUTHOR - Brad Marshall & Ewan Birney & Lincoln Stein + +Email: bradmars@yahoo.com + birney@sanger.ac.uk + lstein@cshl.org + + +=head1 CONTRIBUTORS + +Jason Stajich E<lt>jason@bioperl.orgE<gt> + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::game; +use vars qw(@ISA); +use strict; +# Object preamble - inherits from Bio::Root::Object + +use Bio::SeqIO; +use Bio::SeqIO::game::seqHandler; +use Bio::SeqIO::game::featureHandler; +use Bio::SeqIO::game::idHandler; +use XML::Parser::PerlSAX; +use Bio::SeqFeature::Generic; +use XML::Writer; + +use Bio::Seq; + +@ISA = qw(Bio::SeqIO); + + +sub _initialize { + my($self,@args) = @_; + $self->SUPER::_initialize(@args); + my $xmlfile = ""; + $self->{'counter'} = 0; + $self->{'id_counter'} = 1; + $self->{'leftovers'} = undef; + $self->{'header'} = undef; + $self->{'chunkable'} = undef; + $self->{'xmldoc'} = undef; + + $self->_export_subfeatures(1); + $self->_group_subfeatures(1); + $self->_subfeature_types('exons', 'promoters','poly_A_sites', + 'utrs','introns','sub_SeqFeature'); + + # filehandle is stored by superclass _initialize +} + + +=head2 _export_subfeatures + + Title : _export_subfeatures + Usage : $obj->_export_subfeatures + Function: export all subfeatures (also in the geneprediction structure) + Returns : value of _export_subfeatures + Args : newvalue (optional) + +=cut + +sub _export_subfeatures{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_export_subfeatures'} = $value; + } + return $obj->{'_export_subfeatures'}; + +} + +=head2 _group_subfeatures + + Title : _group_subfeatures + Usage : $obj->_group_subfeatures + Function: Groups all subfeatures in separate feature_sets + Returns : value of _group_subfeatures + Args : newvalue (optional) + +=cut + +sub _group_subfeatures{ + my $obj = shift; + if( @_ ) { + my $value = shift; + $obj->{'_group_subfeatures'} = $value; + } + return $obj->{'_group_subfeatures'}; +} + +=head2 _subfeature_types + + Title : _subfeature_types + Usage : $obj->_subfeature_types + Function: array of all possible subfeatures, it should be a + name of a function which + : returns an arrau of sub_seqfeatures when called: + @array = $feature->subfeaturetype() + Returns : array of _subfeature_types + Args : array of subfeature types (optional) + +=cut + +sub _subfeature_types{ + my $obj = shift; + if( @_ ) { + my @values = @_; + $obj->{'_subfeature_types'} = \@values; + } + return @{$obj->{'_subfeature_types'}}; + +} + +=head2 _add_subfeature_type + + Title : _add_subfeature_type + Usage : $obj->_add_subfeature_type + Function: add one possible subfeature, it should be a name of a function which + : returns an arrau of sub_seqfeatures when called: @array = $feature->subfeaturetyp() + Returns : 1 + Args : one subfeature type (optional) + +=cut + +sub _add_subfeature_type{ + my $obj = shift; + if( @_ ) { + my @values = @_; + push @{$obj->{'_subfeature_types'}}, @values; + } + return 1; + +} + + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : NONE + +=cut + +sub next_seq { + my $self = shift; + + + # The header is the top level stuff in the XML file. + # IE before the first <bx-seq:seq> tag. + # If you don't include this in each 'chunk', the + # parser will barf. + my $header; + unless ($self->{'header'}) { + while (my $next_line = $self->_readline) { + if($next_line=~/<bx-seq:seq?/) { + $header .= $`; + $self->{'header'}=$header; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } else { + $header .= $next_line; + } + } + if ($self->{'header'}=~m|<bx-game:flavor>.*chunkable.*</bx-game:flavor>|) { + $self->{'chunkable'}=1; + } + + } + + my $not_top_level; + my $xmldoc; + my $seq; + # If chunkable, we read in the document until the next + # TOP LEVEL sequence. + if ($self->{'chunkable'}) { + $xmldoc = $self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + # Maintain depth of computations and annotations. + # We only want TOP LEVEL seqs if chunkable. + while ($next_line=~ m|<bx-computation:computation|g) { + $not_top_level++; + } + while ($next_line=~ m|<bx-annotation:annotation|g) { + $not_top_level++; + } + while ($next_line=~ m|</bx-computation:computation|g) { + $not_top_level--; + } + while ($next_line=~ m|</bx-annotation:annotation|g) { + $not_top_level--; + } + if($next_line=~/<bx-seq:seq?/) { + if (!$not_top_level) { + $xmldoc .= $`; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } + } else { + $xmldoc .= $next_line; + } + } + # Make sure the 'doc chunk' has a closing tag + # to make the parser happy. + if (!$xmldoc=~m|</bx-game:game>|){ + $xmldoc .= "</bx-game:game>"; + } + # Grab the TOP LEVEL seq.. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + } else { # No sequences. + return 0; + } + # Get the seq out of the array. + $seq = @{$self->{'seqs'}}[0]; + # If not chunkable, + # only read document into memory once! + } elsif (!$self->{'xmldoc'}) { + $self->{'xmldoc'}=$self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + $self->{'xmldoc'} .= $next_line; + } + $xmldoc=$self->{'xmldoc'}; + # Get the seq id index. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + $seq = shift @{$self->{'seqs'}}; + } else { # No sequences. + return 0; + } + my $seq = @{$self->{'seqs'}}[0]; + # if we already have the doc in memory, + # just get the doc. + } elsif ($self->{'xmldoc'}) { + $xmldoc=$self->{'xmldoc'}; + $seq = shift @{$self->{'seqs'}}; + } + # If there's more sequences: + if ($seq) { + # Get the next seq. + my $handler = Bio::SeqIO::game::seqHandler->new(-seq => $seq); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + my $pseq = $parser->parse(Source => { String => $xmldoc }); + # get the features. + my $fhandler = Bio::SeqIO::game::featureHandler->new($pseq->id(), + $pseq->length(), + $pseq->alphabet()); + $options = {Handler=>$fhandler}; + + $parser = XML::Parser::PerlSAX->new($options); + my $features = $parser->parse(Source => { String => $xmldoc }); + my $seq = Bio::Seq->new(); + # Build the Bioperl Seq and return it. + foreach my $feature (@{$features}) { + $seq->add_SeqFeature($feature); + } + $seq->primary_seq($pseq); + return $seq; + } else { + return 0; + } +} + +=head2 next_primary_seq + + Title : next_primary_seq + Usage : $seq = $stream->next_primary_seq() + Function: returns the next primary sequence (ie no seq_features) in the stream + Returns : Bio::PrimarySeq object + Args : NONE + +=cut + +sub next_primary_seq { + my $self=shift; + + # The header is the top level stuff in the XML file. + # IE before the first <bx-seq:seq> tag. + # If you don't include this in each 'chunk', the + # parser will barf. + my $header; + unless ($self->{'header'}) { + while (my $next_line = $self->_readline) { + if($next_line=~/<bx-seq:seq?/) { + $header .= $`; + $self->{'header'}=$header; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } else { + $header .= $next_line; + } + } + if ($self->{'header'}=~m|<bx-game:flavor>.*chunkable.*</bx-game:flavor>|) { + $self->{'chunkable'}=1; + } + + } + + my $not_top_level = 0; + my $xmldoc; + my $seq; + # If chunkable, we read in the document until the next + # TOP LEVEL sequence. + if ($self->{'chunkable'}) { + $xmldoc = $self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + # Maintain depth of computations and annotations. + # We only want TOP LEVEL seqs if chunkable. + while ($next_line=~ m|<bx-computation:computation|g) { + $not_top_level++; + } + while ($next_line=~ m|<bx-annotation:annotationn|g) { + $not_top_level++; + } + while ($next_line=~ m|</bx-computation:computation|g) { + $not_top_level--; + } + while ($next_line=~ m|</bx-annotation:annotationn|g) { + $not_top_level--; + } + if($next_line=~/<bx-seq:seq?/) { + if (!$not_top_level) { + $xmldoc .= $`; + $self->{'leftovers'} .= "<bx-seq:seq".$'; + last; + } + } else { + $xmldoc .= $next_line; + } + } + # Make sure the 'doc chunk' has a closing tag + # to make the parser happy. + if (!$xmldoc=~m|</bx-game:game>|){ + $xmldoc .= "</bx-game:game>"; + } + # Grab the TOP LEVEL seq.. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + } else { # No sequences. + return 0; + } + $seq = @{$self->{'seqs'}}[0]; + # If not chunkable, + # only read document into memory once! + } elsif (!$self->{'xmldoc'}) { + $self->{'xmldoc'}=$self->{'header'}.$self->{'leftovers'}; + while (my $next_line = $self->_readline) { + $self->{'xmldoc'} .= $next_line; + } + $xmldoc=$self->{'xmldoc'}; + # Get the seq id index. + if ($xmldoc =~ m|</bx-seq:seq|) { + my $handler = Bio::SeqIO::game::idHandler->new(); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + $self->{'seqs'} = $parser->parse(Source => { String => $xmldoc }); + $seq = shift @{$self->{'seqs'}}; + } else { # No sequences. + return 0; + } + my $seq = @{$self->{'seqs'}}[0]; + # if we already have the doc in memory, + # just get the doc. + } elsif ($self->{'xmldoc'}) { + $xmldoc=$self->{'xmldoc'}; + $seq = shift @{$self->{'seqs'}}; + } + + #print $xmldoc; + + if ($seq) { + # Get the next seq. + my $handler = Bio::SeqIO::game::seqHandler->new(-seq => $seq); + my $options = {Handler=>$handler}; + my $parser = XML::Parser::PerlSAX->new($options); + my $pseq = $parser->parse(Source => { String => $xmldoc }); + return $pseq; + } else { + return 0; + } +} + + +=head2 write_seq + + Title : write_seq + Usage : Not Yet Implemented! $stream->write_seq(@seq) + Function: writes the $seq object into the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq object + + +=cut + +sub write_seq { + my ($self,@seqs) = @_; + + my $bxfeat = "http://www.bioxml.org/dtds/current/feature.dtd"; + my $bxann = "http://www.bioxml.org/dtds/current/annotation.dtd"; + my $bxcomp = "http://www.bioxml.org/dtds/current/computation.dtd"; + my $bxgame = "http://www.bioxml.org/dtds/current/game.dtd"; + my $bxlink = "http://www.bioxml.org/dtds/current/link.dtd"; + my $bxseq = "http://www.bioxml.org/dtds/current/seq.dtd"; + + my $writer = new XML::Writer(OUTPUT => $self->_fh || \*STDOUT, + NAMESPACES => 1, + DATA_MODE => 1, + DATA_INDENT => 4, + PREFIX_MAP => { + '' => '', # to keep undef warnings away in XML::Writer, fill in with something as a default prefix later? + $bxfeat => 'bx-feature', + $bxann => 'bx-annotation', + $bxcomp => 'bx-computation', + $bxgame => 'bx-game', + $bxlink => 'bx-link', + $bxseq => 'bx-seq' + }); + $writer->xmlDecl("UTF-8"); + $writer->doctype("bx-game:game", 'game', $bxgame); + $writer ->startTag ([$bxgame, 'game']); + $writer->startTag ([$bxgame, 'flavor']); + $writer->characters('chunkable'); + $writer->endTag ([$bxgame, 'flavor']); + + foreach my $seq (@seqs) { + $writer->startTag([$bxseq, 'seq'], + [$bxseq, 'id'] => $seq->display_id, + [$bxseq, 'length'] => $seq->length, + [$bxseq, 'type'] => $seq->alphabet); + if ($seq->length > 0) { + $writer->startTag([$bxseq, 'residues']); + $writer->characters($seq->seq); + $writer->endTag([$bxseq, 'residues']); + } + $writer->endTag([$bxseq, 'seq']); + + my @feats = $seq->all_SeqFeatures; + + my $features; + foreach my $feature (@feats) { + if ($feature->has_tag('annotation_id')) { + my @ann_id = $feature->each_tag_value('annotation_id'); + push (@{$features->{'annotations'}->{$ann_id[0]}}, $feature); + } elsif ($feature->has_tag('computation_id')) { + my @comp_id = $feature->each_tag_value('computation_id'); + push (@{$features->{'computations'}->{$comp_id[0]}}, $feature); + } else { + push (@{$features->{'everybody_else'}}, $feature); + } + } + foreach my $key (keys %{$features->{'annotations'}}) { + $writer->startTag([$bxann, 'annotation'], + [$bxann, 'id']=>$key + ); + $writer->startTag([$bxann, 'seq_link']); + $writer->startTag([$bxlink, 'link']); + $writer->emptyTag([$bxlink, 'ref_link'], + [$bxlink, 'ref'] => $seq->display_id()); + $writer->endTag([$bxlink, 'link']); + $writer->endTag([$bxann, 'seq_link']); + $self->__draw_feature_set($writer, $seq, $bxann, "", @{$features->{'annotations'}->{$key}}); + $writer->endTag([$bxann, 'annotation']); + } + + foreach my $key (keys %{$features->{'computations'}}) { + $writer->startTag([$bxcomp, 'computation'], + [$bxcomp, 'id']=>$key + ); + $writer->startTag([$bxcomp, 'seq_link']); + $writer->startTag([$bxlink, 'link']); + $writer->emptyTag([$bxlink, 'ref_link'], + [$bxlink, 'ref'] => $seq->display_id()); + $writer->endTag([$bxlink, 'link']); + $writer->endTag([$bxcomp, 'seq_link']); + $self->__draw_feature_set($writer, $seq, $bxcomp, "", @{$features->{'computations'}->{$key}}); + $writer->endTag([$bxcomp, 'computation']); + } + foreach my $feature (@{$features->{'everybody_else'}}) { + $self->__draw_feature($writer, $feature, $seq, "", + $self->_export_subfeatures()); + } + } + $writer->endTag([$bxgame, 'game']); + + $self->flush if $self->_flush_on_write && defined $self->_fh; + return 1; +} + + +#these two subroutines are very specific! + +sub __draw_feature_set { + my ($self, $writer, $seq, $namespace, $parent, @features) = @_; + my ($feature_set_id); + + my $bxfeat = "http://www.bioxml.org/dtds/current/feature.dtd"; + + if ($self->_export_subfeatures() && $self->_group_subfeatures()) { + $feature_set_id = $self->{'id_counter'}; $self->{'id_counter'}++; + $writer->startTag([$namespace, 'feature_set'], + [$namespace, 'id'] => $feature_set_id); + foreach my $feature (@features) { + $self->__draw_feature($writer, $feature, $seq, $parent , 0); + } + $writer->endTag([$namespace, 'feature_set']); + foreach my $feature (@features) { + foreach my $subset ($self->_subfeature_types()) { + if (my @subfeatures = eval ( '$feature->' . $subset . '()' )) { + my @id = $feature->each_tag_value('id'); + $self->__draw_feature_set($writer, $seq, $namespace, $id[0], @subfeatures); + } + } + } + + } else { + $feature_set_id = $self->{'id_counter'}; $self->{'id_counter'}++; + $writer->startTag([$namespace, 'feature_set'], + [$namespace, 'id'] => $feature_set_id); + foreach my $feature (@features) { + $self->__draw_feature($writer, $feature, $seq, "" , $self->_export_subfeatures()); + } + $writer->endTag([$namespace, 'feature_set']); + } +} + + +sub __draw_feature { + my ($self, $writer, $feature, $seq, $parent, $recursive) = @_; + my ($subfeature, $subset, @subfeatures, $score, $score_val, $score_no); + my $bxfeat = "http://www.bioxml.org/dtds/current/feature.dtd"; + + if (!$feature->has_tag('id')) { + $feature->add_tag_value('id', $self->{'id_counter'}); + $self->{'id_counter'}++; + } + + my @id = $feature->each_tag_value('id'); + if ($parent) { + $writer->startTag([$bxfeat, 'feature'], + [$bxfeat, 'id'] => $id[0] + ); + } else { + $writer->startTag([$bxfeat, 'feature'], + [$bxfeat, 'id'] => $id[0], + [$bxfeat, 'parent'] => $parent + ); + } + $writer->startTag([$bxfeat, 'type']); + $writer->characters($feature->primary_tag()); + $writer->endTag([$bxfeat, 'type']); + foreach $score ($feature->all_tags()) { + next if ($score eq 'id'); + $writer->startTag([$bxfeat, 'score'], + [$bxfeat, 'type'] => $score + ); + $score_no = 0; + foreach $score_val ($feature->each_tag_value($score)) { + next unless defined $score_val; + $writer->characters(' ') if ($score_no > 0); + $writer->characters($score_val); + $score_no++; + } + $writer->endTag([$bxfeat, 'score']); + } + + $writer->startTag([$bxfeat, 'seq_relationship'], + [$bxfeat, 'seq'] => $seq->display_id, + [$bxfeat, 'type'] => 'query' + ); + + $writer->startTag([$bxfeat, 'span']); + $writer->startTag([$bxfeat, 'start']); + $writer->characters($feature->start()); + $writer->endTag([$bxfeat, 'start']); + $writer->startTag([$bxfeat, 'end']); + $writer->characters($feature->end()); + $writer->endTag([$bxfeat, 'end']); + $writer->endTag([$bxfeat, 'span']); + $writer->endTag([$bxfeat, 'seq_relationship']); + $writer->endTag([$bxfeat, 'feature']); + + #proces subseqfeature's, exons, introns, promotors, whatever... + if ($recursive) { + foreach $subset ($self->_subfeature_types()) { + #determine if it exists + if (@subfeatures = eval ( '$feature->' . $subset . '()' )) { + foreach $subfeature (@subfeatures) { + $self->__draw_feature ($writer, $subfeature, $seq, $id[0], 1); + } + } + } + } +} + +1; +