Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SeqIO/chado.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/chado.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,417 @@ +# $Id: chado.pm,v 1.1 2002/12/03 08:13:55 cjm Exp $ +# +# BioPerl module for Bio::SeqIO::chado +# +# Chris Mungall <cjm@fruitfly.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::chado - chado sequence input/output stream + +=head1 SYNOPSIS + +It is probably best not to use this object directly, but +rather go through the SeqIO handler system. Go: + + $stream = Bio::SeqIO->new(-file => $filename, -format => 'chado'); + + while ( my $seq = $stream->next_seq() ) { + # do something with $seq + } + +=head1 DESCRIPTION + +This object can transform Bio::Seq objects to and from chado flat +file databases. CURRENTLY ONLY TO + + +=head2 Optional functions + +=over 3 + +=item _show_dna() + +(output only) shows the dna or not + +=item _post_sort() + +(output only) provides a sorting func which is applied to the FTHelpers +before printing + + +=back + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this +and other Bioperl modules. Send your comments and suggestions preferably + to one of the Bioperl mailing lists. +Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://www.bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bio.perl.org/bioperl-bugs/ + +=head1 AUTHOR - Chris Mungall + +Email cjm@fruitfly.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::SeqIO::chado; +use vars qw(@ISA); +use strict; + +use Bio::SeqIO; +use Bio::SeqFeature::Generic; +use Bio::Species; +use Bio::Seq::SeqFactory; +use Bio::Annotation::Collection; +use Bio::Annotation::Comment; +use Bio::Annotation::Reference; +use Bio::Annotation::DBLink; + +use Data::Stag qw(:all); + +@ISA = qw(Bio::SeqIO); + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + if( ! defined $self->sequence_factory ) { + $self->sequence_factory(new Bio::Seq::SeqFactory + (-verbose => $self->verbose(), + -type => 'Bio::Seq::RichSeq')); + } + my $wclass = $self->default_handler_class; + $self->handler($wclass->new); + $self->{_end_of_data} = 0; + $self->handler->S("chado"); + return; +} + +sub DESTROY { + my $self = shift; + $self->end_of_data(); + $self->SUPER::DESTROY(); +} + +sub end_of_data { + my $self = shift; + $self->{_end_of_data} = 1; + $self->handler->E("chado"); +} + +sub default_handler_class { + return "Data::Stag::BaseHandler"; +} + +=head2 next_seq + + Title : next_seq + Usage : $seq = $stream->next_seq() + Function: returns the next sequence in the stream + Returns : Bio::Seq object + Args : + +=cut + +sub next_seq { + my ($self,@args) = @_; + my $seq = $self->sequence_factory->create + ( + # '-verbose' =>$self->verbose(), + # %params, + # -seq => $seqc, + # -annotation => $annotation, + # -features => \@features + ); + return $seq; +} + +sub handler { + my $self = shift; + $self->{_handler} = shift if @_; + return $self->{_handler}; +} + + +=head2 write_seq + + Title : write_seq + Usage : $stream->write_seq($seq) + Function: writes the $seq object (must be seq) to the stream + Returns : 1 for success and 0 for error + Args : Bio::Seq + + +=cut + +sub write_seq { + my ($self,$seq) = @_; + + if( !defined $seq ) { + $self->throw("Attempting to write with no seq!"); + } + + if( ! ref $seq || ! $seq->isa('Bio::SeqI') ) { + $self->warn(" $seq is not a SeqI compliant module. Attempting to dump, but may fail!"); + } + + # get a handler - must inherit from Data::Stag::BaseHandler; + my $w = $self->handler; + + # start of data + $w->S("seqset"); + + # my $seq_temp_uid = $self->get_temp_uid($seq); + + my $seq_temp_uid = $seq->accession . '.' . ($seq->can('seq_version') ? $seq->seq_version : $seq->version); + + # data structure representing the core sequence for this record + my $seqnode = + Data::Stag->new(feature=>[ + [feature_id=>$seq_temp_uid], + [dbxrefstr=>$seq->accession_number], + [name=>$seq->display_name], + [residues=>$seq->seq], + ]); + + # soft properties + my %prop = (); + + my ($div, $mol); + my $len = $seq->length(); + + if ( $seq->can('division') ) { + $div=$seq->division; + } + if( !defined $div || ! $div ) { $div = 'UNK'; } + + if( !$seq->can('molecule') || ! defined ($mol = $seq->molecule()) ) { + $mol = $seq->alphabet || 'DNA'; + } + + + my $circular = 'linear '; + $circular = 'circular' if $seq->is_circular; + + # cheeky hack - access symbol table + no strict 'refs'; + map { + $prop{$_} = + $ {*$_}; + } qw(mol div circular); + use strict 'refs'; + + map { + $prop{$_} = $seq->$_() if $seq->can($_); + } qw(desc keywords); + + local($^W) = 0; # supressing warnings about uninitialized fields. + + # Organism lines + if (my $spec = $seq->species) { + my ($species, $genus, @class) = $spec->classification(); + my $OS; + if( $spec->common_name ) { + $OS = $spec->common_name; + } else { + $OS = "$genus $species"; + } + if (my $ssp = $spec->sub_species) { + $OS .= " $ssp"; + } + } + + # Reference lines + my $count = 1; + foreach my $ref ( $seq->annotation->get_Annotations('reference') ) { + # TODO + } + # Comment lines + + foreach my $comment ( $seq->annotation->get_Annotations('comment') ) { + $seqnode->add_featureprop([[pkey=>'comment'],[pval=>$comment->text]]); + } + + # throw the writer an event + $w->ev(@$seqnode); + + $seqnode = undef; # free memory + + # make events for all the features within the record + foreach my $sf ( $seq->top_SeqFeatures ) { + $self->write_sf($sf, $seq_temp_uid); + } + + # data end + $w->E("seqset"); + return 1; +} + +# ---- +# writes a seq feature +# ---- + +sub write_sf { + my $self = shift; + my $sf = shift; + my $seq_temp_uid = shift; + + my $w = $self->handler; + + my %props = + map { + $_=>[$sf->each_tag_value($_)] + } $sf->all_tags; + + my $loc = $sf->location; + my $name = $sf->display_name; + my $type = $sf->primary_tag; + my @subsfs = $sf->sub_SeqFeature; + my @locnodes = (); + my $sid = $loc->is_remote ? $loc->seq_id : $seq_temp_uid; + if( $loc->isa("Bio::Location::SplitLocationI") ) { + # turn splitlocs into subfeatures + my $n = 1; + push(@subsfs, + map { + my $ssf = + Bio::SeqFeature::Generic->new( + + -start=>$_->start, + -end=>$_->end, + -strand=>$_->strand, + -primary=>$self->subpartof($type), + ); + if ($_->is_remote) { + $ssf->location->is_remote(1); + $ssf->location->seq_id($_->seq_id); + } + $ssf; + } $loc->each_Location); + } + elsif( $loc->isa("Bio::Location::RemoteLocationI") ) { + # turn splitlocs into subfeatures + my $n = 1; + push(@subsfs, + map { + Bio::SeqFeature::Generic->new( +# -name=>$name.'.'.$n++, + -start=>$_->start, + -end=>$_->end, + -strand=>$_->strand, + -primary=>$self->subpartof($type), + ) + } $loc->each_Location); + } + else { + my ($beg, $end, $strand) = $self->bp2ib($loc); + @locnodes = ( + [featureloc=>[ + [nbeg=>$beg], + [nend=>$end], + [strand=>$strand], + [srcfeature_id=>$sid], + [group=>0], + [rank=>0], + ] + ] + ); + } + my $feature_id = $self->get_temp_uid($sf); + + my $fnode = + [feature=>[ + [feature_id=>$feature_id], + [name=>$name], + [typename=>$type], + @locnodes, + (map { + my $k = $_; + map { [featureprop=>[[pkey=>$k],[pval=>$_]]] } @{$props{$k}} + } keys %props), + ]]; + $w->ev(@$fnode); + + foreach my $ssf (@subsfs) { + my $ssfid = $self->write_sf($ssf, $sid); + $w->ev(feature_relationship=>[ + [subjfeature_id=>$ssfid], + [objfeature_id=>$feature_id] + ] + ); + } + return $feature_id; +} + +# private; +# an ID for this session that should be +# unique... hmm +sub session_id { + my $self = shift; + $self->{_session_id} = shift if @_; + if (!$self->{_session_id}) { + $self->{_session_id} = $$.time; + } + return $self->{_session_id}; +} + + +our $next_id = 1; +our %obj2id_hash = (); +sub get_temp_uid { + my $self = shift; + my $ob = shift; + my $id = $obj2id_hash{$ob}; + if (!$id) { + $id = $next_id++; + $obj2id_hash{$ob} = $id; + } + return $self->session_id.'.'.$id; +} + +# interbase and directional semantics +sub bp2ib { + my $self = shift; + my $loc = shift; + my ($s, $e, $str) = + ref($loc) eq "ARRAY" ? (@$loc) : ($loc->start, $loc->end, $loc->strand); + if ($str < 0) { + ($s, $e) = ($e, $s); + } + $s--; + return ($s, $e, $str); +} + +sub subpartof { + my $self = shift; + my $type = 'partof_'.shift; + $type =~ s/partof_CDS/CDS_exon/; + $type =~ s/partof_\wRNA/exon/; + return $type; +} + +1;