Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Tools/GFF.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/GFF.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,621 @@ +# $Id: GFF.pm,v 1.26 2002/11/24 21:35:40 jason Exp $ +# +# BioPerl module for Bio::Tools::GFF +# +# Cared for by the Bioperl core team +# +# Copyright Matthew Pocock +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser + +=head1 SYNOPSIS + + use Bio::Tools::GFF; + + # specify input via -fh or -file + my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2); + my $feature; + # loop over the input stream + while($feature = $gffio->next_feature()) { + # do something with feature + } + $gffio->close(); + + # you can also obtain a GFF parser as a SeqAnalasisParserI in + # HT analysis pipelines (see Bio::SeqAnalysisParserI and + # Bio::Factory::SeqAnalysisParserFactory) + my $factory = Bio::Factory::SeqAnalysisParserFactory->new(); + my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff"); + while($feature = $parser->next_feature()) { + # do something with feature + } + +=head1 DESCRIPTION + +This class provides a simple GFF parser and writer. In the sense of a +SeqAnalysisParser, it parses an input file or stream into SeqFeatureI +objects, but is not in any way specific to a particular analysis +program and the output that program produces. + +That is, if you can get your analysis program spit out GFF, here is +your result parser. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Matthew Pocock + +Email mrp@sanger.ac.uk + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::GFF; + +use vars qw(@ISA); +use strict; + +use Bio::Root::IO; +use Bio::SeqAnalysisParserI; +use Bio::SeqFeature::Generic; + +@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); + +=head2 new + + Title : new + Usage : + Function: Creates a new instance. Recognized named parameters are -file, -fh, + and -gff_version. + + Returns : a new object + Args : names parameters + + +=cut + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my ($gff_version) = $self->_rearrange([qw(GFF_VERSION)],@args); + + # initialize IO + $self->_initialize_io(@args); + + $gff_version ||= 2; + if(($gff_version != 1) && ($gff_version != 2)) { + $self->throw("Can't build a GFF object with the unknown version ". + $gff_version); + } + $self->gff_version($gff_version); + return $self; +} + +=head2 next_feature + + Title : next_feature + Usage : $seqfeature = $gffio->next_feature(); + Function: Returns the next feature available in the input file or stream, or + undef if there are no more features. + Example : + Returns : A Bio::SeqFeatureI implementing object, or undef if there are no + more features. + Args : none + +=cut + +sub next_feature { + my ($self) = @_; + + my $gff_string; + + # be graceful about empty lines or comments, and make sure we return undef + # if the input's consumed + while(($gff_string = $self->_readline()) && defined($gff_string)) { + next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ || + $gff_string =~ /^\/\//); + last; + } + return undef unless $gff_string; + + my $feat = Bio::SeqFeature::Generic->new(); + $self->from_gff_string($feat, $gff_string); + + return $feat; +} + +=head2 from_gff_string + + Title : from_gff_string + Usage : $gff->from_gff_string($feature, $gff_string); + Function: Sets properties of a SeqFeatureI object from a GFF-formatted + string. Interpretation of the string depends on the version + that has been specified at initialization. + + This method is used by next_feature(). It actually dispatches to + one of the version-specific (private) methods. + Example : + Returns : void + Args : A Bio::SeqFeatureI implementing object to be initialized + The GFF-formatted string to initialize it from + +=cut + +sub from_gff_string { + my ($self, $feat, $gff_string) = @_; + + if($self->gff_version() == 1) { + $self->_from_gff1_string($feat, $gff_string); + } else { + $self->_from_gff2_string($feat, $gff_string); + } +} + +=head2 _from_gff1_string + + Title : _from_gff1_string + Usage : + Function: + Example : + Returns : void + Args : A Bio::SeqFeatureI implementing object to be initialized + The GFF-formatted string to initialize it from + +=cut + +sub _from_gff1_string { + my ($gff, $feat, $string) = @_; + chomp $string; + my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @group) = split(/\t/, $string); + + if ( !defined $frame ) { + $feat->throw("[$string] does not look like GFF to me"); + } + $frame = 0 unless( $frame =~ /^\d+$/); + $feat->seq_id($seqname); + $feat->source_tag($source); + $feat->primary_tag($primary); + $feat->start($start); + $feat->end($end); + $feat->frame($frame); + if ( $score eq '.' ) { + #$feat->score(undef); + } else { + $feat->score($score); + } + if ( $strand eq '-' ) { $feat->strand(-1); } + if ( $strand eq '+' ) { $feat->strand(1); } + if ( $strand eq '.' ) { $feat->strand(0); } + foreach my $g ( @group ) { + if ( $g =~ /(\S+)=(\S+)/ ) { + my $tag = $1; + my $value = $2; + $feat->add_tag_value($1, $2); + } else { + $feat->add_tag_value('group', $g); + } + } +} + +=head2 _from_gff2_string + + Title : _from_gff2_string + Usage : + Function: + Example : + Returns : void + Args : A Bio::SeqFeatureI implementing object to be initialized + The GFF2-formatted string to initialize it from + + +=cut + +sub _from_gff2_string { + my ($gff, $feat, $string) = @_; + chomp($string); + # according to the Sanger website, GFF2 should be single-tab separated elements, and the + # free-text at the end should contain text-translated tab symbols but no "real" tabs, + # so splitting on \t is safe, and $attribs gets the entire attributes field to be parsed later + my ($seqname, $source, $primary, $start, $end, $score, $strand, $frame, @attribs) = split(/\t+/, $string); + my $attribs = join '', @attribs; # just in case the rule against tab characters has been broken + if ( !defined $frame ) { + $feat->throw("[$string] does not look like GFF2 to me"); + } + $feat->seq_id($seqname); + $feat->source_tag($source); + $feat->primary_tag($primary); + $feat->start($start); + $feat->end($end); + $feat->frame($frame); + if ( $score eq '.' ) { + #$feat->score(undef); + } else { + $feat->score($score); + } + if ( $strand eq '-' ) { $feat->strand(-1); } + if ( $strand eq '+' ) { $feat->strand(1); } + if ( $strand eq '.' ) { $feat->strand(0); } + + + # <Begin Inefficient Code from Mark Wilkinson> + # this routine is necessay to allow the presence of semicolons in + # quoted text Semicolons are the delimiting character for new + # tag/value attributes. it is more or less a "state" machine, with + # the "quoted" flag going up and down as we pass thorugh quotes to + # distinguish free-text semicolon and hash symbols from GFF control + # characters + + + my $flag = 0; # this could be changed to a bit and just be twiddled + my @parsed; + + # run through each character one at a time and check it + # NOTE: changed to foreach loop which is more efficient in perl + # --jasons + + foreach my $a ( split //, $attribs ) { + # flag up on entering quoted text, down on leaving it + if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 } + elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"} + elsif( $a eq '#' && ! $flag ) { last } + push @parsed, $a; + } + $attribs = join "", @parsed; # rejoin into a single string + + # <End Inefficient Code> + # Please feel free to fix this and make it more "perlish" + + my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited + + foreach my $pair ( @key_vals ) { + # replace semicolons that were removed from free-text above. + $pair =~ s/INSERT_SEMICOLON_HERE/;/g; + + # separate the key from the value + my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair; + + + if( defined $values ) { + my @values; + # free text is quoted, so match each free-text block + # and remove it from the $values string + while ($values =~ s/"(.*?)"//){ + # and push it on to the list of values (tags may have + # more than one value... and the value may be undef) + push @values, $1; + } + + # and what is left over should be space-separated + # non-free-text values + + my @othervals = split /\s+/, $values; + foreach my $othervalue(@othervals){ + # get rid of any empty strings which might + # result from the split + if (CORE::length($othervalue) > 0) {push @values, $othervalue} + } + + foreach my $value(@values){ + $feat->add_tag_value($key, $value); + } + } + } +} + +=head2 write_feature + + Title : write_feature + Usage : $gffio->write_feature($feature); + Function: Writes the specified SeqFeatureI object in GFF format to the stream + associated with this instance. + Returns : none + Args : An array of Bio::SeqFeatureI implementing objects to be serialized + +=cut + +sub write_feature { + my ($self, @features) = @_; + foreach my $feature ( @features ) { + $self->_print($self->gff_string($feature)."\n"); + } +} + +=head2 gff_string + + Title : gff_string + Usage : $gffstr = $gffio->gff_string($feature); + Function: Obtain the GFF-formatted representation of a SeqFeatureI object. + The formatting depends on the version specified at initialization. + + This method is used by write_feature(). It actually dispatches to + one of the version-specific (private) methods. + Example : + Returns : A GFF-formatted string representation of the SeqFeature + Args : A Bio::SeqFeatureI implementing object to be GFF-stringified + +=cut + +sub gff_string{ + my ($self, $feature) = @_; + + if($self->gff_version() == 1) { + return $self->_gff1_string($feature); + } else { + return $self->_gff2_string($feature); + } +} + +=head2 _gff1_string + + Title : _gff1_string + Usage : $gffstr = $gffio->_gff1_string + Function: + Example : + Returns : A GFF1-formatted string representation of the SeqFeature + Args : A Bio::SeqFeatureI implementing object to be GFF-stringified + +=cut + +sub _gff1_string{ + my ($gff, $feat) = @_; + my ($str,$score,$frame,$name,$strand); + + if( $feat->can('score') ) { + $score = $feat->score(); + } + $score = '.' unless defined $score; + + if( $feat->can('frame') ) { + $frame = $feat->frame(); + } + $frame = '.' unless defined $frame; + + $strand = $feat->strand(); + if(! $strand) { + $strand = "."; + } elsif( $strand == 1 ) { + $strand = '+'; + } elsif ( $feat->strand == -1 ) { + $strand = '-'; + } + + if( $feat->can('seqname') ) { + $name = $feat->seq_id(); + $name ||= 'SEQ'; + } else { + $name = 'SEQ'; + } + + + $str = join("\t", + $name, + $feat->source_tag(), + $feat->primary_tag(), + $feat->start(), + $feat->end(), + $score, + $strand, + $frame); + + foreach my $tag ( $feat->all_tags ) { + foreach my $value ( $feat->each_tag_value($tag) ) { + $str .= " $tag=$value"; + } + } + + + return $str; +} + +=head2 _gff2_string + + Title : _gff2_string + Usage : $gffstr = $gffio->_gff2_string + Function: + Example : + Returns : A GFF2-formatted string representation of the SeqFeature + Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified + +=cut + +sub _gff2_string{ + my ($gff, $feat) = @_; + my ($str,$score,$frame,$name,$strand); + + if( $feat->can('score') ) { + $score = $feat->score(); + } + $score = '.' unless defined $score; + + if( $feat->can('frame') ) { + $frame = $feat->frame(); + } + $frame = '.' unless defined $frame; + + $strand = $feat->strand(); + if(! $strand) { + $strand = "."; + } elsif( $strand == 1 ) { + $strand = '+'; + } elsif ( $feat->strand == -1 ) { + $strand = '-'; + } + + if( $feat->can('seqname') ) { + $name = $feat->seq_id(); + $name ||= 'SEQ'; + } else { + $name = 'SEQ'; + } + $str = join("\t", + $name, + $feat->source_tag(), + $feat->primary_tag(), + $feat->start(), + $feat->end(), + $score, + $strand, + $frame); + + # the routine below is the only modification I made to the original + # ->gff_string routine (above) as on November 17th, 2000, the + # Sanger webpage describing GFF2 format reads: "From version 2 + # onwards, the attribute field must have a tag value structure + # following the syntax used within objects in a .ace file, + # flattened onto one line by semicolon separators. Tags must be + # standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values + # must be quoted with double quotes". + + # MW + + my $valuestr; + my @all_tags = $feat->all_tags; + if (@all_tags) { # only play this game if it is worth playing... + $str .= "\t"; # my interpretation of the GFF2 + # specification suggests the need + # for this additional TAB character...?? + foreach my $tag ( @all_tags ) { + my $valuestr; # a string which will hold one or more values + # for this tag, with quoted free text and + # space-separated individual values. + foreach my $value ( $feat->each_tag_value($tag) ) { + if ($value =~ /[^A-Za-z0-9_]/){ + $value =~ s/\t/\\t/g; # substitute tab and newline + # characters + $value =~ s/\n/\\n/g; # to their UNIX equivalents + $value = '"' . $value . '" '} # if the value contains + # anything other than valid + # tag/value characters, then + # quote it + $value = "\"\"" unless defined $value; + # if it is completely empty, + # then just make empty double + # quotes + $valuestr .= $value . " "; # with a trailing space in case + # there are multiple values + # for this tag (allowed in GFF2 and .ace format) + } + $str .= "$tag $valuestr ; "; # semicolon delimited with no '=' sign + } + chop $str; chop $str # remove the trailing semicolon and space + } + return $str; +} + +=head2 gff_version + + Title : _gff_version + Usage : $gffversion = $gffio->gff_version + Function: + Example : + Returns : The GFF version this parser will accept and emit. + Args : none + +=cut + +sub gff_version { + my ($self, $value) = @_; + if(defined $value && (($value == 1) || ($value == 2))) { + $self->{'GFF_VERSION'} = $value; + } + return $self->{'GFF_VERSION'}; +} + +# Make filehandles + +=head2 newFh + + Title : newFh + Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') + Function: does a new() followed by an fh() + Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') + $feature = <$fh>; # read a feature object + print $fh $feature ; # write a feature object + Returns : filehandle tied to the Bio::Tools::GFF class + Args : + +=cut + +sub newFh { + my $class = shift; + return unless my $self = $class->new(@_); + return $self->fh; +} + +=head2 fh + + Title : fh + Usage : $obj->fh + Function: + Example : $fh = $obj->fh; # make a tied filehandle + $feature = <$fh>; # read a feature object + print $fh $feature; # write a feature object + Returns : filehandle tied to Bio::Tools::GFF class + Args : none + +=cut + + +sub fh { + my $self = shift; + my $class = ref($self) || $self; + my $s = Symbol::gensym; + tie $$s,$class,$self; + return $s; +} + +sub DESTROY { + my $self = shift; + + $self->close(); +} + +sub TIEHANDLE { + my ($class,$val) = @_; + return bless {'gffio' => $val}, $class; +} + +sub READLINE { + my $self = shift; + return $self->{'gffio'}->next_feature() unless wantarray; + my (@list, $obj); + push @list, $obj while $obj = $self->{'gffio'}->next_feature(); + return @list; +} + +sub PRINT { + my $self = shift; + $self->{'gffio'}->write_feature(@_); +} + +1; +