Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
=pod

=head1 LICENSE

  Copyright (c) 1999-2012 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 NAME

GFFParser - simple gff3 parser.


=head1 AUTHOR

Monika Komorowska, 2012 - monika@ebi.ac.uk

=head1 SYNOPSIS

  use strict;
  use Bio::EnsEMBL::Utils::IO::GFFParser;
  use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
  use IO::File;

  my $file_name = "features.gff";
  my $fh = IO::File->new($file_name, 'r');
  my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);

  my @header_lines = @{$parser->parse_header()};
  #do something with the header lines array, e.g. print array elements

  foreach my $header_line (@header_lines) {
      print $header_line . "\n";
  }
  print "\n\n";
  my $feature = $parser->parse_next_feature();

  while (defined($feature) ) {

      my %feature = %{$feature};

      #do something with the feature, e.g. print hash keys and values
      foreach my $key (keys %feature) {
          if ($key ne 'attribute') {
              print $key . " " . $feature{$key} ."\n";
          } else {
              print $key . "\n";
              my %attribs = %{$feature{$key}};
              foreach my $attrib_key (keys %attribs) {
                  #an attribute value is a scalar, or an arrayref when it
                  #held a comma separated list; wrap_array handles both
                  my $values = $attribs{$attrib_key};
                  printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($values)}));
              }
          }
      }
      print "\n\n";
      $feature = $parser->parse_next_feature();
  }

  my $sequence = $parser->parse_next_sequence();

  while (defined($sequence)) {
      my %sequence = %{$sequence};

      foreach my $key (keys %sequence) {
          print $key . " " . $sequence{$key} ."\n";
      }
      print "\n\n";

      $sequence = $parser->parse_next_sequence();
  }

  $parser->close();

  $fh->close();

=head1 DESCRIPTION

GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml

Use the parse_header method to parse a GFF3 file header, parse_next_feature
to parse the next feature line in the file, and parse_next_sequence to parse
the next FASTA record of the optional sequence section.

This class can be extended to convert a feature hash into a feature object reversing
the processing done by GFFSerializer.

=cut

package Bio::EnsEMBL::Utils::IO::GFFParser;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Exception;
use IO::File;
use URI::Escape;
use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;


# Maps the GFF3 strand column onto numeric strands.  '.' (feature is not
# stranded) is mapped to 0 like '?' (strand unknown) instead of falling
# through to undef.
my %strand_conversion = ( '+' => '1', '?' => '0', '.' => '0', '-' => '-1');

=head2 new

    Constructor
    Arg [1]    : File handle
    Exceptions : Throws if no file handle is given
    Returntype : Bio::EnsEMBL::Utils::IO::GFFParser

=cut

sub new {
    my $class = shift;
    my $self = {
        filehandle => shift,
    };
    bless $self, $class;
    if (!defined($self->{'filehandle'})) {
        throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
    }
    return $self;
}

=head2 parse_header

    Arg [...]  : None
    Description: Reads the leading comment/directive lines of the file and
                 returns an arrayref holding every line that starts with '##'.
                 Reading stops at the first line that starts with neither '#'
                 nor whitespace, or at the ##FASTA directive; that line is
                 kept back so the next parse_* call sees it.
                 A warning is issued if a gff-version other than 3 is declared.
    Returntype : Arrayref of GFF3 file header lines

=cut

sub parse_header {
    my $self = shift;

    my $next_line;
    my @header_lines;

    # '|' is deliberately not part of the character class: it is a legal
    # first character of a seqid, so such a line belongs to the features.
    while (defined($next_line = $self->_read_line()) && $next_line =~ /^[\#\s]/) {

        #stop parsing the header if the ##FASTA directive is encountered
        last if ($next_line =~ /^\#\#FASTA/);

        #header lines start with ## (except for the ##FASTA directive indicating sequence section)
        if ($next_line =~ /^\#\#/) {
            push @header_lines, $next_line;
            if ($next_line =~ /gff-version\s+(\d+)/) {
                my $version = $1;
                if ($version != 3) {
                    warning("File has been formatted in GFF version $version. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
                }
            }
        }
    }

    #hand the line that ended the header back to the next reader
    if (defined($next_line)) {
        $self->{'first_non_header_line'} = $next_line;
    }
    return \@header_lines;
}

=head2 parse_next_feature

    Arg [...]  : None
    Description: Returns a hashref in the format -
                 {
                   seqid     => scalar,
                   source    => scalar,
                   type      => scalar,
                   start     => scalar,
                   end       => scalar,
                   score     => scalar,
                   strand    => scalar (1, 0 or -1; undef if unrecognised),
                   phase     => scalar,
                   attribute => hashref (only present if column 9 is),
                 }
                 Attribute values are scalars, or arrayrefs when the value
                 was a comma separated list.  Returns undef at the end of
                 the file or once the ##FASTA directive has been reached.
    Returntype : Hashref of a GFF3 feature line

=cut

sub parse_next_feature {
    my $self = shift;

    # Everything after ##FASTA is sequence data, never a feature.
    return undef if $self->{'fasta_section_reached'};

    my $feature_line;

    # Test definedness, not truth, so a line consisting of "0" does not
    # look like the end of the file.
    while (defined(my $next_line = $self->_read_line())) {

        #stop parsing features if ##FASTA directive
        if ($next_line =~ /^\#\#FASTA/) {
            $self->{'fasta_section_reached'} = 1;
            last;
        }

        #skip comments, directives, blank lines and '//' separators
        next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
                 $next_line =~ /^\/\//);

        $feature_line = $next_line;
        last;
    }

    return undef unless defined($feature_line);

    #strip off trailing comments
    # NOTE(review): this also truncates values containing a raw '#';
    # kept because existing callers may rely on it.
    $feature_line =~ s/\#.*//;

    my @chunks = split(/\t/, $feature_line);

    # Guard the lookup so a truncated line does not raise an
    # "uninitialized value" warning.
    my $strand = defined($chunks[6]) ? $strand_conversion{$chunks[6]} : undef;

    my %feature = (
        'seqid'  => uri_unescape($chunks[0]),
        'source' => uri_unescape($chunks[1]),
        'type'   => uri_unescape($chunks[2]),
        'start'  => $chunks[3],
        'end'    => $chunks[4],
        'score'  => $chunks[5],
        'strand' => $strand,
        'phase'  => $chunks[7],
    );

    if (defined($chunks[8]) && length($chunks[8])) {
        my %attributes;
        foreach my $attribute (split(/;/, $chunks[8])) {
            # Limit of 2: only the first '=' separates tag from value, so
            # a value that itself contains '=' is kept whole.
            my ($name, $value) = split(/=/, $attribute, 2);

            #empty segment, e.g. produced by ';;'
            next unless defined($name);

            $name = uri_unescape($name);

            #a tag without a value yields no values rather than a warning
            my @split_values;
            if (defined($value)) {
                @split_values = map { uri_unescape($_) } split(/,/, $value);
            }

            if (scalar(@split_values) > 1) {
                $attributes{$name} = \@split_values;
            }
            else {
                $attributes{$name} = $split_values[0];
            }
        }
        $feature{'attribute'} = \%attributes;
    }

    return \%feature;
}

=head2 parse_next_sequence

    Arg [...]  : None
    Description: Returns a hashref in the format -
                 {
                   header   => scalar (the '>' line, '>' included),
                   sequence => scalar (all sequence lines concatenated),
                 }
                 Lines starting with '#' or '//' and blank lines are skipped.
                 Returns undef when no further sequence is available.
    Returntype : Hashref of a GFF3 sequence line

=cut

sub parse_next_sequence {
    my $self = shift;

    my $sequence;
    my $header;

    while (defined(my $next_line = $self->_read_line())) {

        next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
                 $next_line =~ /^\/\//);

        if ($next_line =~ /^>/) {
            if (defined($header)) {
                #next fasta header encountered: keep it for the next call
                $self->{'next_fasta_header'} = $next_line;
                last;
            }
            $header = $next_line;
        }
        else {
            $sequence .= $next_line;
        }
    }

    return undef unless (defined($sequence) || defined($header));

    my %sequence = (header => $header, sequence => $sequence);

    return \%sequence;
}

# Returns the next non-empty line with its trailing newline removed, or
# undef at end of input.  A line kept back by parse_header or
# parse_next_sequence is returned before anything new is read.
sub _read_line {
    my $self = shift;

    if (defined($self->{'first_non_header_line'})) {
        my $line = $self->{'first_non_header_line'};
        $self->{'first_non_header_line'} = undef;
        return $line;
    }

    if (defined($self->{'next_fasta_header'})) {
        my $line = $self->{'next_fasta_header'};
        $self->{'next_fasta_header'} = undef;
        return $line;
    }

    #the handle is dropped by close(); treat that as end of input
    my $fh = $self->{'filehandle'};
    return unless defined($fh);

    # Loop rather than recurse over empty lines, and test the length so
    # that a line consisting of "0" is not mistaken for an empty one.
    while (defined(my $line = <$fh>)) {
        chomp $line;
        return $line if length($line);
    }

    return;
}

=head2 close

    Arg [...]  : None
    Description: Drops the parser's reference to the file handle.  The
                 handle itself is not closed here.
    Returntype : None

=cut

sub close {
    my $self = shift;
    $self->{"filehandle"} = undef;
    return;
}

1;