Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/IO/GFFParser.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,328 @@
+=pod
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 NAME
+
+GFFParser - simple gff3 parser.
+
+
+=head1 AUTHOR
+
+Monika Komorowska, 2012 - monika@ebi.ac.uk
+
+=head1 SYNOPSIS
+
+use strict;
+use Bio::EnsEMBL::Utils::IO::GFFParser;
+use IO::File;
+use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
+my $file_name = "features.gff";
+my $fh = IO::File->new($file_name, 'r');
+my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
+
+my @header_lines = @{$parser->parse_header()};
+#do something with the header lines array, e.g. print array elements
+
+foreach my $header_line (@header_lines) {
+    print $header_line . "\n";
+}
+print "\n\n";
+my $feature = $parser->parse_next_feature();
+
+while (defined($feature) ) {
+
+    my %feature = %{$feature};
+
+    #do something with the feature, e.g. print hash keys and values 
+    foreach my $key (keys %feature) {
+	if ($key ne 'attribute') {
+	    print $key . " " . $feature{$key} ."\n";
+	} else {
+	    print $key . "\n";
+	    my %attribs =  %{$feature{$key}};
+	    foreach my $attrib_key (keys %attribs) {
+		printf("\t%s %s\n", $attrib_key, join(q{, }, @{wrap_array($attribs{$attrib_key})}));
+
+	    }
+	}
+    }
+    print "\n\n";
+    $feature = $parser->parse_next_feature();
+}
+
+my $sequence = $parser->parse_next_sequence();
+
+while (defined($sequence)) {
+    my %sequence = %{$sequence};
+
+    foreach my $key (keys %sequence) {      
+        print $key . " " . $sequence{$key} ."\n";
+    }
+    print "\n\n";   
+
+    $sequence = $parser->parse_next_sequence();
+}
+
+$parser->close();
+
+$fh->close();
+
+
+
+=head1 DESCRIPTION
+
+GFF3 format as defined in http://www.sequenceontology.org/gff3.shtml
+
+Use parse_header to read the GFF3 file header, parse_next_feature to read the next feature line, and parse_next_sequence to read the next FASTA record of the sequence section.
+
+This class can be extended to convert a feature hash into a feature object reversing
+the processing done by GFFSerializer.
+
+=cut
+
+package Bio::EnsEMBL::Utils::IO::GFFParser;
+use strict;
+use warnings;
+use Bio::EnsEMBL::Utils::Exception;
+use IO::File;
+use URI::Escape;
+use Bio::EnsEMBL::Utils::Scalar qw/wrap_array/;
+
+# GFF3 strand symbol => numeric strand; a symbol not listed here looks up as undef
+my %strand_conversion = ( '+' => '1', '?' => '0', '-' => '-1');
+
+=head2 new
+
+    Constructor
+    Arg [1]    : File handle to read GFF3 formatted input from
+    Description: Stores the handle in a new parser object; nothing is read yet
+    Returntype : Bio::EnsEMBL::Utils::IO::GFFParser
+    Exceptions : Calls throw() when no file handle is passed in
+
+=cut
+
+sub new {
+    my ($class, $fh) = @_;
+
+    # A parser without a handle is useless, so refuse to construct one.
+    if (!defined $fh) {
+        throw("GFFParser requires a valid filehandle to a GFF3 formatted file");
+    }
+
+    my $self = { filehandle => $fh };
+    return bless $self, $class;
+}
+
+=head2 parse_header
+
+    Args       : None (reads from the file handle passed to the constructor)
+    Description: Consumes the leading lines that start with '#' or whitespace
+                 and returns those starting with '##'. Reading stops at a
+                 line containing ##FASTA or at the first other line; that
+                 line is kept and handed out again by the next read. Calls
+                 warning() if a gff-version directive names a version other than 3.
+    Returntype : Arrayref of GFF3 file header lines
+
+=cut
+
+sub parse_header {
+    my $self = shift;
+
+    my @header_lines;
+    my $next_line;
+
+    # The class holds only '#' and whitespace. '|' is a literal inside [...],
+    # so listing it would silently swallow lines that start with '|'.
+    while (defined($next_line = $self->_read_line()) && $next_line =~ /^[\#\s]/) {
+        # ##FASTA opens the sequence section, so the header ends here
+        last if $next_line =~ /\#\#FASTA/;
+
+        # header lines start with ##; other skipped lines are not recorded
+        next unless $next_line =~ /^\#{2}/;
+        push @header_lines, $next_line;
+
+        if ($next_line =~ /gff-version\s+(\d+)/ && $1 != 3) {
+            warning("File has been formatted in GFF version $1. GFFParser may return unexpected results as it is designed to parse GFF3 formatted files.");
+        }
+    }
+
+    # Keep the line that ended the loop so the next read returns it.
+    $self->{'first_non_header_line'} = $next_line if defined $next_line;
+    return \@header_lines;
+}
+
+=head2 parse_next_feature
+
+    Args       : None (reads from the file handle passed to the constructor)
+    Description: Skips comment, blank and '//' lines and parses the next
+                 feature line. Returns undef once a line containing ##FASTA
+                 is read or the input ends. Result is a hashref in the format -
+                 {
+                   seqid => scalar,
+                   source => scalar,
+                   type => scalar,
+                   start => scalar,
+                   end => scalar,
+                   score => scalar,
+                   strand => scalar (1, 0 or -1; undef for any other symbol),
+                   phase => scalar,
+                   attribute => hashref (only if the attribute column is set),
+                 }
+                 An attribute value is a scalar, an arrayref for a comma
+                 separated list, or undef when the attribute has no value.
+    Returntype : Hashref of a GFF3 feature line
+
+=cut
+
+sub parse_next_feature {
+    my $self = shift;
+
+    my $feature_line;
+    while (defined(my $next_line = $self->_read_line())) {
+        # stop parsing features if ##FASTA directive
+        last if $next_line =~ /\#\#FASTA/;
+
+        next if ($next_line =~ /^\#/ || $next_line =~ /^\s*$/ ||
+                 $next_line =~ /^\/\//);
+
+        $feature_line = $next_line;
+        last;
+    }
+
+    return undef unless defined $feature_line;
+
+    # strip off trailing comments
+    # NOTE(review): this also cuts at a '#' inside a column value
+    $feature_line =~ s/\#.*//;
+
+    my @chunks = split(/\t/, $feature_line);
+
+    # A short line has no strand column; skip the lookup rather than
+    # index the conversion table with undef.
+    my $strand = defined $chunks[6] ? $strand_conversion{$chunks[6]} : undef;
+
+    my %feature = (
+        'seqid'  => uri_unescape($chunks[0]),
+        'source' => uri_unescape($chunks[1]),
+        'type'   => uri_unescape($chunks[2]),
+        'start'  => $chunks[3],
+        'end'    => $chunks[4],
+        'score'  => $chunks[5],
+        'strand' => $strand,
+        'phase'  => $chunks[7],
+    );
+
+    if ($chunks[8]) {
+        my %attributes;
+        foreach my $attribute (split(/;/, $chunks[8])) {
+            # ';;' or a trailing '; ' leaves a chunk with nothing in it
+            next if $attribute =~ /^\s*$/;
+
+            # Limit of 2: only the first '=' separates name from value, so
+            # a value that itself contains '=' is kept whole.
+            my ($name, $value) = split(/=/, $attribute, 2);
+            $name = uri_unescape($name);
+            # A name without '=value' gets no values and is stored as undef
+            $value = '' unless defined $value;
+            my @split_values = map { uri_unescape($_) } split(/,/, $value);
+            $attributes{$name} = @split_values > 1 ? \@split_values : $split_values[0];
+        }
+        $feature{'attribute'} = \%attributes;
+    }
+
+    return \%feature;
+}
+
+=head2 parse_next_sequence
+
+    Args       : None (reads from the file handle passed to the constructor)
+    Description: Reads the next FASTA record. The first '>' line becomes the
+                 header; every other line read up to the following '>' line
+                 or the end of input is appended to the sequence. Comment,
+                 blank and '//' lines are skipped. Returns undef when neither
+                 a header nor any sequence was collected. Result format -
+                 {
+                   header => scalar,
+                   sequence => scalar,
+                 }
+    Returntype : Hashref of a GFF3 sequence line
+
+=cut
+
+sub parse_next_sequence {
+    my $self = shift;
+
+    my ($header, $sequence);
+
+  LINE:
+    while (my $line = $self->_read_line()) {
+
+        next LINE if ($line =~ /^\#/ || $line =~ /^\s*$/ || $line =~ /^\/\//);
+
+        # anything that is not a '>' line is sequence data
+        if ($line !~ /^>/) {
+            $sequence .= $line;
+            next LINE;
+        }
+
+        if ($header) {
+            # a second header belongs to the following record: keep it
+            # for the next read and finish this record
+            $self->{'next_fasta_header'} = $line;
+            last LINE;
+        }
+
+        $header = $line;
+    }
+
+    return undef unless ($sequence || $header);
+    my %record = (header => $header, sequence => $sequence);
+    return \%record;
+}
+
+
+# Returns the next line to parse with its line ending chomped, or undef at
+# the end of input. A line pushed back by parse_header or parse_next_sequence
+# is handed out before the file is read again. Lines that are false after
+# chomp (empty, or just "0") are skipped.
+sub _read_line {
+    my $self = shift;
+
+    # Pushed-back lines are served first, in this order, and then cleared.
+    foreach my $slot ('first_non_header_line', 'next_fasta_header') {
+        next unless defined $self->{$slot};
+        my $pending = $self->{$slot};
+        $self->{$slot} = undef;
+        return $pending;
+    }
+
+    my $fh = $self->{'filehandle'};
+
+    # Iterate over blank lines instead of recursing once per blank line:
+    # a long blank run would otherwise deepen the call stack line by line
+    # and set off Perl's "Deep recursion" warning.
+    while (defined(my $line = <$fh>)) {
+        chomp $line;
+        return $line if $line;
+    }
+
+    return undef;
+}
+
+# Drops the parser's reference to the file handle. The handle itself is not
+# closed here (the SYNOPSIS closes it separately afterwards).
+sub close {
+    my ($self) = @_;
+    $self->{'filehandle'} = undef;
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children