diff variant_effect_predictor/Bio/EnsEMBL/Compara/GenomeMF.pm @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Compara/GenomeMF.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,159 @@
+
+package Bio::EnsEMBL::Compara::GenomeMF;
+
+use strict;
+use warnings;
+
+use Bio::EnsEMBL::Utils::Argument;
+use Bio::EnsEMBL::Utils::Scalar qw(:assert);
+use Bio::EnsEMBL::Utils::IO::GFFParser;
+
+use JSON;
+use Bio::SeqIO;
+use FileHandle;
+use Data::Dumper;
+
+
+sub new {
+    my ($class, @args) = @_;
+
+    my ($filename, $index);
+    if (scalar @args) {
+        ($filename, $index) = rearrange([qw(FILENAME INDEX)], @args);
+    }
+
+    die unless defined $filename;
+    die unless defined $index;
+
+    return $class->all_from_file($filename)->[$index-1];
+}
+
+sub all_from_file {
+    my $self = shift;
+    my $filename = shift;
+
+    # Loads the file with JSON
+    die "'filename' must be defined" unless defined $filename;
+    die "Can't read from '$filename'" unless -r $filename;
+    my $json_text   = `cat $filename`;
+    my $json_parser = JSON->new->relaxed;
+    my $perl_array   = $json_parser->decode($json_text);
+
+    # List of fields that must / can be present
+    my @obligatory_fields = qw(production_name taxonomy_id assembly genebuild prot_fasta cds_fasta);
+    my $possible_fields = {map {$_ => 1} (@obligatory_fields, qw(gene_coord_gff))};
+
+    # Checks the integrity of the file
+    my $i = 0;
+    die "The first level structure in '$filename' must be an array" unless ref($perl_array) eq 'ARRAY';
+    foreach my $entry (@$perl_array) {
+        die "The second level structures in '$filename' must be hashes" unless ref($entry) eq 'HASH';
+        map {die "'$_' must map to a scalar in the registry file '$filename'" if ref($entry->{$_})} keys %$entry;
+        map {die "'$_' is not a registered key in the registry file '$filename'" unless exists $possible_fields->{$_}} keys %$entry;
+        map {die "'$_' must be present in every entry of the registry file '$filename'" unless exists $entry->{$_}} @obligatory_fields;
+        $entry->{'_registry_file'} = $filename;
+        $entry->{'_registry_index'} = ++$i;
+        bless $entry, $self;
+    }
+    #print Dumper($perl_array);
+    return $perl_array;
+}
+
+sub get_gene_coordinates {
+    my $self = shift;
+    $self->_load_coordinates unless exists $self->{'_gene_coordinates'};
+    return $self->{'_gene_coordinates'}
+}
+
+sub get_cds_coordinates {
+    my $self = shift;
+    $self->_load_coordinates unless exists $self->{'_cds_coordinates'};
+    return $self->{'_cds_coordinates'}
+}
+
+sub _load_coordinates {
+
+    my $self = shift;
+
+    my %gene_coordinates = ();
+    my %cds_coordinates = ();
+    if (exists $self->{'gene_coord_gff'}) {
+        my $fh = FileHandle->new;
+        $fh->open("<".$self->{'gene_coord_gff'});
+        my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
+        $parser->parse_header();
+        my $feature;
+        while ($feature = $parser->parse_next_feature()) {
+            my %feature = %{$feature};
+            #print Dumper($feature);
+            $gene_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'mRNA';
+            $cds_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'match';
+        }
+    }
+    print scalar(keys %gene_coordinates), " gene coordinates\n";
+    print scalar(keys %cds_coordinates), " cds coordinates\n";
+    $self->{'_gene_coordinates'} = \%gene_coordinates;
+    $self->{'_cds_coordinates'} = \%cds_coordinates;
+}
+
+sub get_cds_sequences {
+    my $self = shift;
+    $self->_load_sequences('cds') unless exists $self->{'_cds_seq'};
+    return $self->{'_cds_seq'};
+}
+
+sub get_protein_sequences {
+    my $self = shift;
+    $self->_load_sequences('prot') unless exists $self->{'_prot_seq'};
+    return $self->{'_prot_seq'};
+}
+
+sub _load_sequences {
+    my $self = shift;
+    my $type = shift;
+
+    return unless exists $self->{"${type}_fasta"};
+    my $input_file = $self->{"${type}_fasta"};
+    die unless -e $input_file;
+
+    my %sequence2hash = ();
+
+    my $in_file  = Bio::SeqIO->new(-file => $input_file , '-format' => 'Fasta');
+    while ( my $seq = $in_file->next_seq() ) {
+        $sequence2hash{$seq->id} = $seq;
+    }
+
+    print scalar(keys %sequence2hash), " sequences of type $type\n";
+    if(!keys(%sequence2hash)){
+        die "Could not read fasta sequences from $input_file\n";
+    }
+    $self->{"_${type}_seq"} = \%sequence2hash;
+}
+
+sub get_MetaContainer {
+    my $self = shift;
+    return $self;
+}
+
+our $AUTOLOAD;
+
+sub AUTOLOAD {
+    my $self = shift;
+    if ( $AUTOLOAD =~ m/::get_(\w+)$/ ) {
+        return $self->{$1};
+    }
+}
+
+sub extract_assembly_name {
+    my $self = shift;
+    return $self->{'assembly'};
+}
+
+sub locator {
+    my $self = shift;
+    return sprintf('%s/filename=%s;index=%d', ref($self), $self->{'_registry_file'}, $self->{'_registry_index'});
+}
+
+
+1;
+