Mercurial > repos > willmclaren > ensembl_vep
view variant_effect_predictor/Bio/EnsEMBL/Compara/GenomeMF.pm @ 1:d6778b5d8382 draft default tip
Deleted selected files
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:05:43 -0400 |
parents | 21066c0abaf5 |
children |
line wrap: on
line source
package Bio::EnsEMBL::Compara::GenomeMF; use strict; use warnings; use Bio::EnsEMBL::Utils::Argument; use Bio::EnsEMBL::Utils::Scalar qw(:assert); use Bio::EnsEMBL::Utils::IO::GFFParser; use JSON; use Bio::SeqIO; use FileHandle; use Data::Dumper; sub new { my ($class, @args) = @_; my ($filename, $index); if (scalar @args) { ($filename, $index) = rearrange([qw(FILENAME INDEX)], @args); } die unless defined $filename; die unless defined $index; return $class->all_from_file($filename)->[$index-1]; } sub all_from_file { my $self = shift; my $filename = shift; # Loads the file with JSON die "'filename' must be defined" unless defined $filename; die "Can't read from '$filename'" unless -r $filename; my $json_text = `cat $filename`; my $json_parser = JSON->new->relaxed; my $perl_array = $json_parser->decode($json_text); # List of fields that must / can be present my @obligatory_fields = qw(production_name taxonomy_id assembly genebuild prot_fasta cds_fasta); my $possible_fields = {map {$_ => 1} (@obligatory_fields, qw(gene_coord_gff))}; # Checks the integrity of the file my $i = 0; die "The first level structure in '$filename' must be an array" unless ref($perl_array) eq 'ARRAY'; foreach my $entry (@$perl_array) { die "The second level structures in '$filename' must be hashes" unless ref($entry) eq 'HASH'; map {die "'$_' must map to a scalar in the registry file '$filename'" if ref($entry->{$_})} keys %$entry; map {die "'$_' is not a registered key in the registry file '$filename'" unless exists $possible_fields->{$_}} keys %$entry; map {die "'$_' must be present in every entry of the registry file '$filename'" unless exists $entry->{$_}} @obligatory_fields; $entry->{'_registry_file'} = $filename; $entry->{'_registry_index'} = ++$i; bless $entry, $self; } #print Dumper($perl_array); return $perl_array; } sub get_gene_coordinates { my $self = shift; $self->_load_coordinates unless exists $self->{'_gene_coordinates'}; return $self->{'_gene_coordinates'} } sub get_cds_coordinates { my $self = shift; $self->_load_coordinates unless exists $self->{'_cds_coordinates'}; return $self->{'_cds_coordinates'} } sub _load_coordinates { my $self = shift; my %gene_coordinates = (); my %cds_coordinates = (); if (exists $self->{'gene_coord_gff'}) { my $fh = FileHandle->new; $fh->open("<".$self->{'gene_coord_gff'}); my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh); $parser->parse_header(); my $feature; while ($feature = $parser->parse_next_feature()) { my %feature = %{$feature}; #print Dumper($feature); $gene_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'mRNA'; $cds_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'match'; } } print scalar(keys %gene_coordinates), " gene coordinates\n"; print scalar(keys %cds_coordinates), " cds coordinates\n"; $self->{'_gene_coordinates'} = \%gene_coordinates; $self->{'_cds_coordinates'} = \%cds_coordinates; } sub get_cds_sequences { my $self = shift; $self->_load_sequences('cds') unless exists $self->{'_cds_seq'}; return $self->{'_cds_seq'}; } sub get_protein_sequences { my $self = shift; $self->_load_sequences('prot') unless exists $self->{'_prot_seq'}; return $self->{'_prot_seq'}; } sub _load_sequences { my $self = shift; my $type = shift; return unless exists $self->{"${type}_fasta"}; my $input_file = $self->{"${type}_fasta"}; die unless -e $input_file; my %sequence2hash = (); my $in_file = Bio::SeqIO->new(-file => $input_file , '-format' => 'Fasta'); while ( my $seq = $in_file->next_seq() ) { $sequence2hash{$seq->id} = $seq; } print scalar(keys %sequence2hash), " sequences of type $type\n"; if(!keys(%sequence2hash)){ die "Could not read fasta sequences from $input_file\n"; } $self->{"_${type}_seq"} = \%sequence2hash; } sub get_MetaContainer { my $self = shift; return $self; } our $AUTOLOAD; sub AUTOLOAD { my $self = shift; if ( $AUTOLOAD =~ m/::get_(\w+)$/ ) { return $self->{$1}; } } sub extract_assembly_name { my $self = shift; return $self->{'assembly'}; } sub locator { my $self = shift; return sprintf('%s/filename=%s;index=%d', ref($self), $self->{'_registry_file'}, $self->{'_registry_index'}); } 1;