annotate variant_effect_predictor/Bio/EnsEMBL/Compara/GenomeMF.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2 package Bio::EnsEMBL::Compara::GenomeMF;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 use warnings;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 use Bio::EnsEMBL::Utils::Argument;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 use Bio::EnsEMBL::Utils::Scalar qw(:assert);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 use Bio::EnsEMBL::Utils::IO::GFFParser;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 use JSON;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12 use Bio::SeqIO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 use FileHandle;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14 use Data::Dumper;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 sub new {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18 my ($class, @args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20 my ($filename, $index);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 if (scalar @args) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22 ($filename, $index) = rearrange([qw(FILENAME INDEX)], @args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 die unless defined $filename;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26 die unless defined $index;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 return $class->all_from_file($filename)->[$index-1];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 sub all_from_file {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 my $filename = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 # Loads the file with JSON
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 die "'filename' must be defined" unless defined $filename;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 die "Can't read from '$filename'" unless -r $filename;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 my $json_text = `cat $filename`;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 my $json_parser = JSON->new->relaxed;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40 my $perl_array = $json_parser->decode($json_text);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 # List of fields that must / can be present
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 my @obligatory_fields = qw(production_name taxonomy_id assembly genebuild prot_fasta cds_fasta);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 my $possible_fields = {map {$_ => 1} (@obligatory_fields, qw(gene_coord_gff))};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 # Checks the integrity of the file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 my $i = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 die "The first level structure in '$filename' must be an array" unless ref($perl_array) eq 'ARRAY';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49 foreach my $entry (@$perl_array) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 die "The second level structures in '$filename' must be hashes" unless ref($entry) eq 'HASH';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51 map {die "'$_' must map to a scalar in the registry file '$filename'" if ref($entry->{$_})} keys %$entry;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 map {die "'$_' is not a registered key in the registry file '$filename'" unless exists $possible_fields->{$_}} keys %$entry;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53 map {die "'$_' must be present in every entry of the registry file '$filename'" unless exists $entry->{$_}} @obligatory_fields;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 $entry->{'_registry_file'} = $filename;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55 $entry->{'_registry_index'} = ++$i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 bless $entry, $self;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 #print Dumper($perl_array);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59 return $perl_array;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62 sub get_gene_coordinates {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64 $self->_load_coordinates unless exists $self->{'_gene_coordinates'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 return $self->{'_gene_coordinates'}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 sub get_cds_coordinates {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 $self->_load_coordinates unless exists $self->{'_cds_coordinates'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71 return $self->{'_cds_coordinates'}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 sub _load_coordinates {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 my %gene_coordinates = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79 my %cds_coordinates = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80 if (exists $self->{'gene_coord_gff'}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81 my $fh = FileHandle->new;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 $fh->open("<".$self->{'gene_coord_gff'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84 $parser->parse_header();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 my $feature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 while ($feature = $parser->parse_next_feature()) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 my %feature = %{$feature};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88 #print Dumper($feature);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89 $gene_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'mRNA';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 $cds_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'match';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 print scalar(keys %gene_coordinates), " gene coordinates\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 print scalar(keys %cds_coordinates), " cds coordinates\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 $self->{'_gene_coordinates'} = \%gene_coordinates;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 $self->{'_cds_coordinates'} = \%cds_coordinates;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 sub get_cds_sequences {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 $self->_load_sequences('cds') unless exists $self->{'_cds_seq'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102 return $self->{'_cds_seq'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105 sub get_protein_sequences {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107 $self->_load_sequences('prot') unless exists $self->{'_prot_seq'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 return $self->{'_prot_seq'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111 sub _load_sequences {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113 my $type = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115 return unless exists $self->{"${type}_fasta"};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116 my $input_file = $self->{"${type}_fasta"};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117 die unless -e $input_file;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119 my %sequence2hash = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 my $in_file = Bio::SeqIO->new(-file => $input_file , '-format' => 'Fasta');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 while ( my $seq = $in_file->next_seq() ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123 $sequence2hash{$seq->id} = $seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126 print scalar(keys %sequence2hash), " sequences of type $type\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127 if(!keys(%sequence2hash)){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128 die "Could not read fasta sequences from $input_file\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130 $self->{"_${type}_seq"} = \%sequence2hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133 sub get_MetaContainer {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 return $self;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138 our $AUTOLOAD;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 sub AUTOLOAD {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 if ( $AUTOLOAD =~ m/::get_(\w+)$/ ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 return $self->{$1};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 sub extract_assembly_name {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 return $self->{'assembly'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152 sub locator {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154 return sprintf('%s/filename=%s;index=%d', ref($self), $self->{'_registry_file'}, $self->{'_registry_index'});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159