0
|
1
|
|
2 package Bio::EnsEMBL::Compara::GenomeMF;
|
|
3
|
|
4 use strict;
|
|
5 use warnings;
|
|
6
|
|
7 use Bio::EnsEMBL::Utils::Argument;
|
|
8 use Bio::EnsEMBL::Utils::Scalar qw(:assert);
|
|
9 use Bio::EnsEMBL::Utils::IO::GFFParser;
|
|
10
|
|
11 use JSON;
|
|
12 use Bio::SeqIO;
|
|
13 use FileHandle;
|
|
14 use Data::Dumper;
|
|
15
|
|
16
|
|
17 sub new {
|
|
18 my ($class, @args) = @_;
|
|
19
|
|
20 my ($filename, $index);
|
|
21 if (scalar @args) {
|
|
22 ($filename, $index) = rearrange([qw(FILENAME INDEX)], @args);
|
|
23 }
|
|
24
|
|
25 die unless defined $filename;
|
|
26 die unless defined $index;
|
|
27
|
|
28 return $class->all_from_file($filename)->[$index-1];
|
|
29 }
|
|
30
|
|
31 sub all_from_file {
|
|
32 my $self = shift;
|
|
33 my $filename = shift;
|
|
34
|
|
35 # Loads the file with JSON
|
|
36 die "'filename' must be defined" unless defined $filename;
|
|
37 die "Can't read from '$filename'" unless -r $filename;
|
|
38 my $json_text = `cat $filename`;
|
|
39 my $json_parser = JSON->new->relaxed;
|
|
40 my $perl_array = $json_parser->decode($json_text);
|
|
41
|
|
42 # List of fields that must / can be present
|
|
43 my @obligatory_fields = qw(production_name taxonomy_id assembly genebuild prot_fasta cds_fasta);
|
|
44 my $possible_fields = {map {$_ => 1} (@obligatory_fields, qw(gene_coord_gff))};
|
|
45
|
|
46 # Checks the integrity of the file
|
|
47 my $i = 0;
|
|
48 die "The first level structure in '$filename' must be an array" unless ref($perl_array) eq 'ARRAY';
|
|
49 foreach my $entry (@$perl_array) {
|
|
50 die "The second level structures in '$filename' must be hashes" unless ref($entry) eq 'HASH';
|
|
51 map {die "'$_' must map to a scalar in the registry file '$filename'" if ref($entry->{$_})} keys %$entry;
|
|
52 map {die "'$_' is not a registered key in the registry file '$filename'" unless exists $possible_fields->{$_}} keys %$entry;
|
|
53 map {die "'$_' must be present in every entry of the registry file '$filename'" unless exists $entry->{$_}} @obligatory_fields;
|
|
54 $entry->{'_registry_file'} = $filename;
|
|
55 $entry->{'_registry_index'} = ++$i;
|
|
56 bless $entry, $self;
|
|
57 }
|
|
58 #print Dumper($perl_array);
|
|
59 return $perl_array;
|
|
60 }
|
|
61
|
|
62 sub get_gene_coordinates {
|
|
63 my $self = shift;
|
|
64 $self->_load_coordinates unless exists $self->{'_gene_coordinates'};
|
|
65 return $self->{'_gene_coordinates'}
|
|
66 }
|
|
67
|
|
68 sub get_cds_coordinates {
|
|
69 my $self = shift;
|
|
70 $self->_load_coordinates unless exists $self->{'_cds_coordinates'};
|
|
71 return $self->{'_cds_coordinates'}
|
|
72 }
|
|
73
|
|
74 sub _load_coordinates {
|
|
75
|
|
76 my $self = shift;
|
|
77
|
|
78 my %gene_coordinates = ();
|
|
79 my %cds_coordinates = ();
|
|
80 if (exists $self->{'gene_coord_gff'}) {
|
|
81 my $fh = FileHandle->new;
|
|
82 $fh->open("<".$self->{'gene_coord_gff'});
|
|
83 my $parser = Bio::EnsEMBL::Utils::IO::GFFParser->new($fh);
|
|
84 $parser->parse_header();
|
|
85 my $feature;
|
|
86 while ($feature = $parser->parse_next_feature()) {
|
|
87 my %feature = %{$feature};
|
|
88 #print Dumper($feature);
|
|
89 $gene_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'mRNA';
|
|
90 $cds_coordinates{ ${$feature{attribute}}{Name} } = [map {$feature{$_}} qw(seqid start end strand)] if $feature{type} eq 'match';
|
|
91 }
|
|
92 }
|
|
93 print scalar(keys %gene_coordinates), " gene coordinates\n";
|
|
94 print scalar(keys %cds_coordinates), " cds coordinates\n";
|
|
95 $self->{'_gene_coordinates'} = \%gene_coordinates;
|
|
96 $self->{'_cds_coordinates'} = \%cds_coordinates;
|
|
97 }
|
|
98
|
|
99 sub get_cds_sequences {
|
|
100 my $self = shift;
|
|
101 $self->_load_sequences('cds') unless exists $self->{'_cds_seq'};
|
|
102 return $self->{'_cds_seq'};
|
|
103 }
|
|
104
|
|
105 sub get_protein_sequences {
|
|
106 my $self = shift;
|
|
107 $self->_load_sequences('prot') unless exists $self->{'_prot_seq'};
|
|
108 return $self->{'_prot_seq'};
|
|
109 }
|
|
110
|
|
111 sub _load_sequences {
|
|
112 my $self = shift;
|
|
113 my $type = shift;
|
|
114
|
|
115 return unless exists $self->{"${type}_fasta"};
|
|
116 my $input_file = $self->{"${type}_fasta"};
|
|
117 die unless -e $input_file;
|
|
118
|
|
119 my %sequence2hash = ();
|
|
120
|
|
121 my $in_file = Bio::SeqIO->new(-file => $input_file , '-format' => 'Fasta');
|
|
122 while ( my $seq = $in_file->next_seq() ) {
|
|
123 $sequence2hash{$seq->id} = $seq;
|
|
124 }
|
|
125
|
|
126 print scalar(keys %sequence2hash), " sequences of type $type\n";
|
|
127 if(!keys(%sequence2hash)){
|
|
128 die "Could not read fasta sequences from $input_file\n";
|
|
129 }
|
|
130 $self->{"_${type}_seq"} = \%sequence2hash;
|
|
131 }
|
|
132
|
|
133 sub get_MetaContainer {
|
|
134 my $self = shift;
|
|
135 return $self;
|
|
136 }
|
|
137
|
|
138 our $AUTOLOAD;
|
|
139
|
|
140 sub AUTOLOAD {
|
|
141 my $self = shift;
|
|
142 if ( $AUTOLOAD =~ m/::get_(\w+)$/ ) {
|
|
143 return $self->{$1};
|
|
144 }
|
|
145 }
|
|
146
|
|
147 sub extract_assembly_name {
|
|
148 my $self = shift;
|
|
149 return $self->{'assembly'};
|
|
150 }
|
|
151
|
|
152 sub locator {
|
|
153 my $self = shift;
|
|
154 return sprintf('%s/filename=%s;index=%d', ref($self), $self->{'_registry_file'}, $self->{'_registry_index'});
|
|
155 }
|
|
156
|
|
157
|
|
158 1;
|
|
159
|