comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/InferMotifs.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 NAME
4
5 Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs
6
7 =head1 DESCRIPTION
8
9 'Funcgen::InferMotifs'
10
11 =cut
12
13
14 package Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs;
15
16 use warnings;
17 use strict;
18
19 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif');
20
21 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
22 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd);
23 use Data::Dumper;
24
25 sub fetch_input { # nothing to fetch... just the parameters...
26 my $self = shift @_;
27
28 $self->SUPER::fetch_input();
29
30 if(!$self->param('bin')){ throw "No bin defined!"; }
31 $self->_bin($self->param('bin'));
32
33 return 1;
34 }
35
36 sub run {
37
38 my $self = shift @_;
39
40 my $afa = $self->_efgdba()->get_AnnotatedFeatureAdaptor();
41 #Order features descending by score
42 my @features = sort { $b->score <=> $a->score } @{$afa->fetch_all_by_FeatureSets( [ $self->_feature_set ] )};
43
44 # Print the sequences for this bin
45 my $sa = $self->_efgdba()->dnadb->get_SliceAdaptor();
46 my $fasta_file = $self->_output_dir."/bin.".$self->_bin.".fasta";
47 open(FO,">".$fasta_file) or throw "Couldn't open output file";
48 my $bin_start = ($self->_bin - 1)*$self->_bin_size;
49 for(my $i=$bin_start; $i<($bin_start+$self->_bin_size); $i++){
50 my $ft = $features[$i];
51 my $sr_name = $ft->seq_region_name;
52 my $start = POSIX::floor($ft->summit - $self->_window_size);
53 my $end = POSIX::floor($ft->summit + $self->_window_size);
54 my $slice = $sa->fetch_by_region( undef, $sr_name, $start, $end);
55 if(defined($slice)){
56 print FO ">".$sr_name.",".$start.",".$end."\n".$slice->get_repeatmasked_seq()->seq."\n";
57 } else {
58 warn $sr_name.",".$start.",".$end." could not be found\n";
59 }
60 }
61 close FO;
62
63 #Create a background to try and avoid repetitive A's
64 #fasta-get-markov is part of the meme package (but is very simple to replicate if needed)
65 #Use a pre-compiled background unique for the whole of human...
66 #run_system_cmd("cat ".$fasta_file." | fasta-get-markov -m 3 > ".$fasta_file.".bg");
67
68 #Run MEME for this bin
69 my $species = $self->_species;
70 my $meme_file = $self->_output_dir."/bin.".$self->_bin.".meme";
71 #Replace meme parameters by parameters from an analysis in the database...
72 #run_system_cmd("meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 0.00001 -nmotifs 10 -minsites 50 -minw 6 -maxw 20 -bfile ~/src/STAMP.v1.1/tests/${species}_masked.bg $fasta_file > ".$meme_file);
73 run_system_cmd($self->_bin_folder()."/meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 1e-20 -nmotifs 5 -minsites 50 -minw 6 -maxw 20 $fasta_file > ".$meme_file);
74
75 my $basename = $self->_feature_set->name.".bin_".$self->_bin;
76 if(_process_meme_to_STAMP($meme_file, $basename) == 0){
77 warn "No motif found ";
78 }
79
80 #Eliminate the temp files...
81 #run_system_cmd("rm -f ".$fasta_file);
82 #run_system_cmd("rm -f ".$meme_file);
83
84 return 1;
85 }
86
87
88 sub write_output { # Nothing is written at this stage (for the moment)
89
90 my $self = shift @_;
91
92 return 1;
93
94 }
95
96
97 sub _process_meme_to_STAMP {
98 my ($meme_file, $basename) = (shift, shift, shift);
99
100 my $num_motifs = 0;
101 my $log_odds = 1;
102 open(FO,">".$meme_file.".tmp_TRANSFAC");
103 open(FILE, $meme_file);
104 while(<FILE>){
105 chomp;
106 if(/^MOTIF\s+.*E-value\s*=\s+(\S+)/) { $log_odds = $1; }
107 if(/^BL\s+MOTIF\s+(\d+)\s+width=(\d+)\s+seqs=(\d+)\s*$/){
108 my $motif_num = $1;
109 my $motif_width = $2;
110 my $num_seqs = $3;
111
112
113 my @init = (0) x $motif_width;
114 my %matrix;
115 #Initialize with 0
116 for (my $j=0; $j<$motif_width; $j++){
117 $matrix{"A"}->[$j] = $matrix{"C"}->[$j] = $matrix{"G"}->[$j] = $matrix{"T"}->[$j] = 0;
118 }
119
120 for (my $i=1;$i<=$num_seqs;$i++){
121 my $line = <FILE>;
122 $line =~ /^\S+\s+\(\s*\d+\)\s+(\w+)\s+\d+\s*$/;
123 my @seq = split(//,uc($1));
124 if(scalar(@seq) != $motif_width){ throw "Problems parsing motif file"; }
125 for (my $j=0; $j<$motif_width; $j++){ $matrix{$seq[$j]}->[$j]++ }
126 }
127 my $end = <FILE>;
128 if(! $end =~ /^\/\/\s*$/){
129 throw "Error in the MEME file: expecting \/\/ after $num_seqs lines ";
130 }
131 print FO "DE ".$basename.".".$motif_num."\t".$log_odds."\n";
132 for(my $i=0; $i<$motif_width; $i++){
133 print FO $i."\t".$matrix{"A"}->[$i]."\t".$matrix{"C"}->[$i]."\t".$matrix{"G"}->[$i]."\t".$matrix{"T"}->[$i]."\n"
134 }
135 print FO "XX\n";
136
137 $num_motifs++;
138
139 }
140
141 }
142 close FILE;
143 close FO;
144
145 return $num_motifs;
146
147 }
148
149 #Private getter / setter to the bin
150 sub _bin {
151 return $_[0]->_getter_setter('bin',$_[1]);
152 }
153
154 1;