0
|
1 =pod
|
|
2
|
|
3 =head1 NAME
|
|
4
|
|
5 Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs
|
|
6
|
|
7 =head1 DESCRIPTION
|
|
8
|
|
9 'ClusterMotifs'
|
|
10
|
|
11 =cut
|
|
12
|
|
13
|
|
14 package Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs;
|
|
15
|
|
16 use warnings;
|
|
17 use strict;
|
|
18
|
|
19 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif');
|
|
20
|
|
21 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
|
|
22 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd);
|
|
23 use Data::Dumper;
|
|
24
|
|
25 sub fetch_input { # nothing to fetch... just the parameters...
|
|
26 my $self = shift @_;
|
|
27
|
|
28 $self->SUPER::fetch_input();
|
|
29
|
|
30 return 1;
|
|
31 }
|
|
32
|
|
33 sub run {
|
|
34 my $self = shift @_;
|
|
35 my @matrices;
|
|
36
|
|
37 my $base = $self->_output_dir."/".$self->_feature_set->name;
|
|
38
|
|
39 my $motif_file = $base.".transfac";
|
|
40 #Maybe check if files are empty or not there at all!
|
|
41
|
|
42 #Need to clump all motif files together...
|
|
43 run_system_cmd("cat ".$self->_output_dir."/*.tmp_TRANSFAC > ".$motif_file);
|
|
44
|
|
45 eval {
|
|
46 #RUN STAMP here to cluster all motifs... change the Jaspar directories!!
|
|
47 my $bin_folder = $self->_bin_folder();
|
|
48 run_system_cmd("${bin_folder}/STAMP -tf $motif_file -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -chp -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out $base");
|
|
49
|
|
50 #TODO Refactor this section...
|
|
51 my $is_top = 0;
|
|
52 my $top_matrix; #Is the one with smallest p
|
|
53 my %matrix_scores;
|
|
54 my $min_p = 1;
|
|
55 open(FILE,$motif_file);
|
|
56 while(<FILE>){
|
|
57 my $line = $_;
|
|
58 if($line =~ /^DE\s+(\S+)\s+(\S+)/){
|
|
59 my $name = $1; my $p = $2;
|
|
60 $matrix_scores{$name} = $p;
|
|
61 if($p < $min_p){
|
|
62 $top_matrix = "";
|
|
63 $is_top = 1;
|
|
64 $min_p = $p;
|
|
65 #import the top matrix...
|
|
66 #Also check if it matches any known matrix...
|
|
67 my $ff = $base."_match_pairs.txt";
|
|
68 if(-e $ff){
|
|
69 open(FIM,$ff);
|
|
70 while(<FIM>){
|
|
71 if(/^>\s*(\S+)\s+/){
|
|
72 if($1 eq $name){
|
|
73 my $fline = <FIM>; chomp($fline); my ($jaspar, $score) = split(/\s+/, $fline);
|
|
74 if($score < 0.005){ $line =~ s/\n/\t${jaspar}\n/; }
|
|
75 last;
|
|
76 }
|
|
77 }
|
|
78 }
|
|
79 close FIM;
|
|
80 } else { warn $ff." was not found!"; }
|
|
81 }
|
|
82 }
|
|
83 if($is_top){
|
|
84 $top_matrix .= $line;
|
|
85 if($line =~ /^XX/){ $is_top = 0; }
|
|
86 }
|
|
87
|
|
88 }
|
|
89 close FILE;
|
|
90 if($top_matrix){
|
|
91 push @matrices, _transfac_to_jaspar($top_matrix);
|
|
92 } else { warn "No top matrix found!!"; }
|
|
93
|
|
94 #Just check what are the most similar matrices to the clusters...
|
|
95 run_system_cmd("${bin_folder}/STAMP -tf ${base}_tree_clusters.txt -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out ${base}_global");
|
|
96
|
|
97 #Recluster based on matrix similarity...
|
|
98 push @matrices, @{_recluster_motifs($self->_feature_set->name, $self->_bin_folder(),$base,\%matrix_scores)};
|
|
99 };
|
|
100 if ($@){ warn "No motif found: ".$@; }
|
|
101
|
|
102 #run_system_cmd("rm -f ".$self->_output_dir."/*.tmp_TRANSFAC");
|
|
103
|
|
104 $self->_matrix_to_store(\@matrices);
|
|
105
|
|
106 return 1;
|
|
107 }
|
|
108
|
|
109
|
|
110 sub write_output { # Nothing is written at this stage (for the moment)
|
|
111 my $self = shift @_;
|
|
112
|
|
113 #Store the final matrices obtained...
|
|
114 open(FO,">".$self->_output_dir."/".$self->_feature_set->name.".final");
|
|
115 print FO join("\n",@{$self->_matrix_to_store()});
|
|
116 close FO;
|
|
117
|
|
118 return 1;
|
|
119 }
|
|
120
|
|
121 #TODO Need refactoring and optimization...
|
|
122 sub _recluster_motifs {
|
|
123 my ($fset, $bin_folder, $base, $scores) = (shift, shift, shift, shift);
|
|
124 my %clusters;
|
|
125 my @result_matrix = ();
|
|
126
|
|
127 open(FILE,$base."_tree_clusters.txt");
|
|
128 while(<FILE>){
|
|
129 if(/^DE\s+(\S+)\s*$/){
|
|
130 #try to obtain a score here!...
|
|
131 my $cluster_id = $1;
|
|
132 my $matrix = $_;
|
|
133 while(<FILE>){ $matrix .= $_; last if(/^XX/); }
|
|
134 my $cluster_members = <FILE>;
|
|
135 chomp($cluster_members);
|
|
136 $cluster_members =~ s/^XX\s+\Cluster_Members:\s+//;
|
|
137 $clusters{$cluster_id}{"matrix"}=$matrix;
|
|
138 push @{$clusters{$cluster_id}{"elements"}}, split(/\s+/,$cluster_members);
|
|
139 }
|
|
140 }
|
|
141 close FILE;
|
|
142
|
|
143 #recalculate scores
|
|
144 foreach my $cluster (keys %clusters){
|
|
145 my $score = 0;
|
|
146 map { $score += $scores->{$_}; } @{$clusters{$cluster}{"elements"}};
|
|
147 $score = $score / scalar(@{$clusters{$cluster}{"elements"}});
|
|
148 $clusters{$cluster}{"matrix"} =~ s/\n/\t${score}\n/;
|
|
149 $clusters{$cluster}{"score"} = $score;
|
|
150 }
|
|
151
|
|
152
|
|
153 my %reclust;
|
|
154 open(FILE,$base."_global_match_pairs.txt");
|
|
155 while(<FILE>){
|
|
156 chomp;
|
|
157 if(/^>\s+(\S+)\s*$/){
|
|
158 my $clust_id = $1;
|
|
159 my $match = <FILE>;
|
|
160 chomp($match);
|
|
161 my ($jaspar, $score, undef, undef) = split(/\s/,$match);
|
|
162 #Make this an alpha-parameter?
|
|
163 if($score < 0.005){
|
|
164 #add the matrix in the first line...
|
|
165 $clusters{$clust_id}{"matrix"} =~ s/\n/\t${jaspar}\n/;
|
|
166 push @{$reclust{$jaspar}}, $clust_id;
|
|
167 }
|
|
168 }
|
|
169 }
|
|
170 close FILE;
|
|
171
|
|
172 foreach my $jaspar (keys %reclust) {
|
|
173 if(scalar(@{$reclust{$jaspar}})>1){
|
|
174 my $score = 0;
|
|
175 map { $score += $clusters{$_}{"score"}; } @{$reclust{$jaspar}};
|
|
176 $score = $score / scalar(@{$reclust{$jaspar}});
|
|
177 my $clust_file = $base."_cluster_".$jaspar;
|
|
178 open(FO,">".$clust_file);
|
|
179 map { print FO $clusters{$_}{"matrix"}; } @{$reclust{$jaspar}};
|
|
180 close FO;
|
|
181 run_system_cmd("${bin_folder}/STAMP -tf $clust_file -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -chp -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out $clust_file");
|
|
182 my $matrix;
|
|
183 open(FILE,"${clust_file}FBP.txt");
|
|
184 #Re-add the averaged score here!
|
|
185 while(<FILE>){
|
|
186 if(/^DE/){
|
|
187 $matrix = "DE ".$fset."_cluster_${jaspar}\t".$score."\t".$jaspar."\n";
|
|
188 } else { $matrix .= $_; }
|
|
189 }
|
|
190 close FILE;
|
|
191 push @result_matrix, _transfac_to_jaspar($matrix);
|
|
192 map { delete $clusters{$_}; } @{$reclust{$jaspar}};
|
|
193 }
|
|
194 }
|
|
195
|
|
196 map { push @result_matrix, _transfac_to_jaspar($clusters{$_}{"matrix"}); } keys %clusters;
|
|
197
|
|
198 return \@result_matrix;
|
|
199
|
|
200 }
|
|
201
|
|
202
|
|
203 #private function that transforms a transfac matrix to jaspar format
|
|
204 sub _transfac_to_jaspar{
|
|
205 my ($transfac) = shift;
|
|
206 my $jaspar;
|
|
207
|
|
208 my @lines = split(/\n/,$transfac);
|
|
209 pop @lines; #XX
|
|
210 my $title = shift @lines;
|
|
211 $title =~ s/^DE/>/;
|
|
212 $jaspar = $title."\n";
|
|
213 my @as; my @cs; my @gs; my @ts;
|
|
214 foreach my $line (@lines){
|
|
215 my (undef,$a,$c,$g,$t,undef) = split(/\s+/,$line);
|
|
216 #convert it to integers if necessary to make it simpler after
|
|
217 if($a<=1 && $c<=1 && $g<=1 && $t<=1){ $a = int(100*$a); $c=int(100*$c); $g=int(100*$g); $t=int(100*$t); }
|
|
218 push @as, $a; push @cs, $c; push @gs, $g; push @ts, $t;
|
|
219 }
|
|
220 $jaspar .= "A [ ".join("\t",@as)." ]\n" ;
|
|
221 $jaspar .= "C [ ".join("\t",@cs)." ]\n" ;
|
|
222 $jaspar .= "G [ ".join("\t",@gs)." ]\n" ;
|
|
223 $jaspar .= "T [ ".join("\t",@ts)." ]\n" ;
|
|
224
|
|
225 return $jaspar;
|
|
226 }
|
|
227
|
|
228 #Private getter / setter to the matrices
|
|
229 sub _matrix_to_store {
|
|
230 return $_[0]->_getter_setter('matrix_to_store',$_[1]);
|
|
231 }
|
|
232
|
|
233 1;
|