diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ClusterMotifs.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/ClusterMotifs.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,233 @@
+=pod 
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs
+
+=head1 DESCRIPTION
+
+'ClusterMotifs' 
+
+=cut
+
+
+package Bio::EnsEMBL::Funcgen::RunnableDB::ClusterMotifs;
+
+use warnings;
+use strict;
+
+use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif');
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
+use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd);
+use Data::Dumper;
+
+sub fetch_input {   # nothing to fetch... just the parameters...
+  my $self = shift @_;
+
+  $self->SUPER::fetch_input();
+  
+  return 1;
+}
+
+sub run {   
+  my $self = shift @_;
+  my @matrices;
+
+  my $base = $self->_output_dir."/".$self->_feature_set->name;
+
+  my $motif_file = $base.".transfac";
+  #Maybe check if files are empty or not there at all!
+
+  #Need to clump all motif files together...
+  run_system_cmd("cat ".$self->_output_dir."/*.tmp_TRANSFAC > ".$motif_file);
+  
+  eval {
+    #RUN STAMP here to cluster all motifs... change the Jaspar directories!!
+    my $bin_folder = $self->_bin_folder();
+    run_system_cmd("${bin_folder}/STAMP -tf $motif_file -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -chp -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out $base");
+    
+    #TODO Refactor this section...
+    my $is_top = 0;
+    my $top_matrix; #Is the one with smallest p
+    my %matrix_scores;
+    my $min_p = 1;
+    open(FILE,$motif_file);
+    while(<FILE>){
+      my $line = $_;
+      if($line =~ /^DE\s+(\S+)\s+(\S+)/){ 
+	my $name = $1; my $p = $2;
+	$matrix_scores{$name} = $p; 
+	if($p < $min_p){
+	  $top_matrix = "";
+	  $is_top = 1;
+	  $min_p = $p;
+	  #import the top matrix...
+	  #Also check if it matches any known matrix...
+	  my $ff = $base."_match_pairs.txt";
+	  if(-e $ff){ 
+	    open(FIM,$ff);
+	    while(<FIM>){
+	      if(/^>\s*(\S+)\s+/){
+		if($1 eq $name){
+		  my $fline = <FIM>; chomp($fline); my ($jaspar, $score) = split(/\s+/, $fline);
+		  if($score < 0.005){ $line =~ s/\n/\t${jaspar}\n/; }
+		  last;
+		}
+	      }
+	    }
+	    close FIM;
+	  } else { warn $ff." was not found!"; }
+	}      
+      }
+      if($is_top){
+	$top_matrix .= $line;
+	if($line =~ /^XX/){ $is_top = 0; }
+      }
+      
+    }
+    close FILE;
+    if($top_matrix){
+      push @matrices, _transfac_to_jaspar($top_matrix);
+    } else { warn "No top matrix found!!"; }
+    
+    #Just check what are the most similar matrices to the clusters...
+    run_system_cmd("${bin_folder}/STAMP -tf ${base}_tree_clusters.txt -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out ${base}_global");
+        
+    #Recluster based on matrix similarity...
+    push @matrices, @{_recluster_motifs($self->_feature_set->name, $self->_bin_folder(),$base,\%matrix_scores)};
+  };
+  if ($@){ warn "No motif found: ".$@; }
+
+  #run_system_cmd("rm -f ".$self->_output_dir."/*.tmp_TRANSFAC");
+
+  $self->_matrix_to_store(\@matrices);
+
+  return 1;
+}
+
+
+sub write_output {  # Nothing is written at this stage (for the moment)
+  my $self = shift @_;
+
+  #Store the final matrices obtained...
+  open(FO,">".$self->_output_dir."/".$self->_feature_set->name.".final");
+  print FO join("\n",@{$self->_matrix_to_store()});
+  close FO;
+
+  return 1;
+}
+
+#TODO Need refactoring and optimization...
+sub _recluster_motifs {
+  my ($fset, $bin_folder, $base, $scores) = (shift, shift, shift, shift);
+  my %clusters;
+  my @result_matrix = ();
+
+  open(FILE,$base."_tree_clusters.txt");
+  while(<FILE>){
+    if(/^DE\s+(\S+)\s*$/){
+      #try to obtain a score here!...
+      my $cluster_id = $1;
+      my $matrix = $_;
+      while(<FILE>){ $matrix .= $_; last if(/^XX/); }
+      my $cluster_members = <FILE>;
+      chomp($cluster_members);
+      $cluster_members =~ s/^XX\s+\Cluster_Members:\s+//;
+      $clusters{$cluster_id}{"matrix"}=$matrix;
+      push @{$clusters{$cluster_id}{"elements"}}, split(/\s+/,$cluster_members);
+    }
+  }
+  close FILE;
+
+  #recalculate scores
+  foreach my $cluster (keys %clusters){
+    my $score = 0;
+    map { $score += $scores->{$_}; } @{$clusters{$cluster}{"elements"}};
+    $score = $score / scalar(@{$clusters{$cluster}{"elements"}});
+    $clusters{$cluster}{"matrix"} =~ s/\n/\t${score}\n/;
+    $clusters{$cluster}{"score"} = $score;
+  }
+
+  
+  my %reclust;
+  open(FILE,$base."_global_match_pairs.txt");
+  while(<FILE>){
+    chomp;
+    if(/^>\s+(\S+)\s*$/){
+      my $clust_id = $1;
+      my $match = <FILE>;
+      chomp($match);
+      my ($jaspar, $score, undef, undef) = split(/\s/,$match);
+      #Make this an alpha-parameter?
+      if($score < 0.005){
+	#add the matrix in the first line...
+	$clusters{$clust_id}{"matrix"} =~ s/\n/\t${jaspar}\n/;
+	push @{$reclust{$jaspar}}, $clust_id;
+      }
+    }
+  }
+  close FILE;
+  
+  foreach my $jaspar (keys %reclust) { 
+    if(scalar(@{$reclust{$jaspar}})>1){
+      my $score = 0; 
+      map { $score += $clusters{$_}{"score"}; } @{$reclust{$jaspar}};
+      $score = $score / scalar(@{$reclust{$jaspar}});
+      my $clust_file = $base."_cluster_".$jaspar;
+      open(FO,">".$clust_file);
+      map { print FO $clusters{$_}{"matrix"}; } @{$reclust{$jaspar}};
+      close FO;
+      run_system_cmd("${bin_folder}/STAMP -tf $clust_file -align SWU -cc PCC -sd ${bin_folder}/STAMP_data/Jaspar2010_1000random_PCC_SWU.scores -chp -nooverlapalign -match ${bin_folder}/STAMP_data/Jaspar_Core_PBM_PolII.Transfac -match_top 1 -ma IR -out $clust_file");
+      my $matrix;
+      open(FILE,"${clust_file}FBP.txt");
+      #Re-add the averaged score here!
+      while(<FILE>){ 
+	if(/^DE/){
+	  $matrix = "DE ".$fset."_cluster_${jaspar}\t".$score."\t".$jaspar."\n";  
+	} else { $matrix .= $_;  }  
+      }
+      close FILE;
+      push @result_matrix, _transfac_to_jaspar($matrix);
+      map { delete $clusters{$_}; } @{$reclust{$jaspar}};
+    }
+  }
+  
+  map { push @result_matrix,  _transfac_to_jaspar($clusters{$_}{"matrix"}); } keys %clusters;
+  
+  return \@result_matrix;
+
+}
+
+
+#private function that transforms a transfac matrix to jaspar format
+sub _transfac_to_jaspar{
+  my ($transfac) = shift;
+  my $jaspar;
+
+  my @lines = split(/\n/,$transfac);
+  pop @lines; #XX
+  my $title = shift @lines;
+  $title =~ s/^DE/>/;
+  $jaspar = $title."\n";
+  my @as; my @cs; my @gs; my @ts;
+  foreach my $line (@lines){
+    my (undef,$a,$c,$g,$t,undef) = split(/\s+/,$line);
+    #convert it to integers if necessary to make it simpler after
+    if($a<=1 && $c<=1 && $g<=1 && $t<=1){ $a = int(100*$a); $c=int(100*$c); $g=int(100*$g); $t=int(100*$t); }
+    push @as, $a; push @cs, $c; push @gs, $g; push @ts, $t;
+  }
+  $jaspar .= "A [ ".join("\t",@as)." ]\n" ;
+  $jaspar .= "C [ ".join("\t",@cs)." ]\n" ;
+  $jaspar .= "G [ ".join("\t",@gs)." ]\n" ;
+  $jaspar .= "T [ ".join("\t",@ts)." ]\n" ;
+
+  return $jaspar;
+}
+
+#Private getter / setter to the matrices
+sub _matrix_to_store {
+  return $_[0]->_getter_setter('matrix_to_store',$_[1]);
+}
+
+1;