Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/InferMotifs.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/InferMotifs.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,154 @@ +=pod + +=head1 NAME + +Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs + +=head1 DESCRIPTION + +'Funcgen::InferMotifs' + +=cut + + +package Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs; + +use warnings; +use strict; + +use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif'); + +use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); +use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); +use Data::Dumper; + +sub fetch_input { # nothing to fetch... just the parameters... + my $self = shift @_; + + $self->SUPER::fetch_input(); + + if(!$self->param('bin')){ throw "No bin defined!"; } + $self->_bin($self->param('bin')); + + return 1; +} + +sub run { + + my $self = shift @_; + + my $afa = $self->_efgdba()->get_AnnotatedFeatureAdaptor(); + #Order features descending by score + my @features = sort { $b->score <=> $a->score } @{$afa->fetch_all_by_FeatureSets( [ $self->_feature_set ] )}; + + # Print the sequences for this bin + my $sa = $self->_efgdba()->dnadb->get_SliceAdaptor(); + my $fasta_file = $self->_output_dir."/bin.".$self->_bin.".fasta"; + open(FO,">".$fasta_file) or throw "Couldn't open output file"; + my $bin_start = ($self->_bin - 1)*$self->_bin_size; + for(my $i=$bin_start; $i<($bin_start+$self->_bin_size); $i++){ + my $ft = $features[$i]; + my $sr_name = $ft->seq_region_name; + my $start = POSIX::floor($ft->summit - $self->_window_size); + my $end = POSIX::floor($ft->summit + $self->_window_size); + my $slice = $sa->fetch_by_region( undef, $sr_name, $start, $end); + if(defined($slice)){ + print FO ">".$sr_name.",".$start.",".$end."\n".$slice->get_repeatmasked_seq()->seq."\n"; + } else { + warn $sr_name.",".$start.",".$end." could not be found\n"; + } + } + close FO; + + #Create a background to try and avoid repetitive A's + #fasta-get-markov is part of the meme package (but is very simple to replicate if needed) + #Use a pre-compiled background unique for the whole of human... + #run_system_cmd("cat ".$fasta_file." | fasta-get-markov -m 3 > ".$fasta_file.".bg"); + + #Run MEME for this bin + my $species = $self->_species; + my $meme_file = $self->_output_dir."/bin.".$self->_bin.".meme"; + #Replace meme parameters by parameters from an analysis in the database... + #run_system_cmd("meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 0.00001 -nmotifs 10 -minsites 50 -minw 6 -maxw 20 -bfile ~/src/STAMP.v1.1/tests/${species}_masked.bg $fasta_file > ".$meme_file); + run_system_cmd($self->_bin_folder()."/meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 1e-20 -nmotifs 5 -minsites 50 -minw 6 -maxw 20 $fasta_file > ".$meme_file); + + my $basename = $self->_feature_set->name.".bin_".$self->_bin; + if(_process_meme_to_STAMP($meme_file, $basename) == 0){ + warn "No motif found "; + } + + #Eliminate the temp files... + #run_system_cmd("rm -f ".$fasta_file); + #run_system_cmd("rm -f ".$meme_file); + + return 1; +} + + +sub write_output { # Nothing is written at this stage (for the moment) + + my $self = shift @_; + + return 1; + +} + + +sub _process_meme_to_STAMP { + my ($meme_file, $basename) = (shift, shift, shift); + + my $num_motifs = 0; + my $log_odds = 1; + open(FO,">".$meme_file.".tmp_TRANSFAC"); + open(FILE, $meme_file); + while(<FILE>){ + chomp; + if(/^MOTIF\s+.*E-value\s*=\s+(\S+)/) { $log_odds = $1; } + if(/^BL\s+MOTIF\s+(\d+)\s+width=(\d+)\s+seqs=(\d+)\s*$/){ + my $motif_num = $1; + my $motif_width = $2; + my $num_seqs = $3; + + + my @init = (0) x $motif_width; + my %matrix; + #Initialize with 0 + for (my $j=0; $j<$motif_width; $j++){ + $matrix{"A"}->[$j] = $matrix{"C"}->[$j] = $matrix{"G"}->[$j] = $matrix{"T"}->[$j] = 0; + } + + for (my $i=1;$i<=$num_seqs;$i++){ + my $line = <FILE>; + $line =~ /^\S+\s+\(\s*\d+\)\s+(\w+)\s+\d+\s*$/; + my @seq = split(//,uc($1)); + if(scalar(@seq) != $motif_width){ throw "Problems parsing motif file"; } + for (my $j=0; $j<$motif_width; $j++){ $matrix{$seq[$j]}->[$j]++ } + } + my $end = <FILE>; + if(! $end =~ /^\/\/\s*$/){ + throw "Error in the MEME file: expecting \/\/ after $num_seqs lines "; + } + print FO "DE ".$basename.".".$motif_num."\t".$log_odds."\n"; + for(my $i=0; $i<$motif_width; $i++){ + print FO $i."\t".$matrix{"A"}->[$i]."\t".$matrix{"C"}->[$i]."\t".$matrix{"G"}->[$i]."\t".$matrix{"T"}->[$i]."\n" + } + print FO "XX\n"; + + $num_motifs++; + + } + + } + close FILE; + close FO; + + return $num_motifs; + +} + +#Private getter / setter to the bin +sub _bin { + return $_[0]->_getter_setter('bin',$_[1]); +} + +1;