Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/InferMotifs.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 NAME | |
4 | |
5 Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs | |
6 | |
7 =head1 DESCRIPTION | |
8 | |
9 'Funcgen::InferMotifs' | |
10 | |
11 =cut | |
12 | |
13 | |
14 package Bio::EnsEMBL::Funcgen::RunnableDB::InferMotifs; | |
15 | |
16 use warnings; | |
17 use strict; | |
18 | |
19 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Motif'); | |
20 | |
21 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); | |
22 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw (run_system_cmd); | |
23 use Data::Dumper; | |
24 | |
25 sub fetch_input { # nothing to fetch... just the parameters... | |
26 my $self = shift @_; | |
27 | |
28 $self->SUPER::fetch_input(); | |
29 | |
30 if(!$self->param('bin')){ throw "No bin defined!"; } | |
31 $self->_bin($self->param('bin')); | |
32 | |
33 return 1; | |
34 } | |
35 | |
36 sub run { | |
37 | |
38 my $self = shift @_; | |
39 | |
40 my $afa = $self->_efgdba()->get_AnnotatedFeatureAdaptor(); | |
41 #Order features descending by score | |
42 my @features = sort { $b->score <=> $a->score } @{$afa->fetch_all_by_FeatureSets( [ $self->_feature_set ] )}; | |
43 | |
44 # Print the sequences for this bin | |
45 my $sa = $self->_efgdba()->dnadb->get_SliceAdaptor(); | |
46 my $fasta_file = $self->_output_dir."/bin.".$self->_bin.".fasta"; | |
47 open(FO,">".$fasta_file) or throw "Couldn't open output file"; | |
48 my $bin_start = ($self->_bin - 1)*$self->_bin_size; | |
49 for(my $i=$bin_start; $i<($bin_start+$self->_bin_size); $i++){ | |
50 my $ft = $features[$i]; | |
51 my $sr_name = $ft->seq_region_name; | |
52 my $start = POSIX::floor($ft->summit - $self->_window_size); | |
53 my $end = POSIX::floor($ft->summit + $self->_window_size); | |
54 my $slice = $sa->fetch_by_region( undef, $sr_name, $start, $end); | |
55 if(defined($slice)){ | |
56 print FO ">".$sr_name.",".$start.",".$end."\n".$slice->get_repeatmasked_seq()->seq."\n"; | |
57 } else { | |
58 warn $sr_name.",".$start.",".$end." could not be found\n"; | |
59 } | |
60 } | |
61 close FO; | |
62 | |
63 #Create a background to try and avoid repetitive A's | |
64 #fasta-get-markov is part of the meme package (but is very simple to replicate if needed) | |
65 #Use a pre-compiled background unique for the whole of human... | |
66 #run_system_cmd("cat ".$fasta_file." | fasta-get-markov -m 3 > ".$fasta_file.".bg"); | |
67 | |
68 #Run MEME for this bin | |
69 my $species = $self->_species; | |
70 my $meme_file = $self->_output_dir."/bin.".$self->_bin.".meme"; | |
71 #Replace meme parameters by parameters from an analysis in the database... | |
72 #run_system_cmd("meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 0.00001 -nmotifs 10 -minsites 50 -minw 6 -maxw 20 -bfile ~/src/STAMP.v1.1/tests/${species}_masked.bg $fasta_file > ".$meme_file); | |
73 run_system_cmd($self->_bin_folder()."/meme.bin -nostatus -dna -text -revcomp -mod zoops -evt 1e-20 -nmotifs 5 -minsites 50 -minw 6 -maxw 20 $fasta_file > ".$meme_file); | |
74 | |
75 my $basename = $self->_feature_set->name.".bin_".$self->_bin; | |
76 if(_process_meme_to_STAMP($meme_file, $basename) == 0){ | |
77 warn "No motif found "; | |
78 } | |
79 | |
80 #Eliminate the temp files... | |
81 #run_system_cmd("rm -f ".$fasta_file); | |
82 #run_system_cmd("rm -f ".$meme_file); | |
83 | |
84 return 1; | |
85 } | |
86 | |
87 | |
88 sub write_output { # Nothing is written at this stage (for the moment) | |
89 | |
90 my $self = shift @_; | |
91 | |
92 return 1; | |
93 | |
94 } | |
95 | |
96 | |
97 sub _process_meme_to_STAMP { | |
98 my ($meme_file, $basename) = (shift, shift, shift); | |
99 | |
100 my $num_motifs = 0; | |
101 my $log_odds = 1; | |
102 open(FO,">".$meme_file.".tmp_TRANSFAC"); | |
103 open(FILE, $meme_file); | |
104 while(<FILE>){ | |
105 chomp; | |
106 if(/^MOTIF\s+.*E-value\s*=\s+(\S+)/) { $log_odds = $1; } | |
107 if(/^BL\s+MOTIF\s+(\d+)\s+width=(\d+)\s+seqs=(\d+)\s*$/){ | |
108 my $motif_num = $1; | |
109 my $motif_width = $2; | |
110 my $num_seqs = $3; | |
111 | |
112 | |
113 my @init = (0) x $motif_width; | |
114 my %matrix; | |
115 #Initialize with 0 | |
116 for (my $j=0; $j<$motif_width; $j++){ | |
117 $matrix{"A"}->[$j] = $matrix{"C"}->[$j] = $matrix{"G"}->[$j] = $matrix{"T"}->[$j] = 0; | |
118 } | |
119 | |
120 for (my $i=1;$i<=$num_seqs;$i++){ | |
121 my $line = <FILE>; | |
122 $line =~ /^\S+\s+\(\s*\d+\)\s+(\w+)\s+\d+\s*$/; | |
123 my @seq = split(//,uc($1)); | |
124 if(scalar(@seq) != $motif_width){ throw "Problems parsing motif file"; } | |
125 for (my $j=0; $j<$motif_width; $j++){ $matrix{$seq[$j]}->[$j]++ } | |
126 } | |
127 my $end = <FILE>; | |
128 if(! $end =~ /^\/\/\s*$/){ | |
129 throw "Error in the MEME file: expecting \/\/ after $num_seqs lines "; | |
130 } | |
131 print FO "DE ".$basename.".".$motif_num."\t".$log_odds."\n"; | |
132 for(my $i=0; $i<$motif_width; $i++){ | |
133 print FO $i."\t".$matrix{"A"}->[$i]."\t".$matrix{"C"}->[$i]."\t".$matrix{"G"}->[$i]."\t".$matrix{"T"}->[$i]."\n" | |
134 } | |
135 print FO "XX\n"; | |
136 | |
137 $num_motifs++; | |
138 | |
139 } | |
140 | |
141 } | |
142 close FILE; | |
143 close FO; | |
144 | |
145 return $num_motifs; | |
146 | |
147 } | |
148 | |
149 #Private getter / setter to the bin | |
150 sub _bin { | |
151 return $_[0]->_getter_setter('bin',$_[1]); | |
152 } | |
153 | |
154 1; |