0
|
1 =pod
|
|
2
|
|
3 =head1 NAME
|
|
4
|
|
5 Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile
|
|
6
|
|
7 =head1 DESCRIPTION
|
|
8
|
|
9 =cut
|
|
10
|
|
11
|
|
12 package Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile;
|
|
13
|
|
14 use warnings;
|
|
15 use strict;
|
|
16 use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor;
|
|
17 use base ('Bio::EnsEMBL::Hive::ProcessWithParams');
|
|
18
|
|
19 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
|
|
20 use Data::Dumper;
|
|
21
|
|
22 sub fetch_input { # nothing to fetch... just the parameters...
|
|
23 my $self = shift @_;
|
|
24
|
|
25 $self->SUPER::fetch_input();
|
|
26
|
|
27 my $work_dir = $self->param('work_dir');
|
|
28 my $matrix = $self->param('matrix');
|
|
29 my $dnase = $self->param('dnase');
|
|
30 #throw "Dnase file $dnase does not exist in $work_dir" if(! -e $work_dir."/".$dnase);
|
|
31 throw "Matrix file $matrix does not exist in $work_dir" if(! -e $work_dir."/matches/".$matrix);
|
|
32 my $is_male = $self->param('is_male');
|
|
33 throw "Need to define is_male" if(!defined($is_male));
|
|
34 #throw "mapability.bedGraph file expected in $work_dir" if(! -e $work_dir."/mapability.bedGraph");
|
|
35
|
|
36 return 1;
|
|
37 }
|
|
38
|
|
39 sub run { # Create Groups and Analysis and stuff...
|
|
40 my $self = shift @_;
|
|
41
|
|
42 my $hlen = 100;
|
|
43 my $work_dir = $self->param('work_dir');
|
|
44 my $matrix = $self->param('matrix');
|
|
45 my $dnase = $self->param('dnase');
|
|
46 my $is_male = $self->param('is_male');
|
|
47
|
|
48 my $motif_size;
|
|
49 my %motifs;
|
|
50 #Or remove // from matrix name
|
|
51 open(FILE,$work_dir."/matches_vf/".$matrix);
|
|
52 <FILE>; #title
|
|
53 while(<FILE>){
|
|
54 chomp;
|
|
55 #10 71532 71542 Gata1 8.55866 1 MA0035.2
|
|
56 my ($chr,$start,$end,$name,$score,$strand)=split("\t");
|
|
57 next if(!$is_male && ($chr =~ /Y/));
|
|
58 #Ignore Mitochondria...
|
|
59 next if($chr =~ /MT/);
|
|
60 if(!defined($motif_size)){ $motif_size = $end - $start; }
|
|
61 $motifs{$chr}{$start}{$strand}=$score;
|
|
62 }
|
|
63 close FILE;
|
|
64
|
|
65 #Maybe pass this as parameters
|
|
66 my $out_file = $work_dir."/output/".$matrix."_".$dnase.".counts";
|
|
67 open(FO,">".$out_file);
|
|
68
|
|
69 foreach my $chr (sort keys %motifs){
|
|
70 my %datap = ();
|
|
71 my %datan = ();
|
|
72 foreach my $start (sort keys %{$motifs{$chr}}){
|
|
73 for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
|
|
74 $datap{$i} = 0;
|
|
75 $datan{$i} = 0;
|
|
76 }
|
|
77 }
|
|
78
|
|
79 #Add a mappability score to extra filter...
|
|
80 #A lot more time and didn't see much evidence of improvement...
|
|
81 #my %mappability = ();
|
|
82 #foreach my $start (sort keys %{$motifs{$chr}}){
|
|
83 # for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
|
|
84 # $mappability{$i} = 0;
|
|
85 # }
|
|
86 #}
|
|
87 #open(FILE,$work_dir."/mapability.bedGraph");
|
|
88 #while(<FILE>){
|
|
89 # chomp;
|
|
90 # my ($cur_chr,$start,$end,$score)=split();
|
|
91 # next if($cur_chr ne $chr);
|
|
92 #for(my $i=$start; $i<$end; $i++){
|
|
93 # if(defined($mappability{$i})){ $mappability{$i}=$score; }
|
|
94 # }
|
|
95 #}
|
|
96 #close FILE;
|
|
97
|
|
98 my $folder;
|
|
99 if($is_male){
|
|
100 $folder = "male";
|
|
101 } else {
|
|
102 $folder = "female";
|
|
103 }
|
|
104 #open(FILE,$work_dir."/dnase/".$folder."/".$dnase);
|
|
105 open(FILE,"gzip -dc ${work_dir}/dnase/${folder}/${dnase}".'AlnRep*.bam.unique.tagAlign.gz |');
|
|
106 while(<FILE>){
|
|
107 chomp;
|
|
108 my ($cur_chr,$start,$end,undef,undef,$strand)=split("\t");
|
|
109 next if(($cur_chr ne $chr) || !defined($datap{$start}));
|
|
110 if($strand eq '+'){
|
|
111 $datap{$start}++;
|
|
112 } else {
|
|
113 $datan{$end-1}++;
|
|
114 }
|
|
115 }
|
|
116 close FILE;
|
|
117
|
|
118 foreach my $start (sort keys %{$motifs{$chr}}){
|
|
119 foreach my $strand (keys %{$motifs{$chr}{$start}}){
|
|
120 my $score = $motifs{$chr}{$start}{$strand};
|
|
121
|
|
122 #filter out those that have no mappability in more than 20% of their extension...
|
|
123 #my $count = 0;
|
|
124 #for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
|
|
125 # if($mappability{$i} <= 0.5){ $count++; }
|
|
126 #}
|
|
127 #Remove if more than 20% of region is "non_mappable" (approach from Centipede paper)
|
|
128 #if(($count / (2*$hlen + $motif_size)) > 0.2){
|
|
129 # warn "Motif at chr: $chr start: $start strand: $strand was ignored due to low mappability";
|
|
130 # next;
|
|
131 #}
|
|
132
|
|
133 print FO $chr."\t".$start."\t".($start+$motif_size)."\t".$matrix."\t".$score."\t".$strand;
|
|
134 for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
|
|
135 if($strand eq '+'){
|
|
136 print FO "\t".$datap{$i};
|
|
137 } else {
|
|
138 print FO "\t".$datan{$i};
|
|
139 }
|
|
140 }
|
|
141 for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){
|
|
142 if($strand eq '+'){
|
|
143 print FO "\t".$datan{$i};
|
|
144 } else {
|
|
145 print FO "\t".$datap{$i};
|
|
146 }
|
|
147 }
|
|
148 print FO "\n";
|
|
149 }
|
|
150 }
|
|
151 }
|
|
152
|
|
153 close FO;
|
|
154
|
|
155 return 1;
|
|
156 }
|
|
157
|
|
158
|
|
159 sub write_output { # Nothing to do here
|
|
160 my $self = shift @_;
|
|
161
|
|
162 $self->dataflow_output_id($self->input_id, 2);
|
|
163
|
|
164 return 1;
|
|
165
|
|
166
|
|
167 }
|
|
168
|
|
169 1;
|