Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/MakeDnaseProfile.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 NAME | |
4 | |
5 Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile | |
6 | |
7 =head1 DESCRIPTION | |
8 | |
9 =cut | |
10 | |
11 | |
12 package Bio::EnsEMBL::Funcgen::RunnableDB::MakeDnaseProfile; | |
13 | |
14 use warnings; | |
15 use strict; | |
16 use Bio::EnsEMBL::Hive::DBSQL::AnalysisDataAdaptor; | |
17 use base ('Bio::EnsEMBL::Hive::ProcessWithParams'); | |
18 | |
19 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); | |
20 use Data::Dumper; | |
21 | |
22 sub fetch_input { # nothing to fetch... just the parameters... | |
23 my $self = shift @_; | |
24 | |
25 $self->SUPER::fetch_input(); | |
26 | |
27 my $work_dir = $self->param('work_dir'); | |
28 my $matrix = $self->param('matrix'); | |
29 my $dnase = $self->param('dnase'); | |
30 #throw "Dnase file $dnase does not exist in $work_dir" if(! -e $work_dir."/".$dnase); | |
31 throw "Matrix file $matrix does not exist in $work_dir" if(! -e $work_dir."/matches/".$matrix); | |
32 my $is_male = $self->param('is_male'); | |
33 throw "Need to define is_male" if(!defined($is_male)); | |
34 #throw "mapability.bedGraph file expected in $work_dir" if(! -e $work_dir."/mapability.bedGraph"); | |
35 | |
36 return 1; | |
37 } | |
38 | |
39 sub run { # Create Groups and Analysis and stuff... | |
40 my $self = shift @_; | |
41 | |
42 my $hlen = 100; | |
43 my $work_dir = $self->param('work_dir'); | |
44 my $matrix = $self->param('matrix'); | |
45 my $dnase = $self->param('dnase'); | |
46 my $is_male = $self->param('is_male'); | |
47 | |
48 my $motif_size; | |
49 my %motifs; | |
50 #Or remove // from matrix name | |
51 open(FILE,$work_dir."/matches_vf/".$matrix); | |
52 <FILE>; #title | |
53 while(<FILE>){ | |
54 chomp; | |
55 #10 71532 71542 Gata1 8.55866 1 MA0035.2 | |
56 my ($chr,$start,$end,$name,$score,$strand)=split("\t"); | |
57 next if(!$is_male && ($chr =~ /Y/)); | |
58 #Ignore Mitochondria... | |
59 next if($chr =~ /MT/); | |
60 if(!defined($motif_size)){ $motif_size = $end - $start; } | |
61 $motifs{$chr}{$start}{$strand}=$score; | |
62 } | |
63 close FILE; | |
64 | |
65 #Maybe pass this as parameters | |
66 my $out_file = $work_dir."/output/".$matrix."_".$dnase.".counts"; | |
67 open(FO,">".$out_file); | |
68 | |
69 foreach my $chr (sort keys %motifs){ | |
70 my %datap = (); | |
71 my %datan = (); | |
72 foreach my $start (sort keys %{$motifs{$chr}}){ | |
73 for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ | |
74 $datap{$i} = 0; | |
75 $datan{$i} = 0; | |
76 } | |
77 } | |
78 | |
79 #Add a mappability score to extra filter... | |
80 #A lot more time and didn't see much evidence of improvement... | |
81 #my %mappability = (); | |
82 #foreach my $start (sort keys %{$motifs{$chr}}){ | |
83 # for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ | |
84 # $mappability{$i} = 0; | |
85 # } | |
86 #} | |
87 #open(FILE,$work_dir."/mapability.bedGraph"); | |
88 #while(<FILE>){ | |
89 # chomp; | |
90 # my ($cur_chr,$start,$end,$score)=split(); | |
91 # next if($cur_chr ne $chr); | |
92 #for(my $i=$start; $i<$end; $i++){ | |
93 # if(defined($mappability{$i})){ $mappability{$i}=$score; } | |
94 # } | |
95 #} | |
96 #close FILE; | |
97 | |
98 my $folder; | |
99 if($is_male){ | |
100 $folder = "male"; | |
101 } else { | |
102 $folder = "female"; | |
103 } | |
104 #open(FILE,$work_dir."/dnase/".$folder."/".$dnase); | |
105 open(FILE,"gzip -dc ${work_dir}/dnase/${folder}/${dnase}".'AlnRep*.bam.unique.tagAlign.gz |'); | |
106 while(<FILE>){ | |
107 chomp; | |
108 my ($cur_chr,$start,$end,undef,undef,$strand)=split("\t"); | |
109 next if(($cur_chr ne $chr) || !defined($datap{$start})); | |
110 if($strand eq '+'){ | |
111 $datap{$start}++; | |
112 } else { | |
113 $datan{$end-1}++; | |
114 } | |
115 } | |
116 close FILE; | |
117 | |
118 foreach my $start (sort keys %{$motifs{$chr}}){ | |
119 foreach my $strand (keys %{$motifs{$chr}{$start}}){ | |
120 my $score = $motifs{$chr}{$start}{$strand}; | |
121 | |
122 #filter out those that have no mappability in more than 20% of their extension... | |
123 #my $count = 0; | |
124 #for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ | |
125 # if($mappability{$i} <= 0.5){ $count++; } | |
126 #} | |
127 #Remove if more than 20% of region is "non_mappable" (approach from Centipede paper) | |
128 #if(($count / (2*$hlen + $motif_size)) > 0.2){ | |
129 # warn "Motif at chr: $chr start: $start strand: $strand was ignored due to low mappability"; | |
130 # next; | |
131 #} | |
132 | |
133 print FO $chr."\t".$start."\t".($start+$motif_size)."\t".$matrix."\t".$score."\t".$strand; | |
134 for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ | |
135 if($strand eq '+'){ | |
136 print FO "\t".$datap{$i}; | |
137 } else { | |
138 print FO "\t".$datan{$i}; | |
139 } | |
140 } | |
141 for(my $i=$start-$hlen;$i<$start+$hlen+$motif_size;$i++){ | |
142 if($strand eq '+'){ | |
143 print FO "\t".$datan{$i}; | |
144 } else { | |
145 print FO "\t".$datap{$i}; | |
146 } | |
147 } | |
148 print FO "\n"; | |
149 } | |
150 } | |
151 } | |
152 | |
153 close FO; | |
154 | |
155 return 1; | |
156 } | |
157 | |
158 | |
159 sub write_output { # Nothing to do here | |
160 my $self = shift @_; | |
161 | |
162 $self->dataflow_output_id($self->input_id, 2); | |
163 | |
164 return 1; | |
165 | |
166 | |
167 } | |
168 | |
169 1; |