|
3
|
1 #!/usr/bin/perl -w
|
|
|
2 #outputs statistics for all genes in the list
|
|
|
3
|
|
|
4 #different boundaries
|
|
|
5 #no motif p-value for binding sites
|
|
|
6 #read directly No-resp/Up/down category
|
|
|
7
|
|
|
8 #all isoforms from the file with genes
|
|
|
9
|
|
|
10 #RNApolII sites on junctions
|
|
|
11
|
|
|
12 #for HISTONES!!!!
|
|
|
13
|
|
|
14 use strict;
|
|
|
15 use POSIX;
|
|
|
16 use warnings;
|
|
|
17
|
|
|
18 my $usage = qq{
|
|
|
19 $0
|
|
|
20
|
|
|
21 -----------------------------
|
|
|
22 mandatory parameters:
|
|
|
23
|
|
|
24 -g filename file with all genes
|
|
|
25 -tf filename file with sites of TF1
|
|
|
26
|
|
|
27
|
|
|
28 -----------------------------
|
|
|
29 optional parameters:
|
|
|
30 -k36 filename file with sites of K36me3
|
|
|
31 -rp filename file with sites of RNApolII
|
|
|
32 -i filename file with a table where to add colomnes
|
|
|
33 -add values which colomns to add
|
|
|
34
|
|
|
35 -o filename output filename (defaut "genes.output.txt")
|
|
|
36 -v verbose mode
|
|
|
37 -mir filename file with positions of miRNA
|
|
|
38 -k9 filename ile with sites of K9me3
|
|
|
39
|
|
|
40 -c_rp value cutoff for -rp
|
|
|
41 -c_k9 value cutoff for -k9
|
|
|
42 -c_k36 value cutoff for -k36
|
|
|
43
|
|
|
44 -selG filename selected genes (up-down-regulated)
|
|
|
45 -fluo filename file with fluorescence
|
|
|
46 -gc filename file with gc-islands
|
|
|
47
|
|
|
48 -long for each gene take the longest isoform
|
|
|
49
|
|
|
50
|
|
|
51 };
|
|
|
52
|
|
|
53 if(scalar(@ARGV) == 0){
|
|
|
54 print $usage;
|
|
|
55 exit(0);
|
|
|
56 }
|
|
|
57
|
|
|
58 ## mandatory arguments
|
|
|
59
|
|
|
60 my $RNApolFilename = "";
|
|
|
61 my $H3K36Me3polFilename = "";
|
|
|
62 my $H3K9Me3polFilename = "";
|
|
|
63 my $TF1Filename = "";
|
|
|
64 my $TF2Filename = "";
|
|
|
65 my $GenesFilename = "";
|
|
|
66 my $MirFilename = "";
|
|
|
67 my $TF1FilenameALL = "";
|
|
|
68 my $TF2FilenameALL = "";
|
|
|
69 my $SelectedGenesFilename = "";
|
|
|
70 my $fluoFile = "";
|
|
|
71 my $initialTable = "";
|
|
|
72 my $colomnesToAdd = "";
|
|
|
73
|
|
|
74 my $enhLeft = -30000;
|
|
|
75 my $enhRight = -1500;
|
|
|
76 my $immediateDownstream = 2000;
|
|
|
77 my $K9dist = 5000;
|
|
|
78 my $kb5 = 5000;
|
|
|
79 my $INFINITY = 10000000000;
|
|
|
80 my $jonctionSize = 50;
|
|
|
81 ## optional arguments
|
|
|
82 my $outname = "genes.output.txt";
|
|
|
83 my $verbose = 0;
|
|
|
84 my $GCislands = "";
|
|
|
85
|
|
|
86
|
|
|
87 my $longest = 0;
|
|
|
88
|
|
|
89 #my $cutoff_tf1 = 0;
|
|
|
90 #my $cutoff_tf2 = 0;
|
|
|
91 my $cutoff_tf1All = 0;
|
|
|
92 my $cutoff_tf2All = 0;
|
|
|
93 my $cutoff_rp = 0;
|
|
|
94 my $cutoff_k9 = 0;
|
|
|
95 my $cutoff_k36 = 0;
|
|
|
96 my $ifTFcoord = 0;
|
|
|
97
|
|
|
98 my $ifHMmode = 0;
|
|
|
99
|
|
|
100 ## parse command line arguments
|
|
|
101
|
|
|
102 while(scalar(@ARGV) > 0){
|
|
|
103 my $this_arg = shift @ARGV;
|
|
|
104 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
|
|
|
105
|
|
|
106 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;}
|
|
|
107
|
|
|
108 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;}
|
|
|
109 elsif ( $this_arg eq '-rp') {$RNApolFilename = shift @ARGV;}
|
|
|
110 elsif ( $this_arg eq '-k36') {$H3K36Me3polFilename = shift @ARGV;}
|
|
|
111 elsif ( $this_arg eq '-k9') {$H3K9Me3polFilename = shift @ARGV;}
|
|
|
112
|
|
|
113 elsif ( $this_arg eq '-tf') {$TF1Filename = shift @ARGV;}
|
|
|
114
|
|
|
115 elsif ( $this_arg eq '-v') {$verbose = 1;}
|
|
|
116
|
|
|
117 elsif ( $this_arg eq '-long') {$longest = 1;}
|
|
|
118
|
|
|
119
|
|
|
120 elsif ( $this_arg eq '-o') {$outname = shift @ARGV;}
|
|
|
121 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;}
|
|
|
122
|
|
|
123 elsif ( $this_arg eq '-c_rp') {$cutoff_rp = shift @ARGV;}
|
|
|
124 elsif ( $this_arg eq '-c_k9') {$cutoff_k9 = shift @ARGV;}
|
|
|
125 elsif ( $this_arg eq '-c_k36') {$cutoff_k36 = shift @ARGV;}
|
|
|
126
|
|
|
127 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;}
|
|
|
128 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;}
|
|
|
129
|
|
|
130 elsif ( $this_arg eq '-i') {$initialTable = shift @ARGV;}
|
|
|
131 elsif ( $this_arg eq '-add') {$colomnesToAdd = shift @ARGV;}
|
|
|
132
|
|
|
133 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;}
|
|
|
134 elsif ( $this_arg eq '-rightp') {$immediateDownstream = shift @ARGV;}
|
|
|
135 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;}
|
|
|
136 elsif ( $this_arg eq '-dg') {$kb5 = shift @ARGV;}
|
|
|
137
|
|
|
138
|
|
|
139
|
|
|
140 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
|
|
|
141 }
|
|
|
142
|
|
|
143 my $maxDistFromGene = max(abs($enhLeft),abs($kb5));
|
|
|
144
|
|
|
145 if ( $GenesFilename eq "" ){
|
|
|
146 die "you should specify a file with genes \n";
|
|
|
147 }
|
|
|
148 if(( $RNApolFilename eq "")&&($H3K36Me3polFilename eq "")&&($TF1Filename eq "")&&($H3K9Me3polFilename eq "")){
|
|
|
149 die "you should specify at least one file with peaks\n";
|
|
|
150 }
|
|
|
151
|
|
|
152
|
|
|
153 #-----------read selected genes----------------
|
|
|
154 my %selectedGenes;
|
|
|
155 my %selectedGenesFoldChange;
|
|
|
156 if ( $SelectedGenesFilename ne "") {
|
|
|
157 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!";
|
|
|
158 while (<FILE>) {
|
|
|
159 s/\R//g;
|
|
|
160 my @a = split/\t/;
|
|
|
161 $selectedGenes{$a[1]} = $a[3];
|
|
|
162 $selectedGenesFoldChange{$a[1]} = $a[2];
|
|
|
163 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n";
|
|
|
164 }
|
|
|
165
|
|
|
166 close FILE;
|
|
|
167 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose);
|
|
|
168 }
|
|
|
169
|
|
|
170 #-----------read genes with fluorescence---------
|
|
|
171 my %fluoGenes;
|
|
|
172 if ( $fluoFile ne "") {
|
|
|
173 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!";
|
|
|
174 my $gene = "";
|
|
|
175 my $med = 0;
|
|
|
176 my %h;
|
|
|
177 while (<FILE>) {
|
|
|
178 s/\R//g;
|
|
|
179 my @a = split/\t/;
|
|
|
180
|
|
|
181 next if (scalar(@a)<5);
|
|
|
182 next if ($a[0] eq "probesets");
|
|
|
183 next unless ($a[0] =~m/\S/);
|
|
|
184 next unless ($a[4] =~m/\S/);
|
|
|
185 if ($gene ne "" && $gene ne $a[2]) {
|
|
|
186 #calcMed
|
|
|
187 $med = med(keys %h);
|
|
|
188 $fluoGenes{$gene} = $med;
|
|
|
189 $med=0;
|
|
|
190 delete @h{keys %h};
|
|
|
191 } else {
|
|
|
192 #$h{$a[4]} = 1;
|
|
|
193 }
|
|
|
194 $gene = $a[2];
|
|
|
195 $h{$a[4]} = 1;
|
|
|
196 #print "keys ", scalar(keys %h),"\t",keys %h,"\n";
|
|
|
197 }
|
|
|
198 if ($gene ne "") {
|
|
|
199 $med = med(keys %h);
|
|
|
200 $fluoGenes{$gene} = $med;
|
|
|
201 }
|
|
|
202 close FILE;
|
|
|
203 print "\t\t$fluoFile is read!\n" if ($verbose);;
|
|
|
204 }
|
|
|
205 #-----------read GC-islands----------------
|
|
|
206 my %GCislands;
|
|
|
207 if ($GCislands ne "") {
|
|
|
208 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!";
|
|
|
209
|
|
|
210 while (<FILE>) {
|
|
|
211 s/\R//g;
|
|
|
212 my @a = split/\t/;
|
|
|
213 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp
|
|
|
214 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09
|
|
|
215 my $chr = $a[1];
|
|
|
216 my $start = $a[2];
|
|
|
217 my $end = $a[3];
|
|
|
218 $GCislands{$chr}->{$start}=$end;
|
|
|
219 }
|
|
|
220 close FILE;
|
|
|
221 if ($verbose) {
|
|
|
222 print "$GCislands is read\n";
|
|
|
223 }
|
|
|
224 } elsif ($verbose) {
|
|
|
225 print "you did not specify a file with GC-islands\n";
|
|
|
226 }
|
|
|
227
|
|
|
228 #-----------read genes----------------
|
|
|
229
|
|
|
230 my %genes;
|
|
|
231
|
|
|
232 my $count = 0;
|
|
|
233
|
|
|
234 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!";
|
|
|
235 <GENES>;
|
|
|
236 while (<GENES>) {
|
|
|
237 s/\R//g;
|
|
|
238 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){
|
|
|
239 my $name = $10;
|
|
|
240 my $chr = $1;
|
|
|
241 my $strand = $2;
|
|
|
242 if ($strand eq '+') {
|
|
|
243 $strand = 1;
|
|
|
244 }
|
|
|
245 else {
|
|
|
246 $strand = -1;
|
|
|
247 }
|
|
|
248 my $leftPos = $3;
|
|
|
249 my $rightPos = $4;
|
|
|
250 my $cdsStart= $5;
|
|
|
251 my $cdsEnd= $6;
|
|
|
252 my $exonCount= $7;
|
|
|
253 my $exonStarts= $8;
|
|
|
254 my $exonEnds= $9;
|
|
|
255 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count";
|
|
|
256 my $foldChange = 1;
|
|
|
257 my $reg = "NA";
|
|
|
258 my $fluo = "NA";
|
|
|
259 if ( $SelectedGenesFilename ne "") {
|
|
|
260 #print "$name\t";
|
|
|
261 if (exists($selectedGenes{$name})) {
|
|
|
262 $reg = $selectedGenes{$name};
|
|
|
263 $foldChange = $selectedGenesFoldChange{$name};
|
|
|
264 }
|
|
|
265 }
|
|
|
266 if ( $fluoFile ne "") {
|
|
|
267 if (exists($fluoGenes{$name})) {
|
|
|
268 $fluo = $fluoGenes{$name};
|
|
|
269 }
|
|
|
270 }
|
|
|
271 unless (exists($genes{$chr})) {
|
|
|
272 my %h;
|
|
|
273 $genes{$chr} = \%h;
|
|
|
274 }
|
|
|
275
|
|
|
276 my $RNAlength = 0;
|
|
|
277 my $skip = 0;
|
|
|
278
|
|
|
279 #print "$ID\n";
|
|
|
280 if($longest) {
|
|
|
281 $RNAlength = getRNAlength($exonStarts,$exonEnds);
|
|
|
282 for my $IDgene (keys %{$genes{$chr}}) {
|
|
|
283 my $nameGene= (split('\t', $IDgene))[0];
|
|
|
284 if ($nameGene eq $name && $RNAlength > $genes{$chr}->{$IDgene}{'RNAlength'}) {
|
|
|
285 #print "found longer isofome: $ID longer than $IDgene\n";
|
|
|
286 # print "$RNAlength > ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n";
|
|
|
287
|
|
|
288 $ID=$IDgene;
|
|
|
289 } elsif ($nameGene eq $name && $RNAlength <= $genes{$chr}->{$IDgene}{'RNAlength'}) {
|
|
|
290 #print "found shorter isofome: $ID shorted than $IDgene\nwill skip it\n";
|
|
|
291 #print "$RNAlength <= ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n";
|
|
|
292 $skip = 1;
|
|
|
293 }
|
|
|
294 }
|
|
|
295 }
|
|
|
296
|
|
|
297
|
|
|
298 unless ($skip) {
|
|
|
299
|
|
|
300 unless (exists($genes{$chr}->{$ID})) {
|
|
|
301 my %h1;
|
|
|
302 $genes{$chr}->{$ID} = \%h1;
|
|
|
303 $count++;
|
|
|
304 }
|
|
|
305
|
|
|
306 $genes{$chr}->{$ID}{'name'} = $name ;
|
|
|
307 $genes{$chr}->{$ID}{'left'} = $leftPos ;
|
|
|
308 $genes{$chr}->{$ID}{'right'} = $rightPos ;
|
|
|
309 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart;
|
|
|
310 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd;
|
|
|
311 $genes{$chr}->{$ID}{'strand'} = $strand;
|
|
|
312 $genes{$chr}->{$ID}{'exonCount'} = $exonCount;
|
|
|
313 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts;
|
|
|
314 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds;
|
|
|
315 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
|
|
|
316 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
|
|
|
317 $genes{$chr}->{$ID}{'reg'} = $reg;
|
|
|
318 $genes{$chr}->{$ID}{'foldChange'} = $foldChange;
|
|
|
319 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
|
|
|
320 $genes{$chr}->{$ID}{'RNAlength'} = $RNAlength ;
|
|
|
321
|
|
|
322 my $enhStart = min($genes{$chr}->{$ID}{'TSS'}-$enhLeft*$genes{$chr}->{$ID}{'strand'},$genes{$chr}->{$ID}{'TSS'}-$enhRight*$genes{$chr}->{$ID}{'strand'});
|
|
|
323 my $enhEnd = max($genes{$chr}->{$ID}{'TSS'}-$enhLeft*$genes{$chr}->{$ID}{'strand'},$genes{$chr}->{$ID}{'TSS'}-$enhRight*$genes{$chr}->{$ID}{'strand'});
|
|
|
324 $genes{$chr}->{$ID}{'enhStart'} =$enhStart;
|
|
|
325 $genes{$chr}->{$ID}{'enhEnd'} =$enhEnd;
|
|
|
326
|
|
|
327 my $IntraStart = min($genes{$chr}->{$ID}{'TSS'}+$immediateDownstream*$genes{$chr}->{$ID}{'strand'},$genes{$chr}->{$ID}{'TE'});
|
|
|
328 my $IntraEnd = max($genes{$chr}->{$ID}{'TSS'}+$immediateDownstream*$genes{$chr}->{$ID}{'strand'},$genes{$chr}->{$ID}{'TE'});
|
|
|
329 $genes{$chr}->{$ID}{'IntraStart'} =$IntraStart;
|
|
|
330 $genes{$chr}->{$ID}{'IntraEnd'} =$IntraEnd;
|
|
|
331
|
|
|
332 $genes{$chr}->{$ID}{'fluo'} = $fluo;
|
|
|
333
|
|
|
334 $genes{$chr}->{$ID}{'RNApolScore'} = 0;
|
|
|
335 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY;
|
|
|
336 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0;
|
|
|
337 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY;
|
|
|
338
|
|
|
339 $genes{$chr}->{$ID}{'normalizedGBodyScore'} = 0;
|
|
|
340 $genes{$chr}->{$ID}{'K9promScore'} = 0;
|
|
|
341 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY;
|
|
|
342 $genes{$chr}->{$ID}{'K9largeScore'} = 0;
|
|
|
343 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY;
|
|
|
344 $genes{$chr}->{$ID}{'largePromScore'} = 0;
|
|
|
345 $genes{$chr}->{$ID}{'promDist'} = $INFINITY;
|
|
|
346 $genes{$chr}->{$ID}{'EnhScore'} = 0;
|
|
|
347 $genes{$chr}->{$ID}{'EnhDistTSS'} = $INFINITY;
|
|
|
348 $genes{$chr}->{$ID}{'intraScore'} = 0;
|
|
|
349 $genes{$chr}->{$ID}{'intraDistTSS'} = $INFINITY;
|
|
|
350 $genes{$chr}->{$ID}{'allScore'} = 0;
|
|
|
351 $genes{$chr}->{$ID}{'allDistTSS'} = $INFINITY;
|
|
|
352
|
|
|
353
|
|
|
354 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0;
|
|
|
355 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY;
|
|
|
356 $genes{$chr}->{$ID}{'firstIntronScore'} = 0;
|
|
|
357 $genes{$chr}->{$ID}{'firstIntronDistTSS'} = $INFINITY;
|
|
|
358 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0;
|
|
|
359 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY;
|
|
|
360 $genes{$chr}->{$ID}{'immDownScore'} = 0;
|
|
|
361 $genes{$chr}->{$ID}{'immDownDist'} = $INFINITY;
|
|
|
362 $genes{$chr}->{$ID}{'promSimpleScore'} = 0;
|
|
|
363 $genes{$chr}->{$ID}{'promSimpleDist'} = $INFINITY;
|
|
|
364 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0;
|
|
|
365 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY;
|
|
|
366
|
|
|
367 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0;
|
|
|
368 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY;
|
|
|
369 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0;
|
|
|
370 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY;
|
|
|
371
|
|
|
372 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0;
|
|
|
373 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY;
|
|
|
374 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0;
|
|
|
375 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY;
|
|
|
376
|
|
|
377 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0;
|
|
|
378 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY;
|
|
|
379 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0;
|
|
|
380 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY;
|
|
|
381
|
|
|
382
|
|
|
383 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0;
|
|
|
384 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY;
|
|
|
385 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0;
|
|
|
386 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY;
|
|
|
387
|
|
|
388 $genes{$chr}->{$ID}{'geneDownstreamScore'} = 0;
|
|
|
389 $genes{$chr}->{$ID}{'geneDownstreamDist'} = $INFINITY;
|
|
|
390 $genes{$chr}->{$ID}{'K9enhScore'} = 0;
|
|
|
391 $genes{$chr}->{$ID}{'K9EnhDistTSS'} = $INFINITY;
|
|
|
392
|
|
|
393
|
|
|
394 $genes{$chr}->{$ID}{'score_GeneBody'} = 0;
|
|
|
395 $genes{$chr}->{$ID}{'GeneBodyPeak_DistToTSS'} = $INFINITY;
|
|
|
396
|
|
|
397 $genes{$chr}->{$ID}{'GCisland'} = 0;
|
|
|
398 if ($GCislands ne "") {
|
|
|
399 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr});
|
|
|
400 }
|
|
|
401 }
|
|
|
402 }
|
|
|
403 }
|
|
|
404
|
|
|
405 print "Total genes (including isoforms) : $count\n" ;
|
|
|
406 close GENES;
|
|
|
407 print "\t\t$GenesFilename is read!\n" if ($verbose);;
|
|
|
408 #for my $gName (sort keys %{$genes{'chr18'}}) {
|
|
|
409
|
|
|
410 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n";
|
|
|
411 #}
|
|
|
412
|
|
|
413
|
|
|
414 #-----------read file with sites of TF1-----
|
|
|
415 my $numberOfAllSites = 0;
|
|
|
416
|
|
|
417 if ($TF1Filename eq "") {
|
|
|
418 print "No file with peaks of TF1!\n" if ($verbose) ;
|
|
|
419 } else {
|
|
|
420 open (FILE, "<$TF1Filename ") or die "Cannot open file $TF1Filename !!!!: $!";
|
|
|
421 $_ = <FILE>;
|
|
|
422 my $correction = 0;
|
|
|
423 my @a = split /\t/;
|
|
|
424 if (scalar(@a)>1 && $a[1] =~ m/chr/ ) {
|
|
|
425 $correction = 1;
|
|
|
426 }
|
|
|
427
|
|
|
428 while (<FILE>) {
|
|
|
429 s/\R//g;
|
|
|
430
|
|
|
431 my @a = split /\t/;
|
|
|
432
|
|
|
433 my $chr = $a[0+$correction];
|
|
|
434 my $firstPos = $a[1+$correction];
|
|
|
435 my $LastPos = $a[2+$correction];
|
|
|
436
|
|
|
437 my $maxPos = int(($firstPos+$LastPos)/2);
|
|
|
438 if (scalar(@a)>(3+$correction)) {
|
|
|
439 $maxPos = $a[3+$correction];
|
|
|
440 }
|
|
|
441 if ($maxPos =~ m/\D/) {
|
|
|
442 $maxPos = int(($firstPos+$LastPos)/2);
|
|
|
443 }
|
|
|
444 my $score = 0;
|
|
|
445 if (scalar(@a)>(4+$correction)) {
|
|
|
446 $score = $a[4+$correction];
|
|
|
447 }
|
|
|
448 for my $ID (keys %{$genes{$chr}}) {
|
|
|
449 my $distTSS;
|
|
|
450 if ($firstPos<=$genes{$chr}->{$ID}{'TSS'} && $LastPos>=$genes{$chr}->{$ID}{'TSS'}) {
|
|
|
451 $distTSS = 0 ;
|
|
|
452 } else {
|
|
|
453 $distTSS = ($firstPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
|
|
|
454 $distTSS = ($LastPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'} if (abs($LastPos - $genes{$chr}->{$ID}{'TSS'})<abs($distTSS));
|
|
|
455 }
|
|
|
456
|
|
|
457 my $distTE;
|
|
|
458 if ($firstPos<=$genes{$chr}->{$ID}{'TE'} && $LastPos>=$genes{$chr}->{$ID}{'TE'}) {
|
|
|
459 $distTE = 0;
|
|
|
460 } else {
|
|
|
461 $distTE = ($firstPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
|
|
|
462 $distTE = ($LastPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'} if (abs($LastPos - $genes{$chr}->{$ID}{'TE'})<abs($distTE));
|
|
|
463 }
|
|
|
464
|
|
|
465 if (($distTSS>= $enhLeft)&&($distTE<=$kb5)) {
|
|
|
466 if ($genes{$chr}->{$ID}{'allScore'}<$score) {
|
|
|
467 $genes{$chr}->{$ID}{'allScore'}=$score;
|
|
|
468 $genes{$chr}->{$ID}{'allDistTSS'} = $distTSS;
|
|
|
469 }
|
|
|
470 } else {next;}
|
|
|
471
|
|
|
472 my $hmCorDist = -1;
|
|
|
473 if ($LastPos>=$genes{$chr}->{$ID}{'left'} && $firstPos <= $genes{$chr}->{$ID}{'right'} ) {
|
|
|
474 $hmCorDist = 0;
|
|
|
475 }
|
|
|
476
|
|
|
477 if ($hmCorDist == 0) {
|
|
|
478 if ($genes{$chr}->{$ID}{'score_GeneBody'}<$score) {
|
|
|
479 $genes{$chr}->{$ID}{'score_GeneBody'}=$score;
|
|
|
480 $genes{$chr}->{$ID}{'GeneBodyPeak_DistToTSS'} = $distTSS;
|
|
|
481 }
|
|
|
482
|
|
|
483
|
|
|
484 my $scoreToadd = 0;
|
|
|
485
|
|
|
486 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($LastPos<=$genes{$chr}->{$ID}{'right'})) {
|
|
|
487 $scoreToadd = $score/2.*($LastPos-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
|
|
|
488 } elsif (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($firstPos<$genes{$chr}->{$ID}{'right'})&&($LastPos>$genes{$chr}->{$ID}{'right'})) {
|
|
|
489 $scoreToadd = $score/2.*($genes{$chr}->{$ID}{'right'}-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
|
|
|
490 }
|
|
|
491 elsif (($firstPos<$genes{$chr}->{$ID}{'left'})&&($LastPos>$genes{$chr}->{$ID}{'left'})&&($LastPos<=$genes{$chr}->{$ID}{'right'})) {
|
|
|
492 $scoreToadd = $score/2.*($LastPos-$genes{$chr}->{$ID}{'left'}+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
|
|
|
493 }
|
|
|
494 elsif (($firstPos<$genes{$chr}->{$ID}{'left'})&&($LastPos>$genes{$chr}->{$ID}{'right'})) {
|
|
|
495 $scoreToadd = $score/2.;
|
|
|
496 }
|
|
|
497 $genes{$chr}->{$ID}{'normalizedGBodyScore'} += $scoreToadd;
|
|
|
498 }
|
|
|
499
|
|
|
500 if ( $LastPos>=$genes{$chr}->{$ID}{'enhStart'} && $firstPos <=$genes{$chr}->{$ID}{'enhEnd'} ) {
|
|
|
501 if ($genes{$chr}->{$ID}{'EnhScore'}<$score) {
|
|
|
502 $genes{$chr}->{$ID}{'EnhScore'}=$score;
|
|
|
503 $genes{$chr}->{$ID}{'EnhDistTSS'} = $distTSS;
|
|
|
504 }
|
|
|
505 }
|
|
|
506 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
|
|
|
507 if ($genes{$chr}->{$ID}{'largePromScore'}<$score) {
|
|
|
508 $genes{$chr}->{$ID}{'largePromScore'}=$score;
|
|
|
509 $genes{$chr}->{$ID}{'promDist'} = $distTSS;
|
|
|
510 }
|
|
|
511 }
|
|
|
512 if ( $LastPos>=$genes{$chr}->{$ID}{'IntraStart'} && $firstPos <= $genes{$chr}->{$ID}{'IntraEnd'} ) {
|
|
|
513 if ($genes{$chr}->{$ID}{'intraScore'}<$score) {
|
|
|
514 $genes{$chr}->{$ID}{'intraScore'}=$score;
|
|
|
515 $genes{$chr}->{$ID}{'intraDistTSS'} = $distTSS;
|
|
|
516 }
|
|
|
517 }
|
|
|
518
|
|
|
519
|
|
|
520 if (($distTSS>= 0)&&($distTSS<=$immediateDownstream)) {
|
|
|
521 if ($genes{$chr}->{$ID}{'immDownScore'}<$score) {
|
|
|
522 $genes{$chr}->{$ID}{'immDownScore'}=$score;
|
|
|
523 $genes{$chr}->{$ID}{'immDownDist'} = $distTSS;
|
|
|
524 }
|
|
|
525 }
|
|
|
526 if (($distTSS<= 0)&&($distTSS>=$enhRight)) {
|
|
|
527 if ($genes{$chr}->{$ID}{'promSimpleScore'}<$score) {
|
|
|
528 $genes{$chr}->{$ID}{'promSimpleScore'}=$score;
|
|
|
529 $genes{$chr}->{$ID}{'promSimpleDist'} = $distTSS;
|
|
|
530 }
|
|
|
531 }
|
|
|
532
|
|
|
533 if (($distTE>=0)&&($distTE<=$kb5)) {
|
|
|
534 if ($genes{$chr}->{$ID}{'geneDownstreamScore'}<$score) {
|
|
|
535 $genes{$chr}->{$ID}{'geneDownstreamScore'}=$score;
|
|
|
536 $genes{$chr}->{$ID}{'geneDownstreamDist'} = $distTSS;
|
|
|
537 }
|
|
|
538 }
|
|
|
539
|
|
|
540 }
|
|
|
541 $numberOfAllSites++;
|
|
|
542 }
|
|
|
543
|
|
|
544 close FILE;
|
|
|
545 print "\t$TF1Filename is read!\n" if ($verbose) ;
|
|
|
546 print "$numberOfAllSites sites\n" ;
|
|
|
547 }
|
|
|
548
|
|
|
549
|
|
|
550
|
|
|
551
|
|
|
552 open (OUT , ">$outname") or die "Cannot open file $outname!!!!: $!";
|
|
|
553
|
|
|
554 print OUT "name\tchr\tstart\tend\tstrand\tReg\tfoldChange\t";
|
|
|
555
|
|
|
556 if ($GCislands ne "") {
|
|
|
557 print OUT "GC-island\t";
|
|
|
558 }
|
|
|
559
|
|
|
560 if ( $fluoFile ne "") {
|
|
|
561 print OUT "fluorescence\t";
|
|
|
562 }
|
|
|
563
|
|
|
564 print OUT "GeneBodyNormalizedScore\t";
|
|
|
565
|
|
|
566 if ($TF1Filename ne "") {
|
|
|
567 print OUT "score_Gene\tdistTSS_Gene\t";
|
|
|
568 print OUT "score_Promoter\tdistTSS_Promoter\t";
|
|
|
569 print OUT "score_ImmDown\tdistTSS_ImmDown\t";
|
|
|
570 print OUT "score_PromoterORImmDown\tdistTSS_PromoterORImmDown\t";
|
|
|
571 print OUT "score_Enhancer\tdistTSS_Enhancer\t";
|
|
|
572 print OUT "score_Intragenic\tdistTSS_Intragenic\t";
|
|
|
573 print OUT "score_GeneDownstream\tdistTSS_GeneDownstream\t";
|
|
|
574
|
|
|
575
|
|
|
576 print OUT "score_GeneBody\tGeneBodyPeak_DistToTSS"
|
|
|
577 }
|
|
|
578
|
|
|
579 print OUT "\n";
|
|
|
580
|
|
|
581 for my $chr (keys %genes) {
|
|
|
582 for my $ID (keys %{$genes{$chr}}) {
|
|
|
583 print OUT "$genes{$chr}->{$ID}{'name'}\t$chr\t$genes{$chr}->{$ID}{'left'}\t$genes{$chr}->{$ID}{'right'}\t$genes{$chr}->{$ID}{'strand'}\t$genes{$chr}->{$ID}{'reg'}\t$genes{$chr}->{$ID}{'foldChange'}\t";
|
|
|
584
|
|
|
585 if ($GCislands ne "") {
|
|
|
586 print OUT "$genes{$chr}->{$ID}{'GCisland'}\t";
|
|
|
587 }
|
|
|
588 if ( $fluoFile ne "") {
|
|
|
589 print OUT "$genes{$chr}->{$ID}{'fluo'}\t";
|
|
|
590 }
|
|
|
591
|
|
|
592 print OUT "$genes{$chr}->{$ID}{'normalizedGBodyScore'}\t";
|
|
|
593
|
|
|
594 if ($TF1Filename ne "") {
|
|
|
595 print OUT "$genes{$chr}->{$ID}{'allScore'}\t$genes{$chr}->{$ID}{'allDistTSS'}\t";
|
|
|
596 print OUT "$genes{$chr}->{$ID}{'promSimpleScore'}\t$genes{$chr}->{$ID}{'promSimpleDist'}\t";
|
|
|
597 print OUT "$genes{$chr}->{$ID}{'immDownScore'}\t$genes{$chr}->{$ID}{'immDownDist'}\t";
|
|
|
598 print OUT "$genes{$chr}->{$ID}{'largePromScore'}\t$genes{$chr}->{$ID}{'promDist'}\t";
|
|
|
599 print OUT "$genes{$chr}->{$ID}{'EnhScore'}\t$genes{$chr}->{$ID}{'EnhDistTSS'}\t";
|
|
|
600 print OUT "$genes{$chr}->{$ID}{'intraScore'}\t$genes{$chr}->{$ID}{'intraDistTSS'}\t";
|
|
|
601 print OUT "$genes{$chr}->{$ID}{'geneDownstreamScore'}\t$genes{$chr}->{$ID}{'geneDownstreamDist'}\t";
|
|
|
602
|
|
|
603 print OUT "$genes{$chr}->{$ID}{'score_GeneBody'}\t$genes{$chr}->{$ID}{'GeneBodyPeak_DistToTSS'}"
|
|
|
604
|
|
|
605 }
|
|
|
606 print OUT "\n";
|
|
|
607 }
|
|
|
608 }
|
|
|
609
|
|
|
610 close OUT;
|
|
|
611
|
|
|
612 ###################################
|
|
|
613 sub med {
|
|
|
614 my @arr = @_;
|
|
|
615 my $med = 0;
|
|
|
616 @arr = sort {$a <=> $b} @arr;
|
|
|
617 if ((scalar(@arr)/2) =~ m/[\.\,]5/) {
|
|
|
618 return $arr[floor(scalar(@arr)/2)];
|
|
|
619 } else {
|
|
|
620 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2;
|
|
|
621 }
|
|
|
622 $med;
|
|
|
623 }
|
|
|
624
|
|
|
625 sub checkIfGC {
|
|
|
626 my ($TSS,$strand,$dist,$GCislandsChr)=@_;
|
|
|
627 my $ifGC = 0;
|
|
|
628 my $leftProm=$TSS-$dist;
|
|
|
629 my $rightProm = $TSS;
|
|
|
630 if ($strand== -1) {
|
|
|
631 my $leftProm=$TSS;
|
|
|
632 my $rightProm = $TSS+$dist;
|
|
|
633 } #print "$leftProm\t"; print "$rightProm\n";
|
|
|
634 for my $leftGC (keys %{$GCislandsChr}) {
|
|
|
635 my $rightGC = $GCislandsChr->{$leftGC};
|
|
|
636 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) {
|
|
|
637 return "GC-island";
|
|
|
638 }
|
|
|
639 }
|
|
|
640 return $ifGC ;
|
|
|
641 }
|
|
|
642
|
|
|
643 sub getFirstIntron {
|
|
|
644 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
|
645 my ($left,$right);
|
|
|
646 if ($exonCount == 1) {
|
|
|
647 return (0,0);
|
|
|
648 }
|
|
|
649 if ($strand == 1) {
|
|
|
650 $left = (split ",", $exonEnds)[0];
|
|
|
651 $right = (split (",", $exonStarts))[1];
|
|
|
652 } else {
|
|
|
653 $left = (split (",", $exonEnds))[$exonCount-2];
|
|
|
654 $right = (split (",", $exonStarts))[$exonCount-1];
|
|
|
655 }
|
|
|
656 ($left,$right);
|
|
|
657 }
|
|
|
658
|
|
|
659 sub getFirstExon {
|
|
|
660 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
|
661 my ($left,$right);
|
|
|
662 if ($exonCount == 1) {
|
|
|
663 return (0,0);
|
|
|
664 }
|
|
|
665 if ($strand == 1) {
|
|
|
666 $left = (split ",", $exonStarts)[0];
|
|
|
667 $right = (split (",", $exonEnds))[0]-$jonctionSize;
|
|
|
668 } else {
|
|
|
669 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize;
|
|
|
670 $right = (split (",", $exonEnds))[$exonCount-1];
|
|
|
671 }
|
|
|
672 ($left,$right);
|
|
|
673 }
|
|
|
674
|
|
|
675
|
|
|
676 sub getIntronExon {
|
|
|
677 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
|
678 my (@left,@right);
|
|
|
679 @left = (split ",", $exonStarts);
|
|
|
680 @right = (split (",", $exonEnds));
|
|
|
681
|
|
|
682 for (my $i = 0; $i<$exonCount;$i++) {
|
|
|
683 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n";
|
|
|
684 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) {
|
|
|
685 #print "URA!\n";
|
|
|
686 return "exon";
|
|
|
687 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) {
|
|
|
688 return "intron";
|
|
|
689 }
|
|
|
690 }
|
|
|
691 return "jonction";
|
|
|
692 }
|
|
|
693
|
|
|
694
|
|
|
695 sub getTypeIntra {
|
|
|
696
|
|
|
697 my ($geneEntry, $pos) = @_;
|
|
|
698 my $type;
|
|
|
699
|
|
|
700 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) {
|
|
|
701 return "f_intron";
|
|
|
702 }
|
|
|
703 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) {
|
|
|
704 return "f_exon";
|
|
|
705 }
|
|
|
706 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'});
|
|
|
707 return $type;
|
|
|
708 }
|
|
|
709
|
|
|
710 sub getRNAlength {
|
|
|
711 my ($exonStarts,$exonEnds) = @_;
|
|
|
712 my (@left,@right);
|
|
|
713 @left = (split ",", $exonStarts);
|
|
|
714 @right = (split (",", $exonEnds));
|
|
|
715 my $length = 0;
|
|
|
716 for (my $i = 0; $i<scalar(@right);$i++) {
|
|
|
717 $length+=$right[$i]-$left[$i];
|
|
|
718 }
|
|
|
719 #print STDERR "length = $length\n";
|
|
|
720 return $length;
|
|
|
721 }
|
|
|
722
|
|
|
723 sub min {
|
|
|
724 my @a = sort {$a <=> $b} @_;
|
|
|
725 $a[0];
|
|
|
726 }
|
|
|
727
|
|
|
728 sub max {
|
|
|
729 my @a = sort {$b <=> $a} @_;
|
|
|
730 $a[0];
|
|
|
731 }
|