2
|
1 #!/usr/bin/perl -w
|
|
2 #outputs statistics for all genes in the list
|
|
3
|
|
4 #different boundaries
|
|
5 #no motif p-value for binding sites
|
|
6 #read directly No-resp/Up/down category
|
|
7
|
|
8 #all isoforms from the file with genes
|
|
9
|
|
10 #RNApolII sites on junctions
|
|
11
|
|
12 #has Histone Modification Mode: covering TSS and overlaping genes
|
|
13
|
|
14 use strict;
|
|
15 use POSIX;
|
|
16
|
|
17 my $usage = qq{
|
|
18 $0
|
|
19
|
|
20 -----------------------------
|
|
21 mandatory parameters:
|
|
22
|
|
23 -g filename file with all genes
|
|
24 -tf filename file with sites of TF1
|
|
25
|
|
26
|
|
27 -----------------------------
|
|
28 optional parameters:
|
|
29 -k36 filename file with sites of K36me3
|
|
30 -rp filename file with sites of RNApolII
|
|
31 -i filename file with a table where to add colomnes
|
|
32 -add values which colomns to add
|
|
33
|
|
34 -o filename output filename (defaut "genes.output.txt")
|
|
35 -v verbose mode
|
|
36 -mir filename file with positions of miRNA
|
|
37 -k9 filename ile with sites of K9me3
|
|
38
|
|
39 -c_rp value cutoff for -rp
|
|
40 -c_k9 value cutoff for -k9
|
|
41 -c_k36 value cutoff for -k36
|
|
42
|
|
43 -selG filename selected genes (up-down-regulated)
|
|
44 -fluo filename file with fluorescence
|
|
45 -gc filename file with gc-islands
|
|
46
|
|
47 -long for each gene take the longest isoform
|
|
48 -HMmode 1/0 for histone modification type H3K27me3: look for overlap with TSS or gene body (default 0)
|
|
49
|
|
50 };
|
|
51
|
|
52 if(scalar(@ARGV) == 0){
|
|
53 print $usage;
|
|
54 exit(0);
|
|
55 }
|
|
56
|
|
57 ## mandatory arguments
|
|
58
|
|
59 my $RNApolFilename = "";
|
|
60 my $H3K36Me3polFilename = "";
|
|
61 my $H3K9Me3polFilename = "";
|
|
62 my $TF1Filename = "";
|
|
63 my $TF2Filename = "";
|
|
64 my $GenesFilename = "";
|
|
65 my $MirFilename = "";
|
|
66 my $TF1FilenameALL = "";
|
|
67 my $TF2FilenameALL = "";
|
|
68 my $SelectedGenesFilename = "";
|
|
69 my $fluoFile = "";
|
|
70 my $initialTable = "";
|
|
71 my $colomnesToAdd = "";
|
|
72
|
|
73 my $enhLeft = -30000;
|
|
74 my $longEnhLeft = -60000;
|
|
75 my $enhRight = -1500;
|
|
76 my $immediateDownstream = 2000;
|
|
77 my $K9dist = 5000;
|
|
78 my $kb5 = 5000;
|
|
79 my $INFINITY = 10000000000;
|
|
80 my $jonctionSize = 50;
|
|
81 ## optional arguments
|
|
82 my $outname = "genes.output.txt";
|
|
83 my $verbose = 0;
|
|
84 my $GCislands = "";
|
|
85
|
|
86 my $longest = 0;
|
|
87
|
|
88 #my $cutoff_tf1 = 0;
|
|
89 #my $cutoff_tf2 = 0;
|
|
90 my $cutoff_tf1All = 0;
|
|
91 my $cutoff_tf2All = 0;
|
|
92 my $cutoff_rp = 0;
|
|
93 my $cutoff_k9 = 0;
|
|
94 my $cutoff_k36 = 0;
|
|
95 my $ifTFcoord = 0;
|
|
96
|
|
97 my $ifHMmode = 0;
|
|
98
|
|
99 ## parse command line arguments
|
|
100
|
|
101 while(scalar(@ARGV) > 0){
|
|
102 my $this_arg = shift @ARGV;
|
|
103 if ( $this_arg eq '-h') {print "$usage\n"; exit; }
|
|
104
|
|
105 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;}
|
|
106
|
|
107 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;}
|
|
108 elsif ( $this_arg eq '-rp') {$RNApolFilename = shift @ARGV;}
|
|
109 elsif ( $this_arg eq '-k36') {$H3K36Me3polFilename = shift @ARGV;}
|
|
110 elsif ( $this_arg eq '-k9') {$H3K9Me3polFilename = shift @ARGV;}
|
|
111
|
|
112 elsif ( $this_arg eq '-tf') {$TF1Filename = shift @ARGV;}
|
|
113
|
|
114 elsif ( $this_arg eq '-v') {$verbose = 1;}
|
|
115
|
|
116 elsif ( $this_arg eq '-long') {$longest = 1;}
|
|
117
|
|
118
|
|
119 elsif ( $this_arg eq '-o') {$outname = shift @ARGV;}
|
|
120 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;}
|
|
121
|
|
122 elsif ( $this_arg eq '-c_rp') {$cutoff_rp = shift @ARGV;}
|
|
123 elsif ( $this_arg eq '-c_k9') {$cutoff_k9 = shift @ARGV;}
|
|
124 elsif ( $this_arg eq '-c_k36') {$cutoff_k36 = shift @ARGV;}
|
|
125
|
|
126 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;}
|
|
127 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;}
|
|
128
|
|
129 elsif ( $this_arg eq '-i') {$initialTable = shift @ARGV;}
|
|
130 elsif ( $this_arg eq '-add') {$colomnesToAdd = shift @ARGV;}
|
|
131
|
|
132 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;}
|
|
133 elsif ( $this_arg eq '-rightp') {$immediateDownstream = shift @ARGV;}
|
|
134 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;}
|
|
135 elsif ( $this_arg eq '-dg') {$kb5 = shift @ARGV;}
|
|
136 elsif ( $this_arg eq '-HMmode') {$ifHMmode = shift @ARGV; if ($ifHMmode eq "no" || $ifHMmode eq "0") {$ifHMmode = 0;} else {$ifHMmode=1;}}
|
|
137
|
|
138
|
|
139
|
|
140 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";}
|
|
141 }
|
|
142
|
|
143
|
|
144 if ( $GenesFilename eq "" ){
|
|
145 die "you should specify a file with genes \n";
|
|
146 }
|
|
147 if(( $RNApolFilename eq "")&&($H3K36Me3polFilename eq "")&&($TF1Filename eq "")&&($H3K9Me3polFilename eq "")){
|
|
148 die "you should specify at least one file with peaks\n";
|
|
149 }
|
|
150
|
|
151
|
|
152 #-----------read selected genes----------------
|
|
153 my %selectedGenes;
|
|
154 my %selectedGenesFoldChange;
|
|
155 if ( $SelectedGenesFilename ne "") {
|
|
156 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!";
|
|
157 while (<FILE>) {
|
|
158 s/\R//g;
|
|
159 my @a = split/\t/;
|
|
160 $selectedGenes{$a[1]} = $a[3];
|
|
161 $selectedGenesFoldChange{$a[1]} = $a[2];
|
|
162 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n";
|
|
163 }
|
|
164
|
|
165 close FILE;
|
|
166 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose);
|
|
167 }
|
|
168
|
|
169 #-----------read genes with fluorescence---------
|
|
170 my %fluoGenes;
|
|
171 if ( $fluoFile ne "") {
|
|
172 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!";
|
|
173 my $gene = "";
|
|
174 my $med = 0;
|
|
175 my %h;
|
|
176 while (<FILE>) {
|
|
177 s/\R//g;
|
|
178 my @a = split/\t/;
|
|
179
|
|
180 next if (scalar(@a)<5);
|
|
181 next if ($a[0] eq "probesets");
|
|
182 next unless ($a[0] =~m/\S/);
|
|
183 next unless ($a[4] =~m/\S/);
|
|
184 if ($gene ne "" && $gene ne $a[2]) {
|
|
185 #calcMed
|
|
186 $med = med(keys %h);
|
|
187 $fluoGenes{$gene} = $med;
|
|
188 $med=0;
|
|
189 delete @h{keys %h};
|
|
190 } else {
|
|
191 #$h{$a[4]} = 1;
|
|
192 }
|
|
193 $gene = $a[2];
|
|
194 $h{$a[4]} = 1;
|
|
195 #print "keys ", scalar(keys %h),"\t",keys %h,"\n";
|
|
196 }
|
|
197 if ($gene ne "") {
|
|
198 $med = med(keys %h);
|
|
199 $fluoGenes{$gene} = $med;
|
|
200 }
|
|
201 close FILE;
|
|
202 print "\t\t$fluoFile is read!\n" if ($verbose);;
|
|
203 }
|
|
204 #-----------read GC-islands----------------
|
|
205 my %GCislands;
|
|
206 if ($GCislands ne "") {
|
|
207 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!";
|
|
208
|
|
209 while (<FILE>) {
|
|
210 s/\R//g;
|
|
211 my @a = split/\t/;
|
|
212 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp
|
|
213 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09
|
|
214 my $chr = $a[1];
|
|
215 my $start = $a[2];
|
|
216 my $end = $a[3];
|
|
217 $GCislands{$chr}->{$start}=$end;
|
|
218 }
|
|
219 close FILE;
|
|
220 if ($verbose) {
|
|
221 print "$GCislands is read\n";
|
|
222 }
|
|
223 } elsif ($verbose) {
|
|
224 print "you did not specify a file with GC-islands\n";
|
|
225 }
|
|
226
|
|
227 #-----------read genes----------------
|
|
228
|
|
229 my %genes;
|
|
230
|
|
231 my $count = 0;
|
|
232
|
|
233 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!";
|
|
234 <GENES>;
|
|
235 while (<GENES>) {
|
|
236 s/\R//g;
|
|
237 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){
|
|
238 my $name = $10;
|
|
239 my $chr = $1;
|
|
240 my $strand = $2;
|
|
241 if ($strand eq '+') {
|
|
242 $strand = 1;
|
|
243 }
|
|
244 else {
|
|
245 $strand = -1;
|
|
246 }
|
|
247 my $leftPos = $3;
|
|
248 my $rightPos = $4;
|
|
249 my $cdsStart= $5;
|
|
250 my $cdsEnd= $6;
|
|
251 my $exonCount= $7;
|
|
252 my $exonStarts= $8;
|
|
253 my $exonEnds= $9;
|
|
254 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count";
|
|
255 my $foldChange = 1;
|
|
256 my $reg = "NA";
|
|
257 my $fluo = "NA";
|
|
258 if ( $SelectedGenesFilename ne "") {
|
|
259 #print "$name\t";
|
|
260 if (exists($selectedGenes{$name})) {
|
|
261 $reg = $selectedGenes{$name};
|
|
262 $foldChange = $selectedGenesFoldChange{$name};
|
|
263 }
|
|
264 }
|
|
265 if ( $fluoFile ne "") {
|
|
266 if (exists($fluoGenes{$name})) {
|
|
267 $fluo = $fluoGenes{$name};
|
|
268 }
|
|
269 }
|
|
270 unless (exists($genes{$chr})) {
|
|
271 my %h;
|
|
272 $genes{$chr} = \%h;
|
|
273 }
|
|
274
|
|
275 my $RNAlength = 0;
|
|
276 my $skip = 0;
|
|
277
|
|
278 #print "$ID\n";
|
|
279 if($longest) {
|
|
280 $RNAlength = getRNAlength($exonStarts,$exonEnds);
|
|
281 for my $IDgene (keys %{$genes{$chr}}) {
|
|
282 my $nameGene= (split('\t', $IDgene))[0];
|
|
283 if ($nameGene eq $name && $RNAlength > $genes{$chr}->{$IDgene}{'RNAlength'}) {
|
|
284 #print "found longer isofome: $ID longer than $IDgene\n";
|
|
285 # print "$RNAlength > ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n";
|
|
286
|
|
287 $ID=$IDgene;
|
|
288 } elsif ($nameGene eq $name && $RNAlength <= $genes{$chr}->{$IDgene}{'RNAlength'}) {
|
|
289 #print "found shorter isofome: $ID shorted than $IDgene\nwill skip it\n";
|
|
290 #print "$RNAlength <= ".$genes{$chr}->{$IDgene}{'RNAlength'}."\n";
|
|
291 $skip = 1;
|
|
292 }
|
|
293 }
|
|
294 }
|
|
295
|
|
296
|
|
297 unless ($skip) {
|
|
298
|
|
299 unless (exists($genes{$chr}->{$ID})) {
|
|
300 my %h1;
|
|
301 $genes{$chr}->{$ID} = \%h1;
|
|
302 $count++;
|
|
303 }
|
|
304
|
|
305 $genes{$chr}->{$ID}{'name'} = $name ;
|
|
306 $genes{$chr}->{$ID}{'left'} = $leftPos ;
|
|
307 $genes{$chr}->{$ID}{'right'} = $rightPos ;
|
|
308 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart;
|
|
309 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd;
|
|
310 $genes{$chr}->{$ID}{'strand'} = $strand;
|
|
311 $genes{$chr}->{$ID}{'exonCount'} = $exonCount;
|
|
312 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts;
|
|
313 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds;
|
|
314 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
|
|
315 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
|
|
316 $genes{$chr}->{$ID}{'reg'} = $reg;
|
|
317 $genes{$chr}->{$ID}{'foldChange'} = $foldChange;
|
|
318 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
|
|
319 $genes{$chr}->{$ID}{'RNAlength'} = $RNAlength ;
|
|
320
|
|
321 $genes{$chr}->{$ID}{'fluo'} = $fluo;
|
|
322
|
|
323 $genes{$chr}->{$ID}{'RNApolScore'} = 0;
|
|
324 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY;
|
|
325 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0;
|
|
326 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY;
|
|
327
|
|
328 $genes{$chr}->{$ID}{'K36score'} = 0;
|
|
329 $genes{$chr}->{$ID}{'K9promScore'} = 0;
|
|
330 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY;
|
|
331 $genes{$chr}->{$ID}{'K9largeScore'} = 0;
|
|
332 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY;
|
|
333 $genes{$chr}->{$ID}{'TFpromScore'} = 0;
|
|
334 $genes{$chr}->{$ID}{'TFpromDist'} = $INFINITY;
|
|
335 $genes{$chr}->{$ID}{'TFenhScore'} = 0;
|
|
336 $genes{$chr}->{$ID}{'TFenhDist'} = $INFINITY;
|
|
337 $genes{$chr}->{$ID}{'TFintraScore'} = 0;
|
|
338 $genes{$chr}->{$ID}{'TFintraDist'} = $INFINITY;
|
|
339 $genes{$chr}->{$ID}{'TFallScore'} = 0;
|
|
340 $genes{$chr}->{$ID}{'TFallDist'} = $INFINITY;
|
|
341
|
|
342
|
|
343 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0;
|
|
344 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY;
|
|
345 $genes{$chr}->{$ID}{'TFFirstIntronScore'} = 0;
|
|
346 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $INFINITY;
|
|
347 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0;
|
|
348 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY;
|
|
349 $genes{$chr}->{$ID}{'TFImmDownScore'} = 0;
|
|
350 $genes{$chr}->{$ID}{'TFImmDownDist'} = $INFINITY;
|
|
351 $genes{$chr}->{$ID}{'TFpromSimpleScore'} = 0;
|
|
352 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $INFINITY;
|
|
353 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0;
|
|
354 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY;
|
|
355
|
|
356 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0;
|
|
357 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY;
|
|
358 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0;
|
|
359 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY;
|
|
360
|
|
361 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0;
|
|
362 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY;
|
|
363 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0;
|
|
364 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY;
|
|
365
|
|
366 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0;
|
|
367 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY;
|
|
368 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0;
|
|
369 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY;
|
|
370
|
|
371
|
|
372 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0;
|
|
373 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY;
|
|
374 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0;
|
|
375 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY;
|
|
376
|
|
377 $genes{$chr}->{$ID}{'TF5kbDownScore'} = 0;
|
|
378 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $INFINITY;
|
|
379 $genes{$chr}->{$ID}{'K9enhScore'} = 0;
|
|
380 $genes{$chr}->{$ID}{'K9enhDist'} = $INFINITY;
|
|
381
|
|
382 $genes{$chr}->{$ID}{'K27TSS_Score'} = 0;
|
|
383 $genes{$chr}->{$ID}{'K27TSS_Dist'} = $INFINITY;
|
|
384 $genes{$chr}->{$ID}{'K27GeneB_Score'} = 0;
|
|
385 $genes{$chr}->{$ID}{'K27GeneB_Dist'} = $INFINITY;
|
|
386
|
|
387 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand);
|
|
388 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand);
|
|
389
|
|
390 $genes{$chr}->{$ID}{'exonCount'} = $exonCount;
|
|
391 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts;
|
|
392 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds;
|
|
393
|
|
394
|
|
395 $genes{$chr}->{$ID}{'GCisland'} = 0;
|
|
396 if ($GCislands ne "") {
|
|
397 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr});
|
|
398 }
|
|
399 }
|
|
400 }
|
|
401 }
|
|
402
|
|
403 print "Total genes (including isoforms) : $count\n" ;
|
|
404 close GENES;
|
|
405 print "\t\t$GenesFilename is read!\n" if ($verbose);;
|
|
406 #for my $gName (sort keys %{$genes{'chr18'}}) {
|
|
407
|
|
408 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n";
|
|
409 #}
|
|
410
|
|
411 #-----------read file with sites miRNA, store as genes-----
|
|
412
|
|
413 if ( $MirFilename eq ""){
|
|
414 print "you did not specify file with miRNA\n" if ($verbose);;
|
|
415 }
|
|
416 else {
|
|
417 $count = 0;
|
|
418 open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!";
|
|
419 #chr1 20669090 20669163 mmu-mir-206 960 +
|
|
420 while (<MIR>) {
|
|
421 s/\R//g;
|
|
422 my ($name, $chr, $leftPos, $rightPos, $strand );
|
|
423 #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206";
|
|
424 if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) {
|
|
425 $name = $5;
|
|
426 $chr = $1;
|
|
427 $leftPos = $2;
|
|
428 $rightPos = $3;
|
|
429 $strand = $4;
|
|
430 }
|
|
431 elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){
|
|
432 $name = $4;
|
|
433 $chr = $1;
|
|
434 $leftPos = $2;
|
|
435 $rightPos = $3;
|
|
436 $strand = $6;
|
|
437 } else {
|
|
438 next;
|
|
439 }
|
|
440
|
|
441 unless ($chr =~ m/chr/) {
|
|
442 $chr = "chr".$chr;
|
|
443 }
|
|
444 my $ID = "$name\t$chr:$leftPos-$rightPos\t$count";
|
|
445
|
|
446 if ($strand eq '+') {
|
|
447 $strand = 1;
|
|
448 }
|
|
449 else {
|
|
450 $strand = -1;
|
|
451 }
|
|
452
|
|
453 unless (exists($genes{$chr})) {
|
|
454 my %h;
|
|
455 $genes{$chr} = \%h;
|
|
456 }
|
|
457 unless (exists($genes{$chr}->{$ID})) {
|
|
458 my %h1;
|
|
459 $genes{$chr}->{$ID} = \%h1;
|
|
460 $count++;
|
|
461 }
|
|
462 $genes{$chr}->{$ID}{'name'} = $name ;
|
|
463 $genes{$chr}->{$ID}{'left'} = $leftPos ;
|
|
464 $genes{$chr}->{$ID}{'right'} = $rightPos ;
|
|
465 $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ;
|
|
466 $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ;
|
|
467 $genes{$chr}->{$ID}{'strand'} = $strand;
|
|
468 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos);
|
|
469 $genes{$chr}->{$ID}{'exonCount'} = 1;
|
|
470 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ;
|
|
471 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ;
|
|
472 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ;
|
|
473 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ;
|
|
474 $genes{$chr}->{$ID}{'reg'} = "miRNA";
|
|
475 $genes{$chr}->{$ID}{'foldChange'} = 1;
|
|
476
|
|
477 $genes{$chr}->{$ID}{'fluo'} = "N/A";
|
|
478
|
|
479
|
|
480 $genes{$chr}->{$ID}{'RNApolScore'} = 0;
|
|
481 $genes{$chr}->{$ID}{'RNApolDist'} = $INFINITY;
|
|
482 $genes{$chr}->{$ID}{'RNApol_junctionScore'} = 0;
|
|
483 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $INFINITY;
|
|
484 $genes{$chr}->{$ID}{'K36score'} = 0;
|
|
485 $genes{$chr}->{$ID}{'K9promScore'} = 0;
|
|
486 $genes{$chr}->{$ID}{'K9promDist'} = $INFINITY;
|
|
487 $genes{$chr}->{$ID}{'K9largeScore'} = 0;
|
|
488 $genes{$chr}->{$ID}{'K9largeDist'} = $INFINITY;
|
|
489 $genes{$chr}->{$ID}{'TFpromScore'} = 0;
|
|
490 $genes{$chr}->{$ID}{'TFpromDist'} = $INFINITY;
|
|
491 $genes{$chr}->{$ID}{'TFenhScore'} = 0;
|
|
492 $genes{$chr}->{$ID}{'TFenhDist'} = $INFINITY;
|
|
493 $genes{$chr}->{$ID}{'TFintraScore'} = 0;
|
|
494 $genes{$chr}->{$ID}{'TFintraDist'} = $INFINITY;
|
|
495 $genes{$chr}->{$ID}{'TFallScore'} = 0;
|
|
496 $genes{$chr}->{$ID}{'TFallDist'} = $INFINITY;
|
|
497
|
|
498 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'} = 0;
|
|
499 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $INFINITY;
|
|
500 $genes{$chr}->{$ID}{'TFFirstIntronScore'} = 0;
|
|
501 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $INFINITY;
|
|
502 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'} = 0;
|
|
503 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $INFINITY;
|
|
504 $genes{$chr}->{$ID}{'TFImmDownScore'} = 0;
|
|
505 $genes{$chr}->{$ID}{'TFImmDownDist'} = $INFINITY;
|
|
506 $genes{$chr}->{$ID}{'TFpromSimpleScore'} = 0;
|
|
507 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $INFINITY;
|
|
508
|
|
509 $genes{$chr}->{$ID}{'TF_FirstExonScore'} = 0;
|
|
510 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $INFINITY;
|
|
511 $genes{$chr}->{$ID}{'TF_OtherExonsScore'} = 0;
|
|
512 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $INFINITY;
|
|
513 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'} = 0;
|
|
514 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $INFINITY;
|
|
515 $genes{$chr}->{$ID}{'TF_junctionScore'} = 0;
|
|
516 $genes{$chr}->{$ID}{'TF_junctionDist'} = $INFINITY;
|
|
517 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'} = 0;
|
|
518 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $INFINITY;
|
|
519 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'} = 0;
|
|
520 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $INFINITY;
|
|
521
|
|
522 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'} = 0;
|
|
523 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $INFINITY;
|
|
524 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'} = 0;
|
|
525 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $INFINITY;
|
|
526
|
|
527 $genes{$chr}->{$ID}{'TFenh60kbScore'} = 0;
|
|
528 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $INFINITY;
|
|
529
|
|
530 $genes{$chr}->{$ID}{'TF5kbDownScore'} = 0;
|
|
531 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $INFINITY;
|
|
532 $genes{$chr}->{$ID}{'K9enhScore'} = 0;
|
|
533 $genes{$chr}->{$ID}{'K9enhDist'} = $INFINITY;
|
|
534
|
|
535 $genes{$chr}->{$ID}{'K27TSS_Score'} = 0;
|
|
536 $genes{$chr}->{$ID}{'K27TSS_Dist'} = $INFINITY;
|
|
537 $genes{$chr}->{$ID}{'K27GeneB_Score'} = 0;
|
|
538 $genes{$chr}->{$ID}{'K27GeneB_Dist'} = $INFINITY;
|
|
539
|
|
540 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0);
|
|
541 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0);
|
|
542
|
|
543 $genes{$chr}->{$ID}{'GCisland'} = 0;
|
|
544
|
|
545 $genes{$chr}->{$ID}{'exonCount'} = 1;
|
|
546 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ;
|
|
547 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ;
|
|
548
|
|
549
|
|
550 }
|
|
551
|
|
552
|
|
553 close MIR;
|
|
554 print "\t\t$MirFilename is read!\n" if ($verbose) ;
|
|
555 print "$count miRNA\n" if ($verbose);;
|
|
556 }
|
|
557
|
|
558 #-----------read file with sites of TF1-----
|
|
559 my $numberOfAllSites = 0;
|
|
560
|
|
561 if ($TF1Filename eq "") {
|
|
562 print "No file with peaks of TF1!\n" if ($verbose) ;
|
|
563 } else {
|
|
564 open (FILE, "<$TF1Filename ") or die "Cannot open file $TF1Filename !!!!: $!";
|
|
565 $_ = <FILE>;
|
|
566 my $correction = 0;
|
|
567 my @a = split /\t/;
|
|
568 if ( $a[1] =~ m/chr/ ) {
|
|
569 $correction = 1;
|
|
570 }
|
|
571
|
|
572 while (<FILE>) {
|
|
573 s/\R//g;
|
|
574
|
|
575 my @a = split /\t/;
|
|
576
|
|
577 my $chr = $a[0+$correction];
|
|
578 my $firstPos = $a[1+$correction];
|
|
579 my $LastPos = $a[2+$correction];
|
|
580
|
|
581 my $maxPos = int(($firstPos+$LastPos)/2);
|
|
582 if (scalar(@a)>(3+$correction)) {
|
|
583 $maxPos = $a[3+$correction];
|
|
584 }
|
|
585 if ($maxPos=~/\D/) {
|
|
586 $maxPos = int(($firstPos+$LastPos)/2);
|
|
587 }
|
|
588 my $score = 0;
|
|
589 if (scalar(@a)>(4+$correction)) {
|
|
590 $score = $a[4+$correction];
|
|
591 }
|
|
592 for my $ID (keys %{$genes{$chr}}) {
|
|
593
|
|
594 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
|
|
595 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
|
|
596
|
|
597 my $hmDist;
|
|
598 my $hmCorDist = -1;
|
|
599 if ($ifHMmode) {
|
|
600 if ($genes{$chr}->{$ID}{'strand'}==1) {
|
|
601 if ($LastPos <= $genes{$chr}->{$ID}{'TSS'}) {
|
|
602 $hmDist = $LastPos-$genes{$chr}->{$ID}{'TSS'};
|
|
603 } elsif ($firstPos >= $genes{$chr}->{$ID}{'TSS'}) {
|
|
604 $hmDist = $firstPos - $genes{$chr}->{$ID}{'TSS'};
|
|
605 } else {$hmDist = 0;}
|
|
606
|
|
607 if ($hmDist >= 0 && $genes{$chr}->{$ID}{'TE'} >= $firstPos) {
|
|
608 $hmCorDist = 0;
|
|
609 }
|
|
610
|
|
611 } else {
|
|
612
|
|
613 if ($LastPos <= $genes{$chr}->{$ID}{'TSS'}) {
|
|
614 $hmDist = $genes{$chr}->{$ID}{'TSS'}- $LastPos;
|
|
615 } elsif ($firstPos >= $genes{$chr}->{$ID}{'TSS'}) {
|
|
616 $hmDist = $genes{$chr}->{$ID}{'TSS'}-$firstPos;
|
|
617 } else {$hmDist = 0;}
|
|
618
|
|
619 if ($hmDist >= 0 && $genes{$chr}->{$ID}{'TE'} <= $LastPos) {
|
|
620 $hmCorDist = 0;
|
|
621 }
|
|
622 }
|
|
623
|
|
624
|
|
625 if (($hmDist>= $enhRight)&&($hmDist<=$immediateDownstream)) {
|
|
626 if ($genes{$chr}->{$ID}{'K27TSS_Score'}<$score) {
|
|
627 $genes{$chr}->{$ID}{'K27TSS_Score'}=$score;
|
|
628 $genes{$chr}->{$ID}{'K27TSS_Dist'} = $hmDist;
|
|
629 }
|
|
630 }
|
|
631
|
|
632 if ($hmDist>$immediateDownstream && $hmCorDist == 0) {
|
|
633 if ($genes{$chr}->{$ID}{'K27GeneB_Score'}<$score) {
|
|
634 $genes{$chr}->{$ID}{'K27GeneB_Score'}=$score;
|
|
635 $genes{$chr}->{$ID}{'K27GeneB_Dist'} = $hmDist;
|
|
636 }
|
|
637 }
|
|
638
|
|
639 }
|
|
640
|
|
641
|
|
642
|
|
643 if (($distTSS>= $enhLeft)&&($distTSS<$enhRight)) {
|
|
644 if ($genes{$chr}->{$ID}{'TFenhScore'}<$score) {
|
|
645 $genes{$chr}->{$ID}{'TFenhScore'}=$score;
|
|
646 $genes{$chr}->{$ID}{'TFenhDist'} = $distTSS;
|
|
647 }
|
|
648 } elsif (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
|
|
649 if ($genes{$chr}->{$ID}{'TFpromScore'}<$score) {
|
|
650 $genes{$chr}->{$ID}{'TFpromScore'}=$score;
|
|
651 $genes{$chr}->{$ID}{'TFpromDist'} = $distTSS;
|
|
652 }
|
|
653 } elsif (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
654 if ($genes{$chr}->{$ID}{'TFintraScore'}<$score) {
|
|
655 $genes{$chr}->{$ID}{'TFintraScore'}=$score;
|
|
656 $genes{$chr}->{$ID}{'TFintraDist'} = $distTSS;
|
|
657 }
|
|
658 }
|
|
659 if (($distTSS>= $enhLeft)&&($distTE<=$kb5)) {
|
|
660 if ($genes{$chr}->{$ID}{'TFallScore'}<$score) {
|
|
661 $genes{$chr}->{$ID}{'TFallScore'}=$score;
|
|
662 $genes{$chr}->{$ID}{'TFallDist'} = $distTSS;
|
|
663 }
|
|
664 }
|
|
665
|
|
666 if (($distTSS>= 0)&&($distTSS<=$immediateDownstream)) {
|
|
667 if ($genes{$chr}->{$ID}{'TFImmDownScore'}<$score) {
|
|
668 $genes{$chr}->{$ID}{'TFImmDownScore'}=$score;
|
|
669 $genes{$chr}->{$ID}{'TFImmDownDist'} = $distTSS;
|
|
670 }
|
|
671 }
|
|
672 if (($distTSS<= 0)&&($distTSS>=$enhRight)) {
|
|
673 if ($genes{$chr}->{$ID}{'TFpromSimpleScore'}<$score) {
|
|
674 $genes{$chr}->{$ID}{'TFpromSimpleScore'}=$score;
|
|
675 $genes{$chr}->{$ID}{'TFpromSimpleDist'} = $distTSS;
|
|
676 }
|
|
677 }
|
|
678
|
|
679 my ($firstIntronStart,$firstIntronEnd)=($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'});
|
|
680 ($firstIntronStart,$firstIntronEnd)= ($firstIntronEnd,$firstIntronStart) if ($firstIntronStart>$firstIntronEnd) ;
|
|
681
|
|
682 my ($firstExonStart,$firstExonEnd) = ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) ;
|
|
683 ($firstExonStart,$firstExonEnd)= ($firstExonEnd,$firstExonStart) if ($firstExonStart>$firstExonEnd) ;
|
|
684
|
|
685 if ($maxPos>=$firstIntronStart && $maxPos <= $firstIntronEnd) {
|
|
686 if ($genes{$chr}->{$ID}{'TFFirstIntronScore'}<$score) {
|
|
687 $genes{$chr}->{$ID}{'TFFirstIntronScore'}=$score;
|
|
688 $genes{$chr}->{$ID}{'TFFirstIntronDist'} = $distTSS;
|
|
689 }
|
|
690
|
|
691 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
692 if ($genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}<$score) {
|
|
693 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}=$score;
|
|
694 $genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'} = $distTSS;
|
|
695 }
|
|
696 }
|
|
697 } elsif (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
698 if ($genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}<$score) {
|
|
699 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}=$score;
|
|
700 $genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'} = $distTSS;
|
|
701 }
|
|
702 }
|
|
703 if (($distTSS>= $longEnhLeft)&&($distTSS<$enhRight)) {
|
|
704 if ($genes{$chr}->{$ID}{'TFenh60kbScore'}<$score) {
|
|
705 $genes{$chr}->{$ID}{'TFenh60kbScore'}=$score;
|
|
706 $genes{$chr}->{$ID}{'TFenh60kbDist'} = $distTSS;
|
|
707 }
|
|
708 }
|
|
709 if (($distTE>=0)&&($distTE<=$kb5)) {
|
|
710 if ($genes{$chr}->{$ID}{'TF5kbDownScore'}<$score) {
|
|
711 $genes{$chr}->{$ID}{'TF5kbDownScore'}=$score;
|
|
712 $genes{$chr}->{$ID}{'TF5kbDownDist'} = $distTSS;
|
|
713 }
|
|
714 }
|
|
715 if ($distTSS>=0 && $distTE<=0) {
|
|
716 my $typeIntra = &getTypeIntra($genes{$chr}->{$ID},$maxPos);
|
|
717 if ($typeIntra eq "f_exon") {
|
|
718 if ($genes{$chr}->{$ID}{'TF_FirstExonScore'}<$score) {
|
|
719 $genes{$chr}->{$ID}{'TF_FirstExonScore'}=$score;
|
|
720 $genes{$chr}->{$ID}{'TF_FirstExonDist'} = $distTSS;
|
|
721 }
|
|
722
|
|
723 if ($genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}<$score && ($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
724 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}=$score;
|
|
725 $genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'} = $distTSS;
|
|
726 }
|
|
727
|
|
728 } else {
|
|
729 if ($typeIntra eq "exon") {
|
|
730 if ($genes{$chr}->{$ID}{'TF_OtherExonsScore'}<$score) {
|
|
731 $genes{$chr}->{$ID}{'TF_OtherExonsScore'}=$score;
|
|
732 $genes{$chr}->{$ID}{'TF_OtherExonsDist'} = $distTSS;
|
|
733 }
|
|
734
|
|
735 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
736 if ($genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}<$score) {
|
|
737 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}=$score;
|
|
738 $genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'} = $distTSS;
|
|
739 }
|
|
740 }
|
|
741
|
|
742 } elsif ($typeIntra eq "intron") {
|
|
743 if ($genes{$chr}->{$ID}{'TF_OtherIntronsScore'}<$score) {
|
|
744 $genes{$chr}->{$ID}{'TF_OtherIntronsScore'}=$score;
|
|
745 $genes{$chr}->{$ID}{'TF_OtherIntronsDist'} = $distTSS;
|
|
746 }
|
|
747
|
|
748 if (($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
749 if ($genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}<$score) {
|
|
750 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}=$score;
|
|
751 $genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'} = $distTSS;
|
|
752 }
|
|
753 }
|
|
754 } elsif ($typeIntra eq "jonction") {
|
|
755 if ($genes{$chr}->{$ID}{'TF_junctionScore'}<$score) {
|
|
756 $genes{$chr}->{$ID}{'TF_junctionScore'}=$score;
|
|
757 $genes{$chr}->{$ID}{'TF_junctionDist'} = $distTSS;
|
|
758 }
|
|
759
|
|
760 if ($genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}<$score && ($distTSS >= $immediateDownstream)&&($distTE<=0)) {
|
|
761 $genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}=$score;
|
|
762 $genes{$chr}->{$ID}{'TF_junctionAndIntraDist'} = $distTSS;
|
|
763 }
|
|
764
|
|
765 }
|
|
766
|
|
767 }
|
|
768 }
|
|
769
|
|
770
|
|
771 }
|
|
772 $numberOfAllSites++;
|
|
773 }
|
|
774
|
|
775 close FILE;
|
|
776 print "\t$TF1Filename is read!\n" if ($verbose) ;
|
|
777 print "$numberOfAllSites sites\n" ;
|
|
778 }
|
|
779
|
|
780 #-----------read file with sites RNApolII-----
|
|
781 $numberOfAllSites = 0;
|
|
782
|
|
783 if ($RNApolFilename eq "") {
|
|
784 print "No file with peaks of RNA pol II!\n" if ($verbose) ;
|
|
785 } else {
|
|
786 open (FILE, "<$RNApolFilename ") or die "Cannot open file $RNApolFilename !!!!: $!";
|
|
787 $_ = <FILE>;
|
|
788 my @a = split /\t/;
|
|
789 my $correction = 0;
|
|
790
|
|
791 if ($a[0]=~m/chr/) {
|
|
792 $correction = -1;
|
|
793 }
|
|
794 #seek (FILE, 0, 0);
|
|
795 while (<FILE>) {
|
|
796 s/\R//g;
|
|
797
|
|
798 my @a = split /\t/;
|
|
799
|
|
800 my $chr = $a[1+$correction];
|
|
801 my $firstPos = $a[2+$correction];
|
|
802 my $LastPos = $a[3+$correction];
|
|
803 my $maxPos = $a[4+$correction];
|
|
804 my $score = $a[5+$correction];
|
|
805 #print "$numberOfAllSites: $score\n";
|
|
806 next if ($score < $cutoff_rp);
|
|
807
|
|
808 for my $ID (keys %{$genes{$chr}}) {
|
|
809
|
|
810 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
|
|
811 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
|
|
812
|
|
813 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
|
|
814 if ($genes{$chr}->{$ID}{'RNApolScore'}<$score) {
|
|
815 $genes{$chr}->{$ID}{'RNApolScore'}=$score;
|
|
816 $genes{$chr}->{$ID}{'RNApolDist'} = $distTSS;
|
|
817 }
|
|
818 }
|
|
819 if ($distTSS>=0 && $distTE<=0) {
|
|
820 my $typeIntra = &getTypeIntra($genes{$chr}->{$ID},$maxPos);
|
|
821 if ($typeIntra eq "jonction") {
|
|
822 if ($genes{$chr}->{$ID}{'RNApol_junctionScore'}<$score) {
|
|
823 $genes{$chr}->{$ID}{'RNApol_junctionScore'}=$score;
|
|
824 $genes{$chr}->{$ID}{'RNApol_junctionDist'} = $distTSS;
|
|
825 }
|
|
826 }
|
|
827 }
|
|
828 }
|
|
829
|
|
830 $numberOfAllSites++;
|
|
831 }
|
|
832 close FILE;
|
|
833 print "\t$RNApolFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ;
|
|
834
|
|
835 }
|
|
836
|
|
837 #-----------read file with sites K36me3-----
|
|
838 $numberOfAllSites = 0;
|
|
839 my @K36Score;
|
|
840
|
|
841 if ($H3K36Me3polFilename eq "") {
|
|
842 print "No file with peaks of H3K36me3!\n" if ($verbose) ;
|
|
843 } else {
|
|
844 open (FILE, "<$H3K36Me3polFilename ") or die "Cannot open file $H3K36Me3polFilename !!!!: $!";
|
|
845
|
|
846 $_ = <FILE>;
|
|
847 my @a = split /\t/;
|
|
848 my $correction = 0;
|
|
849
|
|
850 if ($a[0]=~m/chr/) {
|
|
851 $correction = -1;
|
|
852 }
|
|
853
|
|
854 while (<FILE>) {
|
|
855 s/\R//g;
|
|
856
|
|
857 my @a = split /\t/;
|
|
858
|
|
859 my $chr = $a[1+$correction];
|
|
860 my $firstPos = $a[2+$correction];
|
|
861 my $lastPos = $a[3+$correction];
|
|
862 my $maxPos = int(($firstPos+$lastPos)/2);
|
|
863 if (scalar(@a)>(4+$correction)) {
|
|
864 $maxPos = $a[4+$correction];
|
|
865 }
|
|
866 if ($maxPos =~ m/\D/) {
|
|
867 $maxPos = int(($firstPos+$lastPos)/2);
|
|
868 }
|
|
869 my $score = 0;
|
|
870 if (scalar(@a)>(5+$correction)) {
|
|
871 $score = $a[5+$correction];
|
|
872 }
|
|
873 for my $ID (keys %{$genes{$chr}}) {
|
|
874
|
|
875 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
|
|
876 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
|
|
877
|
|
878 my $scoreToadd = 0;
|
|
879
|
|
880 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($lastPos<=$genes{$chr}->{$ID}{'right'})) {
|
|
881 $scoreToadd = $score/2.*($lastPos-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
|
|
882 }
|
|
883 if (($firstPos>=$genes{$chr}->{$ID}{'left'})&&($firstPos<$genes{$chr}->{$ID}{'right'})&&($lastPos>$genes{$chr}->{$ID}{'right'})) {
|
|
884 $scoreToadd = $score/2.*($genes{$chr}->{$ID}{'right'}-$firstPos+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
|
|
885 }
|
|
886 if (($firstPos<$genes{$chr}->{$ID}{'left'})&&($lastPos>$genes{$chr}->{$ID}{'left'})&&($lastPos<=$genes{$chr}->{$ID}{'right'})) {
|
|
887 $scoreToadd = $score/2.*($lastPos-$genes{$chr}->{$ID}{'left'}+1)/($genes{$chr}->{$ID}{'right'}-$genes{$chr}->{$ID}{'left'}+1);
|
|
888 }
|
|
889 if (($firstPos<$genes{$chr}->{$ID}{'left'})&&($lastPos>$genes{$chr}->{$ID}{'right'})) {
|
|
890 my $scoreToadd = $score/2.*($lastPos-$firstPos+1);
|
|
891 }
|
|
892 $genes{$chr}->{$ID}{'K36score'} += $scoreToadd;
|
|
893 }
|
|
894 $numberOfAllSites++;
|
|
895 }
|
|
896 close FILE;
|
|
897 print "\t$H3K36Me3polFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ;
|
|
898 }
|
|
899
|
|
900 #-----------read file with sites H3K9me3-----
|
|
901 $numberOfAllSites = 0;
|
|
902
|
|
903
|
|
904 if ($H3K9Me3polFilename eq "") {
|
|
905 print "No file with peaks of H3K9me3!\n" if ($verbose) ;
|
|
906 } else {
|
|
907 open (FILE, "<$H3K9Me3polFilename ") or die "Cannot open file $H3K9Me3polFilename !!!!: $!";
|
|
908
|
|
909 $_ = <FILE>;
|
|
910 my @a = split /\t/;
|
|
911 my $correction = 0;
|
|
912
|
|
913 if ($a[0]=~m/chr/) {
|
|
914 $correction = -1;
|
|
915 }
|
|
916
|
|
917 while (<FILE>) {
|
|
918 s/\R//g;
|
|
919 my @a = split /\t/;
|
|
920 my $chr = $a[1+$correction];
|
|
921 my $firstPos = $a[2+$correction];
|
|
922 my $LastPos = $a[3+$correction];
|
|
923 my $maxPos = $a[4+$correction];
|
|
924 my $score = $a[5+$correction];
|
|
925
|
|
926 next if ($score < $cutoff_k9);
|
|
927
|
|
928 for my $ID (keys %{$genes{$chr}}) {
|
|
929
|
|
930 my $distTSS = ($maxPos - $genes{$chr}->{$ID}{'TSS'})*$genes{$chr}->{$ID}{'strand'};
|
|
931 my $distTE = ($maxPos - $genes{$chr}->{$ID}{'TE'})*$genes{$chr}->{$ID}{'strand'};
|
|
932
|
|
933 if (($distTSS>= $enhRight)&&($distTSS<=$immediateDownstream)) {
|
|
934 if ($genes{$chr}->{$ID}{'K9promScore'}<$score) {
|
|
935 $genes{$chr}->{$ID}{'K9promScore'}=$score;
|
|
936 $genes{$chr}->{$ID}{'K9promDist'} = $distTSS;
|
|
937 }
|
|
938 } elsif (($distTSS >= -$K9dist)&&($distTSS<=$K9dist)) {
|
|
939 if ($genes{$chr}->{$ID}{'K9largeScore'}<$score) {
|
|
940 $genes{$chr}->{$ID}{'K9largeScore'}=$score;
|
|
941 $genes{$chr}->{$ID}{'K9largeDist'} = $distTSS;
|
|
942 }
|
|
943 }
|
|
944 if (($distTSS>= $enhLeft)&&($distTSS<$enhRight)) {
|
|
945 if ($genes{$chr}->{$ID}{'K9enhScore'}<$score) {
|
|
946 $genes{$chr}->{$ID}{'K9enhScore'}=$score;
|
|
947 $genes{$chr}->{$ID}{'K9enhDist'} = $distTSS;
|
|
948 }
|
|
949 }
|
|
950 }
|
|
951 $numberOfAllSites++;
|
|
952 }
|
|
953 close FILE;
|
|
954 print "\t$H3K9Me3polFilename is read!\n$numberOfAllSites sites\n" if ($verbose) ;
|
|
955 }
|
|
956
|
|
957 #-----------output all-----
|
|
958 #unless($initialTable eq "") {}
|
|
959
|
|
960
|
|
961 open (OUT , ">$outname") or die "Cannot open file $outname!!!!: $!";
|
|
962
|
|
963 print OUT "name\tchr\tstart\tend\tstrand\tReg\tfoldChange\t";
|
|
964
|
|
965 if ($GCislands ne "") {
|
|
966 print OUT "GC-island\t";
|
|
967 }
|
|
968
|
|
969 if ( $fluoFile ne "") {
|
|
970 print OUT "fluorescence\t";
|
|
971 }
|
|
972
|
|
973 if ($RNApolFilename ne "") {
|
|
974 print OUT "RNApolII_score\tRNApolII_distTSS\tRNApol_junctionScore\tRNApol_junctionDist\t";
|
|
975 }
|
|
976 if ($H3K36Me3polFilename ne "") {
|
|
977 print OUT "H3K36me3_score\t";
|
|
978 }
|
|
979 if ($H3K9Me3polFilename ne "") {
|
|
980 print OUT "H3K9me3_score_prom\tH3K9me3_distTSS_prom\tH3K9me3_score_large\tH3K9me3_distTSS_large\tH3K9me3_score_enh\tH3K9me3_distTSS_enh\t";
|
|
981 }
|
|
982 if ($TF1Filename ne "") {
|
|
983 print OUT "TF_score_Gene\tTF_distTSS_Gene\t";
|
|
984 print OUT "TF_score_Promoter\tTF_distTSS_Promoter\t";
|
|
985 print OUT "TF_score_ImmDown\tTF_distTSS_ImmDown\t";
|
|
986 print OUT "TF_score_PromoterORImmDown\tTF_distTSS_PromoterORImmDown\t";
|
|
987 print OUT "TF_score_Enhancer\tTF_distTSS_Enhancer\t";
|
|
988 print OUT "TF_score_Intragenic\tTF_distTSS_Intragenic\t";
|
|
989 print OUT "TF_score_GeneDownstream\tTF_distTSS_GeneDownstream\t";
|
|
990 print OUT "TF_score_FirstExon\tTF_distTSS_FirstExon\t";
|
|
991 print OUT "TF_score_FisrtIntron\tTF_distTSS_FisrtIntron\t";
|
|
992 print OUT "TF_score_FirstExonAND>$immediateDownstream\tTF_distTSS_FirstExonAND>$immediateDownstream\t";
|
|
993 print OUT "TF_score_FisrtIntronAND>$immediateDownstream\tTF_distTSS_FisrtIntronAND>$immediateDownstream\t";
|
|
994 #print OUT "TF_score_IntraMinusFisrtIntron\tTF_distTSS_IntraMinusFisrtIntron\t";
|
|
995
|
|
996 #print OUT "TF_score_enh60kb\tTF_distTSS_enh60kb\t";
|
|
997
|
|
998 print OUT "TF_score_Exons2,3,4,etc\tTF_distTSS_Exons2,3,4,etc\t";
|
|
999 print OUT "TF_score_Exons2,3,4,etcAND>$immediateDownstream\tTF_distTSS_Exons2,3,4,etcAND>$immediateDownstream\t";
|
|
1000
|
|
1001 print OUT "TF_score_Introns2,3,4,etc\tTF_distTSS_Introns2,3,4,etc\t";
|
|
1002 print OUT "TF_score_Introns2,3,4,etcAND>$immediateDownstream\tTF_distTSS_Introns2,3,4,etcAND>$immediateDownstream\t";
|
|
1003
|
|
1004 print OUT "TF_score_EIjunction\tTF_distTSS_EIjunction\t";
|
|
1005 print OUT "TF_score_EIjunctionAND>$immediateDownstream\tTF_distTSS_EIjunctionAND>$immediateDownstream";
|
|
1006
|
|
1007 if ($ifHMmode) {print OUT "\tK27TSS_Score\tK27TSS_Dist\tK27GeneB_Score\tK27GeneB_Dist"}
|
|
1008 }
|
|
1009
|
|
1010 print OUT "\n";
|
|
1011
|
|
1012 for my $chr (keys %genes) {
|
|
1013 for my $ID (keys %{$genes{$chr}}) {
|
|
1014 print OUT "$genes{$chr}->{$ID}{'name'}\t$chr\t$genes{$chr}->{$ID}{'left'}\t$genes{$chr}->{$ID}{'right'}\t$genes{$chr}->{$ID}{'strand'}\t$genes{$chr}->{$ID}{'reg'}\t$genes{$chr}->{$ID}{'foldChange'}\t";
|
|
1015
|
|
1016 if ($GCislands ne "") {
|
|
1017 print OUT "$genes{$chr}->{$ID}{'GCisland'}\t";
|
|
1018 }
|
|
1019
|
|
1020 if ( $fluoFile ne "") {
|
|
1021 print OUT "$genes{$chr}->{$ID}{'fluo'}\t";
|
|
1022 }
|
|
1023
|
|
1024 if ($RNApolFilename ne "") {
|
|
1025 print OUT "$genes{$chr}->{$ID}{'RNApolScore'}\t$genes{$chr}->{$ID}{'RNApolDist'}\t$genes{$chr}->{$ID}{'RNApol_junctionScore'}\t$genes{$chr}->{$ID}{'RNApol_junctionDist'}\t";
|
|
1026 }
|
|
1027 if ($H3K36Me3polFilename ne "") {
|
|
1028 print OUT "$genes{$chr}->{$ID}{'K36score'}\t";
|
|
1029 }
|
|
1030 if ($H3K9Me3polFilename ne "") {
|
|
1031 print OUT "$genes{$chr}->{$ID}{'K9promScore'}\t$genes{$chr}->{$ID}{'K9promDist'}\t$genes{$chr}->{$ID}{'K9largeScore'}\t$genes{$chr}->{$ID}{'K9largeDist'}\t$genes{$chr}->{$ID}{'K9enhScore'}\t$genes{$chr}->{$ID}{'K9enhDist'}\t";
|
|
1032 }
|
|
1033 if ($TF1Filename ne "") {
|
|
1034 print OUT "$genes{$chr}->{$ID}{'TFallScore'}\t$genes{$chr}->{$ID}{'TFallDist'}\t";
|
|
1035 print OUT "$genes{$chr}->{$ID}{'TFpromSimpleScore'}\t$genes{$chr}->{$ID}{'TFpromSimpleDist'}\t";
|
|
1036 print OUT "$genes{$chr}->{$ID}{'TFImmDownScore'}\t$genes{$chr}->{$ID}{'TFImmDownDist'}\t";
|
|
1037 print OUT "$genes{$chr}->{$ID}{'TFpromScore'}\t$genes{$chr}->{$ID}{'TFpromDist'}\t";
|
|
1038 print OUT "$genes{$chr}->{$ID}{'TFenhScore'}\t$genes{$chr}->{$ID}{'TFenhDist'}\t";
|
|
1039 print OUT "$genes{$chr}->{$ID}{'TFintraScore'}\t$genes{$chr}->{$ID}{'TFintraDist'}\t";
|
|
1040 print OUT "$genes{$chr}->{$ID}{'TF5kbDownScore'}\t$genes{$chr}->{$ID}{'TF5kbDownDist'}\t";
|
|
1041 print OUT "$genes{$chr}->{$ID}{'TF_FirstExonScore'}\t$genes{$chr}->{$ID}{'TF_FirstExonDist'}\t";
|
|
1042 print OUT "$genes{$chr}->{$ID}{'TF_FirstExonAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_FirstExonAndIntraDist'}\t";
|
|
1043 print OUT "$genes{$chr}->{$ID}{'TFFirstIntronScore'}\t$genes{$chr}->{$ID}{'TFFirstIntronDist'}\t";
|
|
1044 print OUT "$genes{$chr}->{$ID}{'TFFirstIntronAndIntraScore'}\t$genes{$chr}->{$ID}{'TFFirstIntronAndIntraDist'}\t";
|
|
1045 #print OUT "$genes{$chr}->{$ID}{'TFintraMinusFirstIntronScore'}\t$genes{$chr}->{$ID}{'TFintraMinusFirstIntronDist'}\t";
|
|
1046
|
|
1047 #print OUT "$genes{$chr}->{$ID}{'TFenh60kbScore'}\t$genes{$chr}->{$ID}{'TFenh60kbDist'}\t";
|
|
1048
|
|
1049 print OUT "$genes{$chr}->{$ID}{'TF_OtherExonsScore'}\t$genes{$chr}->{$ID}{'TF_OtherExonsDist'}\t";
|
|
1050 print OUT "$genes{$chr}->{$ID}{'TF_OtherExonsAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_OtherExonsAndIntraDist'}\t";
|
|
1051
|
|
1052 print OUT "$genes{$chr}->{$ID}{'TF_OtherIntronsScore'}\t$genes{$chr}->{$ID}{'TF_OtherIntronsDist'}\t";
|
|
1053 print OUT "$genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_OtherIntronsAndIntraDist'}\t";
|
|
1054
|
|
1055 print OUT "$genes{$chr}->{$ID}{'TF_junctionScore'}\t$genes{$chr}->{$ID}{'TF_junctionDist'}\t";
|
|
1056 print OUT "$genes{$chr}->{$ID}{'TF_junctionAndIntraScore'}\t$genes{$chr}->{$ID}{'TF_junctionAndIntraDist'}";
|
|
1057
|
|
1058 if ($ifHMmode) {print OUT "\t$genes{$chr}->{$ID}{'K27TSS_Score'}\t$genes{$chr}->{$ID}{'K27TSS_Dist'}\t$genes{$chr}->{$ID}{'K27GeneB_Score'}\t$genes{$chr}->{$ID}{'K27GeneB_Dist'}"}
|
|
1059
|
|
1060 }
|
|
1061 print OUT "\n";
|
|
1062 }
|
|
1063 }
|
|
1064
|
|
1065 close OUT;
|
|
1066
|
|
1067 ###################################
|
|
1068 sub med {
|
|
1069 my @arr = @_;
|
|
1070 my $med = 0;
|
|
1071 @arr = sort {$a <=> $b} @arr;
|
|
1072 if ((scalar(@arr)/2) =~ m/[\.\,]5/) {
|
|
1073 return $arr[floor(scalar(@arr)/2)];
|
|
1074 } else {
|
|
1075 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2;
|
|
1076 }
|
|
1077 $med;
|
|
1078 }
|
|
1079
|
|
1080 sub checkIfGC {
|
|
1081 my ($TSS,$strand,$dist,$GCislandsChr)=@_;
|
|
1082 my $ifGC = 0;
|
|
1083 my $leftProm=$TSS-$dist;
|
|
1084 my $rightProm = $TSS;
|
|
1085 if ($strand== -1) {
|
|
1086 my $leftProm=$TSS;
|
|
1087 my $rightProm = $TSS+$dist;
|
|
1088 } #print "$leftProm\t"; print "$rightProm\n";
|
|
1089 for my $leftGC (keys %{$GCislandsChr}) {
|
|
1090 my $rightGC = $GCislandsChr->{$leftGC};
|
|
1091 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) {
|
|
1092 return "GC-island";
|
|
1093 }
|
|
1094 }
|
|
1095 return $ifGC ;
|
|
1096 }
|
|
1097
|
|
1098 sub getFirstIntron {
|
|
1099 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
1100 my ($left,$right);
|
|
1101 if ($exonCount == 1) {
|
|
1102 return (0,0);
|
|
1103 }
|
|
1104 if ($strand == 1) {
|
|
1105 $left = (split ",", $exonEnds)[0];
|
|
1106 $right = (split (",", $exonStarts))[1];
|
|
1107 } else {
|
|
1108 $left = (split (",", $exonEnds))[$exonCount-2];
|
|
1109 $right = (split (",", $exonStarts))[$exonCount-1];
|
|
1110 }
|
|
1111 ($left,$right);
|
|
1112 }
|
|
1113
|
|
1114 sub getFirstExon {
|
|
1115 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
1116 my ($left,$right);
|
|
1117 if ($exonCount == 1) {
|
|
1118 return (0,0);
|
|
1119 }
|
|
1120 if ($strand == 1) {
|
|
1121 $left = (split ",", $exonStarts)[0];
|
|
1122 $right = (split (",", $exonEnds))[0]-$jonctionSize;
|
|
1123 } else {
|
|
1124 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize;
|
|
1125 $right = (split (",", $exonEnds))[$exonCount-1];
|
|
1126 }
|
|
1127 ($left,$right);
|
|
1128 }
|
|
1129
|
|
1130
|
|
1131 sub getIntronExon {
|
|
1132 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_;
|
|
1133 my (@left,@right);
|
|
1134 @left = (split ",", $exonStarts);
|
|
1135 @right = (split (",", $exonEnds));
|
|
1136
|
|
1137 for (my $i = 0; $i<$exonCount;$i++) {
|
|
1138 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n";
|
|
1139 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) {
|
|
1140 #print "URA!\n";
|
|
1141 return "exon";
|
|
1142 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) {
|
|
1143 return "intron";
|
|
1144 }
|
|
1145 }
|
|
1146 return "jonction";
|
|
1147 }
|
|
1148
|
|
1149
|
|
1150 sub getTypeIntra {
|
|
1151
|
|
1152 my ($geneEntry, $pos) = @_;
|
|
1153 my $type;
|
|
1154
|
|
1155 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) {
|
|
1156 return "f_intron";
|
|
1157 }
|
|
1158 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) {
|
|
1159 return "f_exon";
|
|
1160 }
|
|
1161 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'});
|
|
1162 return $type;
|
|
1163 }
|
|
1164
|
|
1165 sub getRNAlength {
|
|
1166 my ($exonStarts,$exonEnds) = @_;
|
|
1167 my (@left,@right);
|
|
1168 @left = (split ",", $exonStarts);
|
|
1169 @right = (split (",", $exonEnds));
|
|
1170 my $length = 0;
|
|
1171 for (my $i = 0; $i<scalar(@right);$i++) {
|
|
1172 $length+=$right[$i]-$left[$i];
|
|
1173 }
|
|
1174 #print STDERR "length = $length\n";
|
|
1175 return $length;
|
|
1176 }
|
|
1177
|
|
1178 sub min {
|
|
1179 my @a = sort {$a <=> $b} @_;
|
|
1180 $a[0];
|
|
1181 }
|