Mercurial > repos > jbrayet > annotatepeaks
comparison annotatePeaks.pl @ 4:82b10f23f1fe draft
Uploaded
| author | jbrayet |
|---|---|
| date | Thu, 13 Aug 2015 11:02:47 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:db5076a0494f | 4:82b10f23f1fe |
|---|---|
| 1 #:t:::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 2 #:t::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 3 #:::::::::::::z;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 4 #::::::::::::i@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 5 #::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@$@@@@ | |
| 6 #:::::::::::3@@@@@@@@@@@@@@@@@@@@@@@@@B@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 7 #::::::::::3@@@@@@@@@@@@@@@@@@@@@BEEESSE5EEEEBBM@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 8 #::::::::::3@@@@@@@@@@@@@@@@@@@@BEEEEEE35EE55E2355E5SBMB@@@@@@@@@@@@@@@@@$ | |
| 9 #::::::::::@@@@@@@@@@@@@@@@@@@EEEE55533t3tttt::::::!!!!7755E755SBBMMM@@@MM | |
| 10 #::::::::::3@@@@@@@@@@@@@@@@@@EEEE2t3ttttt:::::::::::::::::::::::!7?5225EE | |
| 11 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEE31t::::::::::::::::::::::::::::::::3E5@ | |
| 12 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEEtt:::::::::::::::::::::::::::::::::353 | |
| 13 #::::::::::3@@@@@@@@@@@@@@@@@@EEEEEE1ttz::::::::::::::::::::::::::::::::35 | |
| 14 #:::::::::::@@@@@@@@@@@@@@@@@@EEEEEEEtz1::::::::::::::::::::::::::::::::t: | |
| 15 #:::::::::!3@@@@@@@@@@@@@@@@@@@EEEEEttt::::::::::::::::::::::::::::::::;zz | |
| 16 #::::::::::@@@@@@@@@@@@@@@@@@@@EEEEEttt:::::z;z:::::::::::::::::::::::::13 | |
| 17 #::::::::::3B@@@@@@@@@@@@@@@@@@EEEEEEE3tt:czzztti;:::::::::::::::::::::::3 | |
| 18 #::::ttt::::3@@@@@@@@@@@@@@@@EEEEE5EE25Ezt1EEEz5Etzzz;;;;::::::::::::::::: | |
| 19 #:::::::::::I9@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEE@@@@@@@@@@@@@@Ez;::::::::::: | |
| 20 #:::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@Ez:::::: | |
| 21 #::::::::::::::E@@@@@@@@@@@@@@@@@@@@@@@@@@@@@BE5EBB@@@@@@@@@@@@@@@EEE::::: | |
| 22 #:::::::::::::::@@@@@@@@@@@@@@@@@@@@@@@@@@@@E1::35@@@@@@@@@@ME3MMME2:::::: | |
| 23 #:::::::::::::::?@@@@@@@@@@@@@@@@@@M@@@@@@@EE:::::3SB@@BBESEEt:::::::::::: | |
| 24 #::::::::::::::::J$@@@@@@@B@@@@@@@@@@@@@@@@EE:::::::!35E33t::::::::::::::: | |
| 25 #:::::::::::::::::3@E@@@EE5EESE5EESE@@@@@@@Et::::::::::::tz::::::::::::::: | |
| 26 #:::::::::::::::::J@E$@EEE5133555SE@@@@@@@@Et::::::::::::::::::::::::::::: | |
| 27 #::::::::::::::::::E@E@EEEEtt3523EEE@@@@@@@E:::::::::::::::::::::::::::::: | |
| 28 #:t::::::::::::::::JEE3@@@EEEEEEEEEE@@@@@@@E:::::::::t;::::::::::::::::::: | |
| 29 #:t:::::::::::::::::!5ES@EEEEEEEEES@@@@@@@@@E;:::;;;:3Ez:::::::::::::::::: | |
| 30 #:t::::::::::::::::::::JE@@EEEEEEE@@@@@@@@@@@@@@@@ME!:::;::::::::::::::::: | |
| 31 #:tz::::::::::::::::::::JE@@@EEEE@@@@@@@@@@@@@@EE!:::::::t:::::::::::::::: | |
| 32 #:t::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@ESBE:::::::::::::::::::::::::: | |
| 33 #:::::::::::::::::::::::::Q@@@@@@@@@@@@@@@@EE3EE;:::::zzzz:::::::::::::::: | |
| 34 #:::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@@@@NN@@@@@@Ez::::::::::::::: | |
| 35 #:zt:::::::::::::::::::::::3@@@@EE@@@@@@@@@@EEEEt::;z113E5t::::::::::::::: | |
| 36 #::tt:::::::::::::::::::::::3@@@E@@@@@@@@@@@@@@@@BEt::::::::::::::::t::::: | |
| 37 #:tt:t:::::::::::::::::::::::?S@@@@@@@@@@@BBEEE51!::::::::::::::zzzEt::::: | |
| 38 #::::::::::::::::::::::::::::::3Q@@@@@@@BEEEEEt:::::::::::::;zz@@@EE:::::: | |
| 39 #::::::::::::::::::::::::::::::::75B@@@@@EEEtt;:::::::::;zz@@@@BEEEtz::::: | |
| 40 #::::::::::::::::::::::::::::::::::::?9@@@@@@@@@@@E2Ezg@@@@@B@@@EEEE1t:::: | |
| 41 #:::::::::::::::::::::::::::::::::::::::3@@@@@@@@@@@@@@@@@@@E@EEEEEEEzzz:: | |
| 42 #::::::::::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@EEEEEEE5ttttt | |
| 43 #:::::::::::::::::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@EEEEEEEEEEEtzt | |
| 44 #::::::::::::::::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@E@@EEEEEEEEEEEE@@@ | |
| 45 #::::::::::::::::::::::::::g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE3EEEE@@@@@@@ | |
| 46 #:::::::::::::::::::::;;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEt33@@@@@@@@@@ | |
| 47 #:::::::::::::::::;g@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@E@@@@@@EEEtg@@@@@@@@@@@@ | |
| 48 #::::::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@EEEE@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 49 #:::::::::::::@@@@@@@@@@@@@@@@@$@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 50 #::::::::::;@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |
| 51 # | |
| 52 # Copyleft ↄ⃝ 2012 Institut Curie | |
| 53 # Author(s): Valentina Boeva, Alban Lermine (Institut Curie) 2012 | |
| 54 # Contact: valentina.boeva@curie.fr, alban.lermine@curie.fr | |
| 55 # This software is distributed under the terms of the GNU General | |
| 56 # Public License, either Version 2, June 1991 or Version 3, June 2007. | |
| 57 | |
| 58 #!/usr/bin/perl -w | |
| 59 | |
| 60 #for a complete list of genes and a list of sites, sorts genes close to sites | |
| 61 #printDistance | |
| 62 | |
| 63 #read only gene files with refSeqGenes | |
| 64 #counts only once overlapping transcripts or genes | |
| 65 | |
| 66 #calculates some stats about locations of peaks | |
| 67 | |
| 68 #uses hierarchie : promoter, imUpstream, intragenic,enh, 5kbdownstream, intergenic + for genes: fExon,exon,fIntron,intron,junction+-50kb ONLY FOR "findClosestGene" | |
| 69 #outputs fold change of expression is available | |
| 70 | |
| 71 #outputname - corrected | |
| 72 #all isoforms are considered, even those that start and end at the same coordinates | |
| 73 #uses fluorescence values | |
| 74 | |
| 75 #only one location per gene in mode "all" (but promoter+intragenic if both) | |
| 76 | |
| 77 #read directly Noresp/down/up genes | |
| 78 | |
| 79 #prints distance to TE | |
| 80 | |
| 81 use strict; | |
| 82 use POSIX; | |
| 83 | |
| 84 my $SitesFilename ; | |
| 85 my $GenesFilename ; | |
| 86 my $SelectedGenesFilename = ""; | |
| 87 my $MirFilename = ""; | |
| 88 my $minScore = 0; | |
| 89 my $BIGNUMBER = 10000000000; | |
| 90 my $enhLeft = -30000; | |
| 91 my $enhRight = -1500; | |
| 92 my $promLeft = $enhRight ; | |
| 93 my $immediateDownstream = 2000; | |
| 94 my $maxScore = $BIGNUMBER ; | |
| 95 my $verbose = 0; | |
| 96 my $maxDistance = $BIGNUMBER ; | |
| 97 my $downStreamDist = 5000; | |
| 98 my $jonctionSize = 50; | |
| 99 my $all = 0; | |
| 100 my $fluoFile = ""; | |
| 101 my $regType = ""; | |
| 102 my $GCislands = ""; | |
| 103 my $ResFilename = ""; | |
| 104 | |
| 105 my $usage = qq{ | |
| 106 $0 | |
| 107 | |
| 108 ----------------------------- | |
| 109 mandatory parameters: | |
| 110 | |
| 111 -g filename file with all genes | |
| 112 -tf filename file with sites of TF | |
| 113 | |
| 114 ----------------------------- | |
| 115 optional parameters: | |
| 116 -v for verbose | |
| 117 -mir filename file with positions of miRNA | |
| 118 -maxDist value maximal Distance to genes (def. Inf) | |
| 119 -minScore value minimal Score (def. 0) | |
| 120 -maxScore value maximal Score (def. Inf) | |
| 121 -selG filename file with selected genes and their expression levels | |
| 122 -all to output all genes intersecting with the peak | |
| 123 -fluo filename file with fluorescence | |
| 124 -regType valeu down-regulated, up-regulated, no-response | |
| 125 -gc filename file with gc-islands | |
| 126 | |
| 127 -lp valeu upstream of TSS region to define promoter | |
| 128 -rp valeu downstream of TSS region to define immediate downstream | |
| 129 -enh valeu upstream of TSS region to define enhancer | |
| 130 -dg valeu downstream of transcription end region to define gene downstream | |
| 131 | |
| 132 -o filename output filename | |
| 133 | |
| 134 }; | |
| 135 | |
| 136 if(scalar(@ARGV) == 0){ | |
| 137 print $usage; | |
| 138 exit(0); | |
| 139 } | |
| 140 | |
| 141 | |
| 142 while(scalar(@ARGV) > 0){ | |
| 143 my $this_arg = shift @ARGV; | |
| 144 if ( $this_arg eq '-h') {print "$usage\n"; exit; } | |
| 145 | |
| 146 elsif ( $this_arg eq '-g') {$GenesFilename = shift @ARGV;} | |
| 147 elsif ( $this_arg eq '-tf') {$SitesFilename = shift @ARGV;} | |
| 148 elsif ( $this_arg eq '-mir') {$MirFilename = shift @ARGV;} | |
| 149 elsif ( $this_arg eq '-selG') {$SelectedGenesFilename = shift @ARGV;} | |
| 150 elsif ( $this_arg eq '-maxDist') {$maxDistance = 1;} | |
| 151 elsif ( $this_arg eq '-maxScore') {$maxScore = shift @ARGV;} | |
| 152 elsif ( $this_arg eq '-minScore') {$minScore = shift @ARGV;} | |
| 153 elsif ( $this_arg eq '-v') {$verbose = 1;} | |
| 154 elsif ( $this_arg eq '-all') {$all = 1;} | |
| 155 elsif ( $this_arg eq '-regType') {$regType = shift @ARGV;} | |
| 156 | |
| 157 elsif ( $this_arg eq '-fluo') {$fluoFile = shift @ARGV;} | |
| 158 elsif ( $this_arg eq '-gc') {$GCislands = shift @ARGV;} | |
| 159 elsif ( $this_arg eq '-o') {$ResFilename = shift @ARGV;} | |
| 160 | |
| 161 elsif ( $this_arg eq '-lp') {$enhRight = shift @ARGV;$promLeft = $enhRight;} | |
| 162 elsif ( $this_arg eq '-rp') {$immediateDownstream = shift @ARGV;} | |
| 163 elsif ( $this_arg eq '-enh') {$enhLeft = shift @ARGV;} | |
| 164 elsif ( $this_arg eq '-dg') {$downStreamDist = shift @ARGV;} | |
| 165 | |
| 166 elsif ( $this_arg =~ m/^-/ ) { print "unknown flag: $this_arg\n";} | |
| 167 } | |
| 168 | |
| 169 if ($maxDistance<0) { | |
| 170 die "\tThe maximal distance must be positive!\n"; | |
| 171 } | |
| 172 my $tmpn = $SitesFilename; | |
| 173 $tmpn =~ s/.*[\/\\]//; | |
| 174 if ($ResFilename eq "") { | |
| 175 $ResFilename = $SelectedGenesFilename.$tmpn."_".$minScore."_dists.txt"; | |
| 176 | |
| 177 if ($maxScore != $BIGNUMBER) { | |
| 178 $ResFilename = $SelectedGenesFilename.$SitesFilename."_".$minScore."_$maxScore"."_dists.txt"; | |
| 179 } | |
| 180 | |
| 181 if ($all == 1) { | |
| 182 $ResFilename =~ s/_dists/_all_dists/; | |
| 183 } | |
| 184 } | |
| 185 | |
| 186 | |
| 187 #-----------read selected genes---------------- | |
| 188 my %selectedGenes; | |
| 189 my %selectedGenesFoldChange; | |
| 190 if ( $SelectedGenesFilename ne "") { | |
| 191 open (FILE, "<$SelectedGenesFilename ") or die "Cannot open file $SelectedGenesFilename !!!!: $!"; | |
| 192 while (<FILE>) { | |
| 193 chomp; | |
| 194 my @a = split/\t/; | |
| 195 $selectedGenes{$a[1]} = $a[3]; | |
| 196 $selectedGenesFoldChange{$a[1]} = $a[2]; | |
| 197 #print "gene:$a[1],reg:$selectedGenes{$a[1]},FC:$selectedGenesFoldChange{$a[1]}\n"; | |
| 198 } | |
| 199 | |
| 200 close FILE; | |
| 201 print "\t\t$SelectedGenesFilename is read!\n" if ($verbose); | |
| 202 } | |
| 203 | |
| 204 #-----------read genes with fluorescence--------- | |
| 205 my %fluoGenes; | |
| 206 if ( $fluoFile ne "") { | |
| 207 open (FILE, "<$fluoFile ") or die "Cannot open file $fluoFile !!!!: $!"; | |
| 208 my $gene = ""; | |
| 209 my $med = 0; | |
| 210 my %h; | |
| 211 while (<FILE>) { | |
| 212 chomp; | |
| 213 my @a = split/\t/; | |
| 214 | |
| 215 next if (scalar(@a)<5); | |
| 216 next if ($a[0] eq "probesets"); | |
| 217 next unless ($a[0] =~m/\S/); | |
| 218 next unless ($a[4] =~m/\S/); | |
| 219 if ($gene ne "" && $gene ne $a[2]) { | |
| 220 #calcMed | |
| 221 $med = med(keys %h); | |
| 222 $fluoGenes{$gene} = $med; | |
| 223 $med=0; | |
| 224 delete @h{keys %h}; | |
| 225 } else { | |
| 226 #$h{$a[4]} = 1; | |
| 227 } | |
| 228 $gene = $a[2]; | |
| 229 $h{$a[4]} = 1; | |
| 230 #print "keys ", scalar(keys %h),"\t",keys %h,"\n"; | |
| 231 } | |
| 232 if ($gene ne "") { | |
| 233 $med = med(keys %h); | |
| 234 $fluoGenes{$gene} = $med; | |
| 235 } | |
| 236 close FILE; | |
| 237 print "\t\t$fluoFile is read!\n" if ($verbose); | |
| 238 } | |
| 239 #-----------read GC-islands---------------- | |
| 240 my %GCislands; | |
| 241 if ($GCislands ne "") { | |
| 242 open (FILE, "<$GCislands ") or die "Cannot open file $GCislands !!!!: $!"; | |
| 243 | |
| 244 while (<FILE>) { | |
| 245 chomp; | |
| 246 my @a = split/\t/; | |
| 247 #bin chrom chromStart chromEnd name length cpgNum gcNum perCpg perGc obsExp | |
| 248 #107 chr1 36568608 36569851 CpG: 128 1243 128 766 20.6 61.6 1.09 | |
| 249 my $chr = $a[1]; | |
| 250 my $start = $a[2]; | |
| 251 my $end = $a[3]; | |
| 252 $GCislands{$chr}->{$start}=$end; | |
| 253 } | |
| 254 close FILE; | |
| 255 } elsif ($verbose) { | |
| 256 print "you did not specify a file with GC-islands\n"; | |
| 257 } | |
| 258 | |
| 259 #-----------read genes---------------- | |
| 260 | |
| 261 my %genes; | |
| 262 | |
| 263 my $count = 0; | |
| 264 | |
| 265 open (GENES, "<$GenesFilename") or die "Cannot open file $GenesFilename!!!!: $!"; | |
| 266 <GENES>; | |
| 267 while (<GENES>) { | |
| 268 chomp; | |
| 269 if (/(chr.*)\s([+-])\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s(\S+)\s(\S+)\s\S+\s(\S+)/){ | |
| 270 my $name = $10; | |
| 271 my $chr = $1; | |
| 272 my $strand = $2; | |
| 273 if ($strand eq '+') { | |
| 274 $strand = 1; | |
| 275 } | |
| 276 else { | |
| 277 $strand = -1; | |
| 278 } | |
| 279 my $leftPos = $3; | |
| 280 my $rightPos = $4; | |
| 281 my $cdsStart= $5; | |
| 282 my $cdsEnd= $6; | |
| 283 my $exonCount= $7; | |
| 284 my $exonStarts= $8; | |
| 285 my $exonEnds= $9; | |
| 286 my $ID = "$name\t$chr:$leftPos-$rightPos;$exonStarts-$exonEnds"; | |
| 287 my $foldChange = 1; | |
| 288 my $reg = "NA"; | |
| 289 my $fluo = "NA"; | |
| 290 if ( $SelectedGenesFilename ne "") { | |
| 291 #print "$name\t"; | |
| 292 if (exists($selectedGenes{$name})) { | |
| 293 $reg = $selectedGenes{$name}; | |
| 294 $foldChange = $selectedGenesFoldChange{$name}; | |
| 295 } | |
| 296 } | |
| 297 unless (exists($genes{$chr})) { | |
| 298 my %h; | |
| 299 $genes{$chr} = \%h; | |
| 300 } | |
| 301 unless (exists($genes{$chr}->{$ID})) { | |
| 302 my %h1; | |
| 303 $genes{$chr}->{$ID} = \%h1; | |
| 304 $count++; | |
| 305 } | |
| 306 if ( $fluoFile ne "") { | |
| 307 if (exists($fluoGenes{$name})) { | |
| 308 $fluo = $fluoGenes{$name}; | |
| 309 } | |
| 310 } | |
| 311 $genes{$chr}->{$ID}{'name'} = $name ; | |
| 312 $genes{$chr}->{$ID}{'left'} = $leftPos ; | |
| 313 $genes{$chr}->{$ID}{'right'} = $rightPos ; | |
| 314 $genes{$chr}->{$ID}{'cdsStart'} = $cdsStart; | |
| 315 $genes{$chr}->{$ID}{'cdsEnd'} = $cdsEnd; | |
| 316 $genes{$chr}->{$ID}{'strand'} = $strand; | |
| 317 $genes{$chr}->{$ID}{'exonCount'} = $exonCount; | |
| 318 $genes{$chr}->{$ID}{'exonStarts'} = $exonStarts; | |
| 319 $genes{$chr}->{$ID}{'exonEnds'} = $exonEnds; | |
| 320 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; | |
| 321 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; | |
| 322 $genes{$chr}->{$ID}{'reg'} = $reg; | |
| 323 $genes{$chr}->{$ID}{'foldChange'} = $foldChange; | |
| 324 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); | |
| 325 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = getFirstIntron ($exonCount,$exonStarts,$exonEnds,$strand); | |
| 326 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = getFirstExon ($exonCount,$exonStarts,$exonEnds,$strand); | |
| 327 | |
| 328 $genes{$chr}->{$ID}{'fluo'} = $fluo; | |
| 329 | |
| 330 $genes{$chr}->{$ID}{'GCisland'} = 0; | |
| 331 if ($GCislands ne "") { | |
| 332 $genes{$chr}->{$ID}{'GCisland'} = checkIfGC ($genes{$chr}->{$ID}{'TSS'},$strand,2000,$GCislands{$chr}); | |
| 333 } | |
| 334 | |
| 335 } | |
| 336 } | |
| 337 print "Total genes : $count\n"; | |
| 338 close GENES; | |
| 339 print "\t\t$GenesFilename is read!\n" if ($verbose); | |
| 340 #for my $gName (sort keys %{$genes{'chr18'}}) { | |
| 341 | |
| 342 # print "$gName\t$genes{'chr18'}->{$gName}{'TSS'}\n"; | |
| 343 #} | |
| 344 | |
| 345 #-----------read file with sites miRNA, store as genes----- | |
| 346 | |
| 347 if ( $MirFilename eq ""){ | |
| 348 print "you did not specify file with miRNA\n" if ($verbose);; | |
| 349 } | |
| 350 else { | |
| 351 | |
| 352 | |
| 353 open (MIR, "<$MirFilename ") or die "Cannot open file $MirFilename !!!!: $!"; | |
| 354 #chr1 20669090 20669163 mmu-mir-206 960 + | |
| 355 while (<MIR>) { | |
| 356 chomp; | |
| 357 my ($name, $chr, $leftPos, $rightPos, $strand ); | |
| 358 my $fluo = "NA"; | |
| 359 | |
| 360 #1 . miRNA 20669091 20669163 . + . ACC="MI0000249"; ID="mmu-mir-206"; | |
| 361 if (/([0-9XYM]+)\s.\smiRNA\s(\d+)\s(\d+)\s.\s([+-])\s.\sACC=.*ID=\"(.*)\"/) { | |
| 362 $name = $5; | |
| 363 $chr = $1; | |
| 364 $leftPos = $2; | |
| 365 $rightPos = $3; | |
| 366 $strand = $4; | |
| 367 } | |
| 368 elsif (/(.*)\s(\d+)\s(\d+)\s(.*)\s(.*)\s(.*)/){ | |
| 369 $name = $4; | |
| 370 $chr = $1; | |
| 371 $leftPos = $2; | |
| 372 $rightPos = $3; | |
| 373 $strand = $6; | |
| 374 } else { | |
| 375 next; | |
| 376 } | |
| 377 | |
| 378 unless ($chr =~ m/chr/) { | |
| 379 $chr = "chr".$chr; | |
| 380 } | |
| 381 my $ID = "$name\t$chr:$leftPos-$rightPos"; | |
| 382 | |
| 383 if ($strand eq '+') { | |
| 384 $strand = 1; | |
| 385 } | |
| 386 else { | |
| 387 $strand = -1; | |
| 388 } | |
| 389 | |
| 390 unless (exists($genes{$chr})) { | |
| 391 my %h; | |
| 392 $genes{$chr} = \%h; | |
| 393 } | |
| 394 unless (exists($genes{$chr}->{$ID})) { | |
| 395 my %h1; | |
| 396 $genes{$chr}->{$ID} = \%h1; | |
| 397 $count++; | |
| 398 } | |
| 399 $genes{$chr}->{$ID}{'name'} = $name ; | |
| 400 $genes{$chr}->{$ID}{'left'} = $leftPos ; | |
| 401 $genes{$chr}->{$ID}{'right'} = $rightPos ; | |
| 402 $genes{$chr}->{$ID}{'cdsStart'} = $leftPos ; | |
| 403 $genes{$chr}->{$ID}{'cdsEnd'} = $rightPos ; | |
| 404 $genes{$chr}->{$ID}{'strand'} = $strand; | |
| 405 $genes{$chr}->{$ID}{'length'} = abs ($leftPos-$rightPos); | |
| 406 $genes{$chr}->{$ID}{'exonCount'} = 1; | |
| 407 $genes{$chr}->{$ID}{'exonStarts'} = $leftPos ; | |
| 408 $genes{$chr}->{$ID}{'exonEnds'} = $rightPos ; | |
| 409 $genes{$chr}->{$ID}{'TSS'} = ($strand == 1) ? $leftPos :$rightPos ; | |
| 410 $genes{$chr}->{$ID}{'TE'} = ($strand == -1) ? $leftPos :$rightPos ; | |
| 411 $genes{$chr}->{$ID}{'reg'} = "miRNA"; | |
| 412 $genes{$chr}->{$ID}{'foldChange'} = 1; | |
| 413 ($genes{$chr}->{$ID}{'firstIntronStart'},$genes{$chr}->{$ID}{'firstIntronEnd'}) = (0,0); | |
| 414 ($genes{$chr}->{$ID}{'firstExonStart'},$genes{$chr}->{$ID}{'firstExonEnd'}) = (0,0); | |
| 415 | |
| 416 $genes{$chr}->{$ID}{'fluo'} = $fluo; | |
| 417 $genes{$chr}->{$ID}{'GCisland'} = 0; | |
| 418 | |
| 419 | |
| 420 | |
| 421 } | |
| 422 | |
| 423 | |
| 424 close MIR; | |
| 425 print "\t\t$MirFilename is read!\n" if ($verbose) ; | |
| 426 } | |
| 427 #-----------read file with sites and find N closest genes----- | |
| 428 my $numberOfAllSites = 0; | |
| 429 | |
| 430 open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!"; | |
| 431 open (OUT , ">$ResFilename") or die "Cannot open file $ResFilename!!!!: $!"; | |
| 432 print OUT "Chromosome\tStart\tEnd\tMax\tScore\tDistTSS\tType\tTypeIntra\tReg\tFoldChange\tDistTE\tGeneName\tGeneCoordinates\n" ; | |
| 433 my $header = <FILE>; #read header | |
| 434 my @a = split (/\t/,$header ); | |
| 435 my $correction = 0; | |
| 436 if ($a[1] =~m/chr/) { | |
| 437 $correction=1; | |
| 438 } | |
| 439 | |
| 440 unless ($header=~m/Chromosome/ || $header=~m/track/|| $header=~m/Start/i) { | |
| 441 close FILE; | |
| 442 open (FILE, "<$SitesFilename") or die "Cannot open file $SitesFilename!!!!: $!"; | |
| 443 } | |
| 444 | |
| 445 $count = 0; | |
| 446 | |
| 447 my %typehash; | |
| 448 my %typehashIntra; | |
| 449 | |
| 450 $typehashIntra{"f_exon"} = 0; | |
| 451 $typehashIntra{"f_intron"} = 0; | |
| 452 $typehashIntra{"jonction"} = 0; | |
| 453 $typehashIntra{"exon"} = 0; | |
| 454 $typehashIntra{"intron"} = 0; | |
| 455 | |
| 456 | |
| 457 while (<FILE>) { | |
| 458 chomp; | |
| 459 $numberOfAllSites++; | |
| 460 my @a = split /\t/; | |
| 461 my $chro = $a[0+$correction]; | |
| 462 my $fpos = $a[1+$correction]; | |
| 463 my $lpos = $a[2+$correction]; | |
| 464 my $maxpos = $a[3+$correction]; | |
| 465 if ($maxpos =~ m/\D/) { | |
| 466 $maxpos = int(($lpos+$fpos)/2); | |
| 467 } | |
| 468 my $score = $a[4+$correction]; | |
| 469 next if ($score < $minScore); | |
| 470 next if ($score >= $maxScore); | |
| 471 $score = $score +1 -1; | |
| 472 #print "$score" ; | |
| 473 | |
| 474 #my $site = "$chro:$fpos-$lpos\t$maxpos\t$score"; | |
| 475 #print $site ; | |
| 476 #my $motifNumber = $a[3]; | |
| 477 | |
| 478 #my $geneSet = &search_genes($N,$chro,($fpos+$lpos)/2,\%genes); | |
| 479 my @b; | |
| 480 unless ($all) { | |
| 481 @b = &findClosestGene($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene for all genes overlaping a peak | |
| 482 } else { | |
| 483 @b = &findAllClosestGeneOneLocation ($fpos,$lpos,$score,$chro,$maxpos,\%genes); #findAllClosestGene | |
| 484 } | |
| 485 #print "$distTSS\t$distTE\n" unless ($distTSS == $BIGNUMBER); | |
| 486 for my $type (@b) { | |
| 487 $typehash{$type} = 0 unless (exists($typehash{$type})); | |
| 488 $typehash{$type}++; | |
| 489 } | |
| 490 $count ++; | |
| 491 | |
| 492 } | |
| 493 close FILE; | |
| 494 close OUT ; | |
| 495 | |
| 496 print "Total Sites: $count\n"; | |
| 497 my $nEntries = 0; | |
| 498 for my $type (sort keys %typehash) { | |
| 499 $nEntries += $typehash{$type}; | |
| 500 } | |
| 501 print "Type\tCount\tFrequency\n"; | |
| 502 for my $type (sort keys %typehash) { | |
| 503 print $type,"\t",$typehash{$type},"\t",$typehash{$type}/$nEntries,"\n"; | |
| 504 } | |
| 505 for my $type (sort keys %typehashIntra) { | |
| 506 print $type,"\t",$typehashIntra{$type},"\t",$typehashIntra{$type}/$nEntries,"\n"; | |
| 507 } | |
| 508 | |
| 509 | |
| 510 #my @arr = @{$genes{'chr'}}; | |
| 511 | |
| 512 sub findClosestGene { | |
| 513 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
| 514 my ($distTSS,$distTE,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,$BIGNUMBER,"intergenic",0,0,0); | |
| 515 my $locDistTSS = 0; | |
| 516 my $locDistTE = 0; | |
| 517 my %hashForOverlapingGenes; | |
| 518 my @array2return; | |
| 519 my $typeIntra = "NA"; | |
| 520 for my $gName (keys %{$genes->{$chro}}) { | |
| 521 my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
| 522 my $strand = $genes->{$chro}{$gName}{'strand'}; | |
| 523 my $TE = $genes->{$chro}{$gName}{'TE'}; | |
| 524 | |
| 525 $locDistTE = ($pos-$TE)*$strand; | |
| 526 $locDistTSS = ($pos-$TSS)*$strand; | |
| 527 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
| 528 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
| 529 | |
| 530 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
| 531 #print "$locDistTSS $enhLeft $enhRight\n"; | |
| 532 | |
| 533 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
| 534 $type = "enhancer"; | |
| 535 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
| 536 $type = "promoter"; | |
| 537 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { | |
| 538 $type = "immediateDownstream"; | |
| 539 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { | |
| 540 $type = "intragenic"; | |
| 541 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
| 542 $type = "5kbDownstream"; | |
| 543 } elsif (abs($locDistTSS)<abs($distTSS)) { | |
| 544 $distTSS = $locDistTSS; | |
| 545 $distTE = $locDistTE; | |
| 546 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
| 547 } | |
| 548 | |
| 549 if (($type ne "intergenic")&&($type ne "NA")) { | |
| 550 unless (exists ($hashForOverlapingGenes{$type})) { | |
| 551 my %h; | |
| 552 $hashForOverlapingGenes{$type} = \%h; | |
| 553 } | |
| 554 $hashForOverlapingGenes{$type}->{$locDistTSS}=$gName; | |
| 555 | |
| 556 #print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$gName\n" ; | |
| 557 $type = "NA"; | |
| 558 #unless ($gName =~ m/$TSS/) { | |
| 559 # print "$gName\t$TSS\n"; | |
| 560 #} | |
| 561 } | |
| 562 | |
| 563 } | |
| 564 if ($type eq "intergenic") { | |
| 565 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$typeIntra\t$reg\t$foldChange\t$distTE\t$geneName\n" ; | |
| 566 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
| 567 push(@array2return,$type); | |
| 568 | |
| 569 } else { | |
| 570 my $selectedType; | |
| 571 if (exists ($hashForOverlapingGenes{"promoter"})) { | |
| 572 $selectedType = "promoter"; | |
| 573 } elsif (exists ($hashForOverlapingGenes{"immediateDownstream"})) { | |
| 574 $selectedType = "immediateDownstream"; | |
| 575 } elsif (exists ($hashForOverlapingGenes{"intragenic"})) { | |
| 576 $selectedType = "intragenic"; | |
| 577 } elsif (exists ($hashForOverlapingGenes{"enhancer"})) { | |
| 578 $selectedType = "enhancer"; | |
| 579 } elsif (exists ($hashForOverlapingGenes{"5kbDownstream"})) { | |
| 580 $selectedType = "5kbDownstream"; | |
| 581 } | |
| 582 my $bestGene = printBest($hashForOverlapingGenes{$selectedType},$chro,$fpos,$lpos,$pos,$score,$genes,$selectedType); #print "$type\n"; | |
| 583 push(@array2return,$selectedType); | |
| 584 my %otherGenes; | |
| 585 for my $type ("promoter","immediateDownstream","intragenic","enhancer","5kbDownstream") { #"firstIntron","exon","intron", | |
| 586 for my $locDistTSS (sort {$a<=>$b} keys %{$hashForOverlapingGenes{$type}}) { | |
| 587 my $otherGene = $hashForOverlapingGenes{$type}->{$locDistTSS}; | |
| 588 next if ($genes->{$chro}{$bestGene}{'name'} eq $genes->{$chro}{$otherGene}{'name'}); | |
| 589 next if (exists ($otherGenes{$genes->{$chro}{$otherGene}{'name'}})); | |
| 590 if (overlapGenes($otherGene,$bestGene,$chro,$genes)) { | |
| 591 | |
| 592 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
| 593 $typeIntra = &getTypeIntra($genes->{$chro}{$otherGene},$pos); | |
| 594 $typehashIntra{$typeIntra}++; | |
| 595 }else { | |
| 596 $typeIntra = "NA"; } | |
| 597 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$otherGene}{'reg'}\t$genes->{$chro}{$otherGene}{'foldChange'}\t",($pos-$genes->{$chro}{$otherGene}{'TE'})*$genes->{$chro}{$otherGene}{'strand'},"\t$otherGene\n" ; | |
| 598 push(@array2return,$type); | |
| 599 | |
| 600 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$otherGene}{'reg'}\t$otherGene\n" ; | |
| 601 #print $type,"\n"; | |
| 602 $otherGenes{$genes->{$chro}{$otherGene}{'name'}} = 1; | |
| 603 } | |
| 604 } | |
| 605 } | |
| 606 | |
| 607 } | |
| 608 return @array2return; | |
| 609 } | |
| 610 | |
| 611 sub findAllClosestGeneOneLocation { | |
| 612 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
| 613 my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); | |
| 614 my $locDistTSS = 0; | |
| 615 my $locDistTE = 0; | |
| 616 my %hashForOverlapingGenes; | |
| 617 my @array2return; | |
| 618 my $typeIntra = "NA"; | |
| 619 for my $gName (keys %{$genes->{$chro}}) { | |
| 620 my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
| 621 my $strand = $genes->{$chro}{$gName}{'strand'}; | |
| 622 my $TE = $genes->{$chro}{$gName}{'TE'}; | |
| 623 | |
| 624 $locDistTE = ($pos-$TE)*$strand; | |
| 625 $locDistTSS = ($pos-$TSS)*$strand; | |
| 626 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
| 627 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
| 628 | |
| 629 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
| 630 #print "$locDistTSS $enhLeft $enhRight\n"; | |
| 631 | |
| 632 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
| 633 $type = "enhancer"; | |
| 634 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
| 635 $type = "promoter"; | |
| 636 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { | |
| 637 $type = "immediateDownstream"; | |
| 638 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { | |
| 639 $type = "intragenic"; | |
| 640 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
| 641 $type = "5kbDownstream"; | |
| 642 } elsif (abs($locDistTSS)<abs($distTSS)) { | |
| 643 $distTSS = $locDistTSS; | |
| 644 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
| 645 } | |
| 646 | |
| 647 if (($type ne "intergenic")&&($type ne "NA")) { | |
| 648 unless (exists ($hashForOverlapingGenes{$type})) { | |
| 649 my %h; | |
| 650 $hashForOverlapingGenes{$type} = \%h; | |
| 651 $hashForOverlapingGenes{$type}->{$gName}=$locDistTSS; | |
| 652 | |
| 653 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
| 654 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); | |
| 655 $typehashIntra{$typeIntra}++; | |
| 656 } | |
| 657 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; | |
| 658 push(@array2return,$type); | |
| 659 } | |
| 660 $typeIntra = "NA"; | |
| 661 $type = "NA"; | |
| 662 #unless ($gName =~ m/$TSS/) { | |
| 663 # print "$gName\t$TSS\n"; | |
| 664 #} | |
| 665 } | |
| 666 | |
| 667 } | |
| 668 if ($type eq "intergenic") { | |
| 669 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ; | |
| 670 | |
| 671 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
| 672 push(@array2return,$type); | |
| 673 | |
| 674 } | |
| 675 return @array2return; | |
| 676 } | |
| 677 | |
| 678 sub findAllClosestGene { | |
| 679 my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
| 680 my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); | |
| 681 my $locDistTSS = 0; | |
| 682 my $locDistTE = 0; | |
| 683 my %hashForOverlapingGenes; | |
| 684 my @array2return; | |
| 685 my $typeIntra = "NA"; | |
| 686 for my $gName (keys %{$genes->{$chro}}) { | |
| 687 my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
| 688 my $strand = $genes->{$chro}{$gName}{'strand'}; | |
| 689 my $TE = $genes->{$chro}{$gName}{'TE'}; | |
| 690 | |
| 691 $locDistTE = ($pos-$TE)*$strand; | |
| 692 $locDistTSS = ($pos-$TSS)*$strand; | |
| 693 #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
| 694 #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
| 695 | |
| 696 #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
| 697 #print "$locDistTSS $enhLeft $enhRight\n"; | |
| 698 | |
| 699 if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
| 700 $type = "enhancer"; | |
| 701 } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
| 702 $type = "promoter"; | |
| 703 } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)&&($locDistTE<=0)) { | |
| 704 $type = "immediateDownstream"; | |
| 705 } elsif (($locDistTSS >= $immediateDownstream)&&($locDistTE<=0)) { | |
| 706 $type = "intragenic"; | |
| 707 } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
| 708 $type = "5kbDownstream"; | |
| 709 } elsif (abs($locDistTSS)<abs($distTSS)) { | |
| 710 $distTSS = $locDistTSS; | |
| 711 ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
| 712 } | |
| 713 | |
| 714 if (($type ne "intergenic")&&($type ne "NA")) { | |
| 715 unless (exists ($hashForOverlapingGenes{$type})) { | |
| 716 my %h; | |
| 717 $hashForOverlapingGenes{$type} = \%h; | |
| 718 } | |
| 719 #$hashForOverlapingGenes{$type}->{$gName}=$locDistTSS; | |
| 720 | |
| 721 $typeIntra = "NA"; | |
| 722 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
| 723 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); | |
| 724 $typehashIntra{$typeIntra}++; | |
| 725 } | |
| 726 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'fluo'}\t$genes->{$chro}{$gName}{'foldChange'}\t$genes{$chro}->{$gName}{'GCisland'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; | |
| 727 push(@array2return,$type); | |
| 728 | |
| 729 $type = "NA"; | |
| 730 #unless ($gName =~ m/$TSS/) { | |
| 731 # print "$gName\t$TSS\n"; | |
| 732 #} | |
| 733 } | |
| 734 | |
| 735 } | |
| 736 if ($type eq "intergenic") { | |
| 737 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$geneName}{'reg'}\t$genes->{$chro}{$geneName}{'fluo'}\t$genes->{$chro}{$geneName}{'foldChange'}\t$genes{$chro}->{$geneName}{'GCisland'}\t",($pos-$genes->{$chro}{$geneName}{'TE'})*$genes->{$chro}{$geneName}{'strand'},"\t$geneName\n" ; | |
| 738 | |
| 739 #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
| 740 push(@array2return,$type); | |
| 741 | |
| 742 } | |
| 743 return @array2return; | |
| 744 } | |
| 745 | |
| 746 sub getTypeIntra { | |
| 747 | |
| 748 my ($geneEntry, $pos) = @_; | |
| 749 my $type; | |
| 750 | |
| 751 if (($pos >= $geneEntry->{'firstIntronStart'})&&($pos <=$geneEntry->{'firstIntronEnd'})) { | |
| 752 return "f_intron"; | |
| 753 } | |
| 754 if (($pos >= $geneEntry->{'firstExonStart'})&&($pos <=$geneEntry->{'firstExonEnd'})) { | |
| 755 return "f_exon"; | |
| 756 } | |
| 757 $type = getIntronExon ($pos, $geneEntry->{'exonCount'},$geneEntry->{'exonStarts'},$geneEntry->{'exonEnds'},$geneEntry->{'strand'}); | |
| 758 return $type; | |
| 759 } | |
| 760 | |
| 761 sub overlapGenes { | |
| 762 my ($otherGene,$bestGene,$chro,$genes)= @_; | |
| 763 my $a1 = $genes->{$chro}{$otherGene}{'left'}; | |
| 764 my $a2 = $genes->{$chro}{$bestGene}{'left'}; | |
| 765 my $e1 = $genes->{$chro}{$otherGene}{'right'}; | |
| 766 my $e2 = $genes->{$chro}{$bestGene}{'right'}; | |
| 767 if (($a1 >= $a2)&&($a1 <= $e2)) { | |
| 768 return 1; | |
| 769 } | |
| 770 if (($a2 >= $a1)&&($a2 <= $e1)) { | |
| 771 return 1; | |
| 772 } | |
| 773 return 0; | |
| 774 } | |
| 775 | |
| 776 sub printBest { | |
| 777 my ($hash,$chro,$fpos,$lpos,$pos,$score,$genes,$type) = @_; | |
| 778 for my $locDistTSS (sort {$a<=>$b} keys %{$hash}) { | |
| 779 my $gName = $hash->{$locDistTSS}; | |
| 780 my $typeIntra = "NA"; | |
| 781 if (($type eq "intragenic")||($type eq "immediateDownstream")) { | |
| 782 $typeIntra = &getTypeIntra($genes->{$chro}{$gName},$pos); | |
| 783 $typehashIntra{$typeIntra}++; | |
| 784 } | |
| 785 print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$typeIntra\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t",($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'},"\t$gName\n" ; | |
| 786 return $gName; | |
| 787 } | |
| 788 } | |
| 789 | |
| 790 #sub findAllClosestGene { | |
| 791 # my ($fpos,$lpos,$score,$chro,$pos,$genes) = @_; | |
| 792 # my ($distTSS,$type,$reg,$geneName,$foldChange) = ($BIGNUMBER,"intergenic",0,0,0); | |
| 793 # my $locDistTSS = 0; | |
| 794 # my $locDistTE = 0; | |
| 795 # my @array2return; | |
| 796 # for my $gName (keys %{$genes->{$chro}}) { | |
| 797 # my $TSS = $genes->{$chro}{$gName}{'TSS'}; | |
| 798 # my $strand = $genes->{$chro}{$gName}{'strand'}; | |
| 799 # my $TE = $genes->{$chro}{$gName}{'TE'}; | |
| 800 # $locDistTSS = ($pos-$TSS)*$strand; | |
| 801 # $locDistTE = ($pos-$TE)*$strand; | |
| 802 # #print "$pos-$genes->{$chro}{$gName}{'TSS'})*$genes->{$chro}{$gName}{'strand'} = $locDistTSS\n"; | |
| 803 # #my $locDistTE = ($pos-$genes->{$chro}{$gName}{'TE'})*$genes->{$chro}{$gName}{'strand'}; | |
| 804 # | |
| 805 # #print "$gName $maxDistance $pos-$genes->{$chro}{$gName}{'TSS'} $genes->{$chro}{$gName}{'strand'}\n"; | |
| 806 # #print "$locDistTSS $enhLeft $enhRight\n"; | |
| 807 # | |
| 808 # if (($locDistTSS>= $enhLeft)&&($locDistTSS<$enhRight)) { | |
| 809 # $type = "enhancer"; | |
| 810 # } elsif (($locDistTSS>= $enhRight)&&($locDistTSS<=0)) { | |
| 811 # $type = "promoter"; | |
| 812 # } elsif (($locDistTSS> 0)&&($locDistTSS<=$immediateDownstream)) { | |
| 813 # $type = "immediateDownstream"; | |
| 814 # } elsif (($pos >= $genes{$chro}->{$gName}{'firstIntronStart'})&&($pos <= $genes{$chro}->{$gName}{'firstIntronEnd'})) { | |
| 815 # $type = "firstIntron"; | |
| 816 # } elsif (($locDistTSS> $immediateDownstream)&&($locDistTSS<=$genes{$chro}->{$gName}{'length'})) { | |
| 817 # #$type = "intragenic"; | |
| 818 # $type = &getIntronExon ($pos, $genes{$chro}->{$gName}{'exonCount'},$genes{$chro}->{$gName}{'exonStarts'},$genes{$chro}->{$gName}{'exonEnds'},$genes{$chro}->{$gName}{'strand'}); | |
| 819 # } elsif (($locDistTE >= 0)&&($locDistTE<=$downStreamDist)) { | |
| 820 # $type = "5kbDownstream"; | |
| 821 # } elsif (abs($locDistTSS)<abs($distTSS)) { | |
| 822 # $distTSS = $locDistTSS; | |
| 823 # ($reg,$geneName,$foldChange) = ($genes->{$chro}{$gName}{'reg'},$gName,$genes->{$chro}{$gName}{'foldChange'}); | |
| 824 # } | |
| 825 # if (($type ne "intergenic")&&($type ne "NA")) { | |
| 826 # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$locDistTSS\t$type\t$genes->{$chro}{$gName}{'reg'}\t$genes->{$chro}{$gName}{'foldChange'}\t$gName\n" ; | |
| 827 # push(@array2return,$type); | |
| 828 # $type = "NA"; | |
| 829 # #unless ($gName =~ m/$TSS/) { | |
| 830 # # print "$gName\t$TSS\n"; | |
| 831 # #} | |
| 832 # } | |
| 833 # | |
| 834 # } | |
| 835 # if ($type eq "intergenic") { | |
| 836 # print OUT "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$foldChange\t$geneName\n" ; | |
| 837 # push(@array2return,$type); | |
| 838 # #print "$chro\t$fpos\t$lpos\t$pos\t$score\t$distTSS\t$type\t$reg\t$geneName\n" ; | |
| 839 # | |
| 840 # } | |
| 841 # return @array2return; | |
| 842 #} | |
| 843 | |
| 844 sub getFirstIntron { | |
| 845 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
| 846 my ($left,$right); | |
| 847 if ($exonCount == 1) { | |
| 848 return (0,0); | |
| 849 } | |
| 850 if ($strand == 1) { | |
| 851 $left = (split ",", $exonEnds)[0]+$jonctionSize; | |
| 852 $right = (split (",", $exonStarts))[1]-$jonctionSize; | |
| 853 } else { | |
| 854 $left = (split (",", $exonEnds))[$exonCount-2]+$jonctionSize; | |
| 855 $right = (split (",", $exonStarts))[$exonCount-1]-$jonctionSize; | |
| 856 } | |
| 857 ($left,$right); | |
| 858 } | |
| 859 | |
| 860 sub getFirstExon { | |
| 861 my ($exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
| 862 my ($left,$right); | |
| 863 if ($exonCount == 1) { | |
| 864 return (0,0); | |
| 865 } | |
| 866 if ($strand == 1) { | |
| 867 $left = (split ",", $exonStarts)[0]; | |
| 868 $right = (split (",", $exonEnds))[0]-$jonctionSize; | |
| 869 } else { | |
| 870 $left = (split (",", $exonStarts))[$exonCount-1]+$jonctionSize; | |
| 871 $right = (split (",", $exonEnds))[$exonCount-1]; | |
| 872 } | |
| 873 ($left,$right); | |
| 874 } | |
| 875 | |
| 876 sub getIntronExon { | |
| 877 my ($pos,$exonCount,$exonStarts,$exonEnds,$strand) = @_; | |
| 878 my (@left,@right); | |
| 879 @left = (split ",", $exonStarts); | |
| 880 @right = (split (",", $exonEnds)); | |
| 881 | |
| 882 for (my $i = 0; $i<$exonCount;$i++) { | |
| 883 #print "$left[$i] <= $pos ? $pos <= $right[$i]\n"; | |
| 884 if (($left[$i]+$jonctionSize < $pos) && ($pos < $right[$i]-$jonctionSize)) { | |
| 885 #print "URA!\n"; | |
| 886 return "exon"; | |
| 887 } elsif (($i+1<$exonCount)&&($right[$i]+$jonctionSize < $pos) && ($pos < $left[$i+1]-$jonctionSize)) { | |
| 888 return "intron"; | |
| 889 } | |
| 890 } | |
| 891 return "jonction"; | |
| 892 } | |
| 893 | |
| 894 sub min { | |
| 895 return $_[0] if ($_[0]<$_[1]); | |
| 896 $_[1]; | |
| 897 } | |
| 898 sub med { | |
| 899 my @arr = @_; | |
| 900 my $med = 0; | |
| 901 @arr = sort {$a <=> $b} @arr; | |
| 902 if ((scalar(@arr)/2) =~ m/[\.\,]5/) { | |
| 903 return $arr[floor(scalar(@arr)/2)]; | |
| 904 } else { | |
| 905 return ($arr[scalar(@arr)/2]+$arr[scalar(@arr)/2-1])/2; | |
| 906 } | |
| 907 $med; | |
| 908 } | |
| 909 | |
| 910 sub checkIfGC { | |
| 911 my ($TSS,$strand,$dist,$GCislandsChr)=@_; | |
| 912 my $ifGC = 0; | |
| 913 my $leftProm=$TSS-$dist; | |
| 914 my $rightProm = $TSS; | |
| 915 if ($strand== -1) { | |
| 916 my $leftProm=$TSS; | |
| 917 my $rightProm = $TSS+$dist; | |
| 918 } | |
| 919 for my $leftGC (keys %{$GCislandsChr}) { | |
| 920 my $rightGC = $GCislandsChr->{$leftGC}; | |
| 921 if ($leftGC>=$leftProm&&$leftGC<=$rightProm || $rightGC>=$leftProm&&$rightGC<=$rightProm) { | |
| 922 return "GC-island"; | |
| 923 } | |
| 924 } | |
| 925 return $ifGC ; | |
| 926 } |
