Mercurial > repos > eugen > bs_seq_test_1
comparison methylation_extractor @ 17:59a61c1d5aee draft default tip
Uploaded
| author | eugen |
|---|---|
| date | Thu, 16 Aug 2012 08:11:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 16:bfda928ca5d8 | 17:59a61c1d5aee |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 use Getopt::Long; | |
| 3 use Cwd; | |
| 4 | |
| 5 my @filenames; | |
| 6 my %counting; | |
| 7 | |
| 8 my %fhs; | |
| 9 my %bases; | |
| 10 my ($single,$paired,$output_file) = process_commandline(); | |
| 11 my $parent_dir = getcwd; | |
| 12 | |
| 13 process_Bismark_results_file(); | |
| 14 | |
| 15 sub process_commandline{ | |
| 16 my $help; | |
| 17 my $single_end; | |
| 18 my $paired_end; | |
| 19 my $ignore; | |
| 20 my $genomic_fasta; | |
| 21 my $full; | |
| 22 my $report; | |
| 23 my $extractor_version; | |
| 24 my $no_overlap; | |
| 25 my $merge_non_CpG; | |
| 26 my $vanilla; | |
| 27 my $output_file; | |
| 28 | |
| 29 my $command_line = GetOptions ('p|paired-end' => \$paired_end, | |
| 30 's|single-end' => \$single_end, | |
| 31 'o|output=s' => \$output_file | |
| 32 ); | |
| 33 | |
| 34 ### EXIT ON ERROR if there were errors with any of the supplied options | |
| 35 unless ($command_line){ | |
| 36 die "Please respecify command line options\n"; | |
| 37 } | |
| 38 | |
| 39 | |
| 40 ### no files provided | |
| 41 unless (@ARGV){ | |
| 42 die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; | |
| 43 } | |
| 44 @filenames = @ARGV; | |
| 45 | |
| 46 if ($single_end){ | |
| 47 $paired_end = 0; ### SINGLE END ALIGNMENTS | |
| 48 } | |
| 49 elsif ($paired_end){ | |
| 50 $single_end = 0; ### PAIRED-END ALIGNMENTS | |
| 51 } | |
| 52 else{ | |
| 53 die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; | |
| 54 } | |
| 55 | |
| 56 return ($single_end,$paired_end, $output_file); | |
| 57 } | |
| 58 | |
| 59 sub process_Bismark_results_file{ | |
| 60 chdir $parent_dir or die "Unable to move to initial working directory $!\n"; | |
| 61 foreach my $filename (@filenames){ | |
| 62 %fhs = (); | |
| 63 %counting =( | |
| 64 total_meCHG_count => 0, | |
| 65 total_meCHH_count => 0, | |
| 66 total_meCpG_count => 0, | |
| 67 total_unmethylated_CHG_count => 0, | |
| 68 total_unmethylated_CHH_count => 0, | |
| 69 total_unmethylated_CpG_count => 0, | |
| 70 sequences_count => 0, | |
| 71 ); | |
| 72 warn "\nNow reading in Bismark result file $filename\n\n"; | |
| 73 open (IN,$filename) or die "Can't open file $filename: $!\n"; | |
| 74 | |
| 75 if($output_file){ | |
| 76 $full_output=$output_file; | |
| 77 } | |
| 78 else{ | |
| 79 #print "lbubb"; | |
| 80 my($dir, $output_filename, $extension) = $filename =~ m/(.*\/)(.*)(\..*)$/; | |
| 81 my $cpg_output = my $other_c_output = my $full_output = $output_filename; | |
| 82 $full_output =~ s/^/Full_/; | |
| 83 $full_output = $dir."/".$full_output.".bed"; | |
| 84 } | |
| 85 | |
| 86 ### OPENING OUTPUT-FILEHANDLES | |
| 87 open ($fhs{full_output},'>',$full_output) or die "Failed to write to $full_output $!\n"; | |
| 88 | |
| 89 ### Write header | |
| 90 printf {$fhs{full_output}} ("chr\tpos\tstrand\tcontext\tcoverage\tmethylated\tpercentage methylated\n"); | |
| 91 | |
| 92 my $methylation_call_strings_processed = 0; | |
| 93 my $line_count = 0; | |
| 94 | |
| 95 | |
| 96 ### PROCESSING SINGLE-END RESULT FILES | |
| 97 if ($single){ | |
| 98 | |
| 99 while (<IN>){ | |
| 100 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
| 101 if (/^\@/){ # skipping header lines (starting with @) | |
| 102 warn "skipping SAM header line:\t$_"; | |
| 103 next; | |
| 104 } | |
| 105 | |
| 106 ++$line_count; | |
| 107 warn "Processed lines: $line_count\n" if ($line_count%500000==0); | |
| 108 | |
| 109 my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
| 110 | |
| 111 $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); | |
| 112 $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); | |
| 113 | |
| 114 if ("\U$chrom" !~ /CHR.*/){ | |
| 115 $chrom = "chr".$chrom; | |
| 116 } | |
| 117 | |
| 118 if (!defined $last_chrom){ | |
| 119 print "Now parsing data from $chrom\n"; | |
| 120 } | |
| 121 | |
| 122 if (defined($last_chrom) and $last_chrom ne $chrom){ | |
| 123 print "Parsed: $last_chrom\n"; | |
| 124 write_hash($last_chrom); | |
| 125 print "Now parsing data from $chrom\n"; | |
| 126 } | |
| 127 | |
| 128 $meth_call =~ s/^XM:Z://; | |
| 129 $read_conversion =~ s/^XR:Z://; | |
| 130 $genome_conversion =~ s/^XG:Z://; | |
| 131 my $strand; | |
| 132 chomp $genome_conversion; | |
| 133 | |
| 134 | |
| 135 my $index; | |
| 136 if ($meth_call){ | |
| 137 if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand | |
| 138 $index = 0; | |
| 139 $strand = '+'; | |
| 140 } | |
| 141 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand | |
| 142 $index = 1; | |
| 143 $strand = '-'; | |
| 144 } | |
| 145 elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand | |
| 146 $index = 2; | |
| 147 $strand = '+'; | |
| 148 } | |
| 149 elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand | |
| 150 $index = 3; | |
| 151 $strand = '-'; | |
| 152 } | |
| 153 else { | |
| 154 die "Unexpected combination of read and genome conversion: $read_conversion / $genome_conversion\n"; | |
| 155 } | |
| 156 | |
| 157 ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output | |
| 158 if ($strand eq '-'){ | |
| 159 $meth_call = reverse $meth_call; | |
| 160 } | |
| 161 } | |
| 162 | |
| 163 | |
| 164 ### printing out the methylation state of every C in the read | |
| 165 hash_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); | |
| 166 | |
| 167 ++$methylation_call_strings_processed; # 1 per single-end result | |
| 168 | |
| 169 $last_chrom = $chrom; | |
| 170 } | |
| 171 write_hash($last_chrom); | |
| 172 print "\n\nDone!"; | |
| 173 } | |
| 174 | |
| 175 ### PROCESSING PAIRED-END RESULT FILES | |
| 176 elsif ($paired){ | |
| 177 while (<IN>){ | |
| 178 ### SAM format can either start with header lines (starting with @) or start with alignments directly | |
| 179 if (/^\@/){ # skipping header lines (starting with @) | |
| 180 warn "skipping SAM header line:\t$_"; | |
| 181 next; | |
| 182 } | |
| 183 | |
| 184 ++$line_count; | |
| 185 # example paired-end reads in SAM format (2 consecutive lines) | |
| 186 # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT | |
| 187 # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT | |
| 188 | |
| 189 | |
| 190 my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; | |
| 191 $_ = <IN>; # reading in the paired read | |
| 192 my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; | |
| 193 | |
| 194 | |
| 195 $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); | |
| 196 $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); | |
| 197 if ("\U$chrom" !~ /CHR.*/){ | |
| 198 $chrom = "chr".$chrom; | |
| 199 } | |
| 200 if (!defined $last_chrom){ | |
| 201 print "Now parsing data from $chrom\n"; | |
| 202 } | |
| 203 if (defined($last_chrom) and $last_chrom ne $chrom){ | |
| 204 print "Parsed: $last_chrom\n"; | |
| 205 write_hash($last_chrom); | |
| 206 print "Now parsing data from $chrom\n"; | |
| 207 } | |
| 208 $meth_call_1 =~ s/^XM:Z://; | |
| 209 $meth_call_2 =~ s/^XM:Z://; | |
| 210 $first_read_conversion =~ s/^XR:Z://; | |
| 211 $second_read_conversion =~ s/^XR:Z://; | |
| 212 $genome_conversion =~ s/^XG:Z://; | |
| 213 chomp $genome_conversion; | |
| 214 | |
| 215 | |
| 216 my $index; | |
| 217 my $strand; | |
| 218 | |
| 219 if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ | |
| 220 $index = 0; ## this is OT | |
| 221 $strand = '+'; | |
| 222 } | |
| 223 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ | |
| 224 $index = 1; ## this is CTOT | |
| 225 $strand = '-'; | |
| 226 } | |
| 227 elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ | |
| 228 $index = 2; ## this is CTOB | |
| 229 $strand = '+'; | |
| 230 } | |
| 231 elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ | |
| 232 $index = 3; ## this is OB | |
| 233 $strand = '-'; | |
| 234 } | |
| 235 else { | |
| 236 die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; | |
| 237 } | |
| 238 | |
| 239 ### reversing the methylation call of the read that was reverse-complemented | |
| 240 if ($strand eq '+'){ | |
| 241 # $meth_call_2 = reverse $meth_call_2; | |
| 242 } | |
| 243 else{ | |
| 244 #$meth_call_1 = reverse $meth_call_1; | |
| 245 } | |
| 246 | |
| 247 if ($meth_call_1 and $meth_call_2){ | |
| 248 ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' | |
| 249 my $end_read_1; | |
| 250 | |
| 251 if ($strand eq '+'){ | |
| 252 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
| 253 #$start_read_2 += length($meth_call_2)-1; ## $meth_call_2 is already shortened! Passing on the start position on the reverse strand | |
| 254 } | |
| 255 else{ | |
| 256 $end_read_1 = $start_read_1+length($meth_call_1)-1; | |
| 257 #$end_read_1 = $start_read_1; | |
| 258 #$start_read_1 += length($meth_call_1)-1; ## $meth_call_1 is already shortened! Passing on the start position on the reverse strand | |
| 259 } | |
| 260 | |
| 261 if ($strand eq '+'){ | |
| 262 ## we first pass the first read which is in + orientation on the forward strand | |
| 263 hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0); | |
| 264 | |
| 265 # we next pass the second read which is in - orientation on the reverse strand | |
| 266 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 | |
| 267 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$end_read_1); | |
| 268 } | |
| 269 | |
| 270 else{ | |
| 271 ## we first pass the first read which is in - orientation on the reverse strand | |
| 272 hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0); | |
| 273 | |
| 274 # we next pass the second read which is in + orientation on the forward strand | |
| 275 ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 | |
| 276 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$end_read_1); | |
| 277 } | |
| 278 | |
| 279 $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls | |
| 280 } | |
| 281 $last_chrom = $chrom; | |
| 282 } | |
| 283 write_hash($last_chrom); | |
| 284 print "\n\nDone!"; | |
| 285 } | |
| 286 else{ | |
| 287 die "Single-end or paired-end reads not specified properly\n"; | |
| 288 } | |
| 289 | |
| 290 print "\n\nProcessed $line_count lines from $filename in total\n"; | |
| 291 print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; | |
| 292 } | |
| 293 } | |
| 294 | |
| 295 | |
| 296 sub hash_individual_C_methylation_states_paired_end_files{ | |
| 297 | |
| 298 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$end_read_1) = @_; | |
| 299 my @methylation_calls = split(//,$meth_call); | |
| 300 | |
| 301 ################################################################# | |
| 302 ### . for bases not involving cytosines ### | |
| 303 ### X for methylated C in CHG context (was protected) ### | |
| 304 ### x for not methylated C in CHG context (was converted) ### | |
| 305 ### H for methylated C in CHH context (was protected) ### | |
| 306 ### h for not methylated C in CHH context (was converted) ### | |
| 307 ### Z for methylated C in CpG context (was protected) ### | |
| 308 ### z for not methylated C in CpG context (was converted) ### | |
| 309 ################################################################# | |
| 310 | |
| 311 #my $methyl_CHG_count = 0; | |
| 312 #my $methyl_CHH_count = 0; | |
| 313 #my $methyl_CpG_count = 0; | |
| 314 #my $unmethylated_CHG_count = 0; | |
| 315 #my $unmethylated_CHH_count = 0; | |
| 316 #my $unmethylated_CpG_count = 0; | |
| 317 | |
| 318 ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and $methylation_calls[$indexnon-CpG) SECTION IF --merge_non_CpG was specified | |
| 319 | |
| 320 if ($strand eq '+') { | |
| 321 for my $index (0..$#methylation_calls) { | |
| 322 | |
| 323 ### Returning as soon as the methylation calls start overlapping | |
| 324 #if ($start+$index >= $end_read_1) { | |
| 325 # return; | |
| 326 #} | |
| 327 | |
| 328 if ($methylation_calls[$index] eq 'X') { | |
| 329 #$counting{total_meCHG_count}++; | |
| 330 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 331 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 332 }else{ | |
| 333 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 334 } | |
| 335 } elsif ($methylation_calls[$index] eq 'x') { | |
| 336 #$counting{total_unmethylated_CHG_count}++; | |
| 337 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 338 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 339 }else{ | |
| 340 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 341 } | |
| 342 } elsif ($methylation_calls[$index] eq 'Z') { | |
| 343 #$counting{total_meCpG_count}++; | |
| 344 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 345 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 346 }else{ | |
| 347 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 348 } | |
| 349 } elsif ($methylation_calls[$index] eq 'z') { | |
| 350 #$counting{total_unmethylated_CpG_count}++; | |
| 351 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 352 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 353 }else{ | |
| 354 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 355 } | |
| 356 } elsif ($methylation_calls[$index] eq 'H') { | |
| 357 #$counting{total_meCHH_count}++; | |
| 358 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 359 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 360 }else{ | |
| 361 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 362 } | |
| 363 } elsif ($methylation_calls[$index] eq 'h') { | |
| 364 #$counting{total_unmethylated_CHH_count}++; | |
| 365 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 366 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 367 }else{ | |
| 368 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 369 } | |
| 370 } | |
| 371 elsif ($methylation_calls[$index] eq '.'){} | |
| 372 else{ | |
| 373 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 374 } | |
| 375 } | |
| 376 } elsif ($strand eq '-') { | |
| 377 for my $index (0..$#methylation_calls) { | |
| 378 | |
| 379 ### Returning as soon as the methylation calls start overlapping | |
| 380 #if ($start-$index <= $end_read_1) { | |
| 381 # return; | |
| 382 #} | |
| 383 =head1 | |
| 384 if ($start-$index == 100){ | |
| 385 print "\nStrand: - ID: $id\n"; | |
| 386 } | |
| 387 | |
| 388 if ($methylation_calls[$index] eq 'X') { | |
| 389 #$counting{total_meCHG_count}++; | |
| 390 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 391 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 392 }else{ | |
| 393 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 394 } | |
| 395 } elsif ($methylation_calls[$index] eq 'x') { | |
| 396 #$counting{total_unmethylated_CHG_count}++; | |
| 397 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 398 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 399 }else{ | |
| 400 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 401 } | |
| 402 } elsif ($methylation_calls[$index] eq 'Z') { | |
| 403 #$counting{total_meCpG_count}++; | |
| 404 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 405 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 406 }else{ | |
| 407 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 408 } | |
| 409 } elsif ($methylation_calls[$index] eq 'z') { | |
| 410 #$counting{total_unmethylated_CpG_count}++; | |
| 411 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 412 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 413 }else{ | |
| 414 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 415 } | |
| 416 } elsif ($methylation_calls[$index] eq 'H') { | |
| 417 # $counting{total_meCHH_count}++; | |
| 418 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 419 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 420 }else{ | |
| 421 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 422 } | |
| 423 } elsif ($methylation_calls[$index] eq 'h') { | |
| 424 #$counting{total_unmethylated_CHH_count}++; | |
| 425 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 426 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 427 }else{ | |
| 428 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 429 } | |
| 430 } | |
| 431 elsif ($methylation_calls[$index] eq '.') {} | |
| 432 else{ | |
| 433 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 434 } | |
| 435 } | |
| 436 =cut | |
| 437 if ($methylation_calls[$index] eq 'X') { | |
| 438 #$counting{total_meCHG_count}++; | |
| 439 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 440 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 441 }else{ | |
| 442 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 443 } | |
| 444 } elsif ($methylation_calls[$index] eq 'x') { | |
| 445 #$counting{total_unmethylated_CHG_count}++; | |
| 446 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 447 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 448 }else{ | |
| 449 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 450 } | |
| 451 } elsif ($methylation_calls[$index] eq 'Z') { | |
| 452 #$counting{total_meCpG_count}++; | |
| 453 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 454 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 455 }else{ | |
| 456 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 457 } | |
| 458 } elsif ($methylation_calls[$index] eq 'z') { | |
| 459 #$counting{total_unmethylated_CpG_count}++; | |
| 460 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 461 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 462 }else{ | |
| 463 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 464 } | |
| 465 } elsif ($methylation_calls[$index] eq 'H') { | |
| 466 # $counting{total_meCHH_count}++; | |
| 467 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 468 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 469 }else{ | |
| 470 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 471 } | |
| 472 } elsif ($methylation_calls[$index] eq 'h') { | |
| 473 #$counting{total_unmethylated_CHH_count}++; | |
| 474 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 475 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 476 }else{ | |
| 477 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 478 } | |
| 479 } | |
| 480 elsif ($methylation_calls[$index] eq '.') {} | |
| 481 else{ | |
| 482 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 483 } | |
| 484 } | |
| 485 } else { | |
| 486 die "This cannot happen $!\n"; | |
| 487 } | |
| 488 | |
| 489 =head1 | |
| 490 if ($strand eq '+') { | |
| 491 for my $index (0..$#methylation_calls) { | |
| 492 | |
| 493 ### Returning as soon as the methylation calls start overlapping | |
| 494 #if ($start+$index >= $end_read_1) { | |
| 495 # return; | |
| 496 #} | |
| 497 if ($methylation_calls[$index] eq 'X') { | |
| 498 #$counting{total_meCHG_count}++; | |
| 499 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 500 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 501 }else{ | |
| 502 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 503 } | |
| 504 } elsif ($methylation_calls[$index] eq 'x') { | |
| 505 #$counting{total_unmethylated_CHG_count}++; | |
| 506 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 507 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 508 }else{ | |
| 509 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 510 } | |
| 511 } elsif ($methylation_calls[$index] eq 'Z') { | |
| 512 #$counting{total_meCpG_count}++; | |
| 513 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 514 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 515 }else{ | |
| 516 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 517 } | |
| 518 } elsif ($methylation_calls[$index] eq 'z') { | |
| 519 #$counting{total_unmethylated_CpG_count}++; | |
| 520 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 521 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 522 }else{ | |
| 523 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 524 } | |
| 525 } elsif ($methylation_calls[$index] eq 'H') { | |
| 526 #$counting{total_meCHH_count}++; | |
| 527 if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ | |
| 528 $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; | |
| 529 }else{ | |
| 530 $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; | |
| 531 } | |
| 532 } elsif ($methylation_calls[$index] eq 'h') { | |
| 533 #$counting{total_unmethylated_CHH_count}++; | |
| 534 if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ | |
| 535 $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; | |
| 536 }else{ | |
| 537 $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; | |
| 538 } | |
| 539 } | |
| 540 elsif ($methylation_calls[$index] eq '.'){} | |
| 541 else{ | |
| 542 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 543 } | |
| 544 } | |
| 545 } elsif ($strand eq '-') { | |
| 546 for my $index (0..$#methylation_calls) { | |
| 547 | |
| 548 ### Returning as soon as the methylation calls start overlapping | |
| 549 #if ($start-$index <= $end_read_1) { | |
| 550 # return; | |
| 551 #} | |
| 552 | |
| 553 if ($methylation_calls[$index] eq 'X') { | |
| 554 #$counting{total_meCHG_count}++; | |
| 555 if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ | |
| 556 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; | |
| 557 }else{ | |
| 558 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; | |
| 559 } | |
| 560 } elsif ($methylation_calls[$index] eq 'x') { | |
| 561 #$counting{total_unmethylated_CHG_count}++; | |
| 562 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 563 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 564 }else{ | |
| 565 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 566 } | |
| 567 } elsif ($methylation_calls[$index] eq 'Z') { | |
| 568 #$counting{total_meCpG_count}++; | |
| 569 if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ | |
| 570 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; | |
| 571 }else{ | |
| 572 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; | |
| 573 } | |
| 574 } elsif ($methylation_calls[$index] eq 'z') { | |
| 575 #$counting{total_unmethylated_CpG_count}++; | |
| 576 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 577 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 578 }else{ | |
| 579 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 580 } | |
| 581 } elsif ($methylation_calls[$index] eq 'H') { | |
| 582 # $counting{total_meCHH_count}++; | |
| 583 if (defined $bases{$start-$=cutindex}{"+"}{$methylation_calls[$index]}){ | |
| 584 $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; | |
| 585 }else{ | |
| 586 $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; | |
| 587 } | |
| 588 } elsif ($methylation_calls[$index] eq 'h') { | |
| 589 #$counting{total_unmethylated_CHH_count}++; | |
| 590 if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ | |
| 591 $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; | |
| 592 }else{ | |
| 593 $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; | |
| 594 } | |
| 595 } | |
| 596 elsif ($methylation_calls[$index] eq '.') {} | |
| 597 else{ | |
| 598 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 599 } | |
| 600 } | |
| 601 } else { | |
| 602 die "This cannot happen $!\n"; | |
| 603 } | |
| 604 =cut | |
| 605 } | |
| 606 | |
| 607 | |
| 608 | |
| 609 sub hash_individual_C_methylation_states_single_end{ | |
| 610 | |
| 611 my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index) = @_; | |
| 612 my @methylation_calls = split(//,$meth_call); | |
| 613 | |
| 614 ################################################################# | |
| 615 ### . for bases not involving cytosines ### | |
| 616 ### X for methylated C in CHG context (was protected) ### | |
| 617 ### x for not methylated C in CHG context (was converted) ### | |
| 618 ### H for methylated C in CHH context (was protected) ### | |
| 619 ### h for not methylated C in CHH context (was converted) ### | |
| 620 ### Z for methylated C in CpG context (was protected) ### | |
| 621 ### z for not methylated C in CpG context (was converted) ### | |
| 622 ################################################################# | |
| 623 | |
| 624 #my $methyl_CHG_count = 0; | |
| 625 #my $methyl_CHH_count = 0; | |
| 626 #my $methyl_CpG_count = 0; | |
| 627 #my $unmethylated_CHG_count = 0; | |
| 628 #my $unmethylated_CHH_count = 0; | |
| 629 #my $unmethylated_CpG_count = 0; | |
| 630 | |
| 631 ### single-file CpG and other-context output | |
| 632 if ($strand eq '+') { | |
| 633 for my $index (0..$#methylation_calls) { | |
| 634 | |
| 635 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 636 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 637 if ($methylation_calls[$index] eq 'X') { | |
| 638 #$counting{total_meCHG_count}++; | |
| 639 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
| 640 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
| 641 }else{ | |
| 642 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
| 643 } | |
| 644 } | |
| 645 elsif ($methylation_calls[$index] eq 'x') { | |
| 646 #$counting{total_unmethylated_CHG_count}++; | |
| 647 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
| 648 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
| 649 }else{ | |
| 650 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
| 651 } | |
| 652 } | |
| 653 elsif ($methylation_calls[$index] eq 'Z') { | |
| 654 #$counting{total_meCpG_count}++; | |
| 655 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
| 656 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
| 657 }else{ | |
| 658 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
| 659 } | |
| 660 } | |
| 661 elsif ($methylation_calls[$index] eq 'z') { | |
| 662 #$counting{total_unmethylated_CpG_count}++; | |
| 663 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
| 664 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
| 665 }else{ | |
| 666 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
| 667 } | |
| 668 } | |
| 669 elsif ($methylation_calls[$index] eq 'H') { | |
| 670 #$counting{total_meCHH_count}++; | |
| 671 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
| 672 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
| 673 }else{ | |
| 674 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
| 675 } | |
| 676 } | |
| 677 elsif ($methylation_calls[$index] eq 'h') { | |
| 678 #$counting{total_unmethylated_CHH_count}++; | |
| 679 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ | |
| 680 $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; | |
| 681 }else{ | |
| 682 $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; | |
| 683 } | |
| 684 } | |
| 685 elsif ($methylation_calls[$index] eq '.') { | |
| 686 } | |
| 687 else{ | |
| 688 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 689 } | |
| 690 } | |
| 691 } | |
| 692 elsif ($strand eq '-') { | |
| 693 | |
| 694 unless ($ignore){ ### if --ignore was specified the start position has already been corrected | |
| 695 $start += length($meth_call)-1; | |
| 696 } | |
| 697 | |
| 698 for my $index (0..$#methylation_calls) { | |
| 699 ### methylated Cs (any context) will receive a forward (+) orientation | |
| 700 ### not methylated Cs (any context) will receive a reverse (-) orientation | |
| 701 if ($methylation_calls[$index] eq 'X') { | |
| 702 #$counting{total_meCHG_count}++; | |
| 703 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
| 704 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
| 705 }else{ | |
| 706 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
| 707 } | |
| 708 } | |
| 709 elsif ($methylation_calls[$index] eq 'x') { | |
| 710 #$counting{total_unmethylated_CHG_count}++; | |
| 711 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
| 712 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
| 713 }else{ | |
| 714 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
| 715 } | |
| 716 } | |
| 717 elsif ($methylation_calls[$index] eq 'Z') { | |
| 718 #$counting{total_meCpG_count}++; | |
| 719 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
| 720 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
| 721 }else{ | |
| 722 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
| 723 } | |
| 724 } | |
| 725 elsif ($methylation_calls[$index] eq 'z') { | |
| 726 #$counting{total_unmethylated_CpG_count}++; | |
| 727 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
| 728 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
| 729 }else{ | |
| 730 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
| 731 } | |
| 732 } | |
| 733 elsif ($methylation_calls[$index] eq 'H') { | |
| 734 #$counting{total_meCHH_count}++; | |
| 735 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
| 736 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
| 737 }else{ | |
| 738 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
| 739 } | |
| 740 } | |
| 741 elsif ($methylation_calls[$index] eq 'h') { | |
| 742 #$counting{total_unmethylated_CHH_count}++; | |
| 743 if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ | |
| 744 $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; | |
| 745 }else{ | |
| 746 $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; | |
| 747 } | |
| 748 } | |
| 749 elsif ($methylation_calls[$index] eq '.'){ | |
| 750 } | |
| 751 else{ | |
| 752 die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; | |
| 753 } | |
| 754 } | |
| 755 } | |
| 756 else { | |
| 757 die "This cannot happen (or at least it shouldn't....)\n"; | |
| 758 } | |
| 759 } | |
| 760 | |
| 761 sub write_hash(){ | |
| 762 my ($chrom) = @_; | |
| 763 | |
| 764 for my $k1 (sort {$a <=> $b} (keys %bases)){ | |
| 765 my $coverage = 0; | |
| 766 my $methylated = 0; | |
| 767 my $context; | |
| 768 for my $k2 (keys %{$bases{$k1}}){ | |
| 769 | |
| 770 for my $k3 (keys %{$bases{$k1}{$k2}}){ | |
| 771 if ($k3 eq "Z" or $k3 eq "z"){ | |
| 772 $context = "CPG"; | |
| 773 }elsif ($k3 eq "H" or $k3 eq "h"){ | |
| 774 $context = "CHH"; | |
| 775 }elsif ($k3 eq "X" or $k3 eq "x"){ | |
| 776 $context = "CHG"; | |
| 777 } | |
| 778 | |
| 779 #test, if $k3 is upper-case (==methylated base) | |
| 780 if ($k3 eq "\U$k3"){ | |
| 781 $methylated+=$bases{$k1}{$k2}{$k3}; | |
| 782 } | |
| 783 $coverage+=$bases{$k1}{$k2}{$k3}; | |
| 784 | |
| 785 } | |
| 786 my $percentage = ($methylated / $coverage)*100; | |
| 787 | |
| 788 ##################################################################################### | |
| 789 ### BED file format tabs: ### | |
| 790 ### 1. Chromosome ### | |
| 791 ### 2. Start position ### | |
| 792 ### 3. Strand ### | |
| 793 ### 4. Context (CpG,CHH,CHG) ### | |
| 794 ### 5. Coverage ### | |
| 795 ### 6. Methylated Cs ### | |
| 796 ### 7. Percentage methylated Cs ### | |
| 797 ##################################################################################### | |
| 798 | |
| 799 printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); | |
| 800 } | |
| 801 } | |
| 802 %bases = (); | |
| 803 } | |
| 804 | |
| 805 | |
| 806 sub write_hash_back(){ | |
| 807 my ($chrom) = @_; | |
| 808 | |
| 809 for my $k1 (sort (keys %bases)){ | |
| 810 my $coverage = 0; | |
| 811 my $methylated = 0; | |
| 812 my $context; | |
| 813 my $k2 = (keys %{$bases{$k1}})[0]; | |
| 814 for my $k3 (keys %{$bases{$k1}{$k2}}){ | |
| 815 if ($k3 eq "Z" or $k3 eq "z"){ | |
| 816 $context = "CPG"; | |
| 817 }elsif ($k3 eq "H" or $k3 eq "h"){ | |
| 818 $context = "CHH"; | |
| 819 }elsif ($k3 eq "X" or $k3 eq "x"){ | |
| 820 $context = "CHG"; | |
| 821 } | |
| 822 | |
| 823 #test, if $k3 is upper-case (==methylated base) | |
| 824 if ($k3 eq "\U$k3"){ | |
| 825 $methylated+=$bases{$k1}{$k2}{$k3}; | |
| 826 } | |
| 827 $coverage+=$bases{$k1}{$k2}{$k3}; | |
| 828 | |
| 829 } | |
| 830 my $percentage = ($methylated / $coverage)*100; | |
| 831 | |
| 832 ################################################################################### | |
| 833 ### BED file format tabs: ### | |
| 834 ### 1. Chromosome ### | |
| 835 ### 2. Start position ### | |
| 836 ### 3. Strand ### | |
| 837 ### 4. Context (CpG,CHH,CHG) ### | |
| 838 ### 5. Coverage ### | |
| 839 ### 6. Methylated Cs ### | |
| 840 ### 7. Percentage methylated Cs ### | |
| 841 ################################################################################### | |
| 842 | |
| 843 printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); | |
| 844 | |
| 845 } | |
| 846 %bases = (); | |
| 847 } | |
| 848 | |
| 849 | |
| 850 |
