Mercurial > repos > eugen > bs_seq_test_1
view methylation_extractor @ 17:59a61c1d5aee draft default tip
Uploaded
author | eugen |
---|---|
date | Thu, 16 Aug 2012 08:11:06 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use Getopt::Long; use Cwd; my @filenames; my %counting; my %fhs; my %bases; my ($single,$paired,$output_file) = process_commandline(); my $parent_dir = getcwd; process_Bismark_results_file(); sub process_commandline{ my $help; my $single_end; my $paired_end; my $ignore; my $genomic_fasta; my $full; my $report; my $extractor_version; my $no_overlap; my $merge_non_CpG; my $vanilla; my $output_file; my $command_line = GetOptions ('p|paired-end' => \$paired_end, 's|single-end' => \$single_end, 'o|output=s' => \$output_file ); ### EXIT ON ERROR if there were errors with any of the supplied options unless ($command_line){ die "Please respecify command line options\n"; } ### no files provided unless (@ARGV){ die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; } @filenames = @ARGV; if ($single_end){ $paired_end = 0; ### SINGLE END ALIGNMENTS } elsif ($paired_end){ $single_end = 0; ### PAIRED-END ALIGNMENTS } else{ die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; } return ($single_end,$paired_end, $output_file); } sub process_Bismark_results_file{ chdir $parent_dir or die "Unable to move to initial working directory $!\n"; foreach my $filename (@filenames){ %fhs = (); %counting =( total_meCHG_count => 0, total_meCHH_count => 0, total_meCpG_count => 0, total_unmethylated_CHG_count => 0, total_unmethylated_CHH_count => 0, total_unmethylated_CpG_count => 0, sequences_count => 0, ); warn "\nNow reading in Bismark result file $filename\n\n"; open (IN,$filename) or die "Can't open file $filename: $!\n"; if($output_file){ $full_output=$output_file; } else{ #print "lbubb"; my($dir, $output_filename, $extension) = $filename =~ m/(.*\/)(.*)(\..*)$/; my $cpg_output = my $other_c_output = my $full_output = $output_filename; $full_output =~ s/^/Full_/; $full_output = $dir."/".$full_output.".bed"; } ### OPENING OUTPUT-FILEHANDLES open ($fhs{full_output},'>',$full_output) or die "Failed to write to $full_output $!\n"; ### Write header printf {$fhs{full_output}} ("chr\tpos\tstrand\tcontext\tcoverage\tmethylated\tpercentage methylated\n"); my $methylation_call_strings_processed = 0; my $line_count = 0; ### PROCESSING SINGLE-END RESULT FILES if ($single){ while (<IN>){ ### SAM format can either start with header lines (starting with @) or start with alignments directly if (/^\@/){ # skipping header lines (starting with @) warn "skipping SAM header line:\t$_"; next; } ++$line_count; warn "Processed lines: $line_count\n" if ($line_count%500000==0); my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); if ("\U$chrom" !~ /CHR.*/){ $chrom = "chr".$chrom; } if (!defined $last_chrom){ print "Now parsing data from $chrom\n"; } if (defined($last_chrom) and $last_chrom ne $chrom){ print "Parsed: $last_chrom\n"; write_hash($last_chrom); print "Now parsing data from $chrom\n"; } $meth_call =~ s/^XM:Z://; $read_conversion =~ s/^XR:Z://; $genome_conversion =~ s/^XG:Z://; my $strand; chomp $genome_conversion; my $index; if ($meth_call){ if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand $index = 0; $strand = '+'; } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand $index = 1; $strand = '-'; } elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand $index = 2; $strand = '+'; } elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand $index = 3; $strand = '-'; } else { die "Unexpected combination of read and genome conversion: $read_conversion / $genome_conversion\n"; } ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output if ($strand eq '-'){ $meth_call = reverse $meth_call; } } ### printing out the methylation state of every C in the read hash_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); ++$methylation_call_strings_processed; # 1 per single-end result $last_chrom = $chrom; } write_hash($last_chrom); print "\n\nDone!"; } ### PROCESSING PAIRED-END RESULT FILES elsif ($paired){ while (<IN>){ ### SAM format can either start with header lines (starting with @) or start with alignments directly if (/^\@/){ # skipping header lines (starting with @) warn "skipping SAM header line:\t$_"; next; } ++$line_count; # example paired-end reads in SAM format (2 consecutive lines) # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; $_ = <IN>; # reading in the paired read my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); if ("\U$chrom" !~ /CHR.*/){ $chrom = "chr".$chrom; } if (!defined $last_chrom){ print "Now parsing data from $chrom\n"; } if (defined($last_chrom) and $last_chrom ne $chrom){ print "Parsed: $last_chrom\n"; write_hash($last_chrom); print "Now parsing data from $chrom\n"; } $meth_call_1 =~ s/^XM:Z://; $meth_call_2 =~ s/^XM:Z://; $first_read_conversion =~ s/^XR:Z://; $second_read_conversion =~ s/^XR:Z://; $genome_conversion =~ s/^XG:Z://; chomp $genome_conversion; my $index; my $strand; if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ $index = 0; ## this is OT $strand = '+'; } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ $index = 1; ## this is CTOT $strand = '-'; } elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ $index = 2; ## this is CTOB $strand = '+'; } elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ $index = 3; ## this is OB $strand = '-'; } else { die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; } ### reversing the methylation call of the read that was reverse-complemented if ($strand eq '+'){ # $meth_call_2 = reverse $meth_call_2; } else{ #$meth_call_1 = reverse $meth_call_1; } if ($meth_call_1 and $meth_call_2){ ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' my $end_read_1; if ($strand eq '+'){ $end_read_1 = $start_read_1+length($meth_call_1)-1; #$start_read_2 += length($meth_call_2)-1; ## $meth_call_2 is already shortened! Passing on the start position on the reverse strand } else{ $end_read_1 = $start_read_1+length($meth_call_1)-1; #$end_read_1 = $start_read_1; #$start_read_1 += length($meth_call_1)-1; ## $meth_call_1 is already shortened! Passing on the start position on the reverse strand } if ($strand eq '+'){ ## we first pass the first read which is in + orientation on the forward strand hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0); # we next pass the second read which is in - orientation on the reverse strand ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$end_read_1); } else{ ## we first pass the first read which is in - orientation on the reverse strand hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0); # we next pass the second read which is in + orientation on the forward strand ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$end_read_1); } $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls } $last_chrom = $chrom; } write_hash($last_chrom); print "\n\nDone!"; } else{ die "Single-end or paired-end reads not specified properly\n"; } print "\n\nProcessed $line_count lines from $filename in total\n"; print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; } } sub hash_individual_C_methylation_states_paired_end_files{ my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$end_read_1) = @_; my @methylation_calls = split(//,$meth_call); ################################################################# ### . for bases not involving cytosines ### ### X for methylated C in CHG context (was protected) ### ### x for not methylated C in CHG context (was converted) ### ### H for methylated C in CHH context (was protected) ### ### h for not methylated C in CHH context (was converted) ### ### Z for methylated C in CpG context (was protected) ### ### z for not methylated C in CpG context (was converted) ### ################################################################# #my $methyl_CHG_count = 0; #my $methyl_CHH_count = 0; #my $methyl_CpG_count = 0; #my $unmethylated_CHG_count = 0; #my $unmethylated_CHH_count = 0; #my $unmethylated_CpG_count = 0; ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and $methylation_calls[$indexnon-CpG) SECTION IF --merge_non_CpG was specified if ($strand eq '+') { for my $index (0..$#methylation_calls) { ### Returning as soon as the methylation calls start overlapping #if ($start+$index >= $end_read_1) { # return; #} if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { #$counting{total_meCHH_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.'){} else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } } elsif ($strand eq '-') { for my $index (0..$#methylation_calls) { ### Returning as soon as the methylation calls start overlapping #if ($start-$index <= $end_read_1) { # return; #} =head1 if ($start-$index == 100){ print "\nStrand: - ID: $id\n"; } if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { # $counting{total_meCHH_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.') {} else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } =cut if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { # $counting{total_meCHH_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.') {} else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } } else { die "This cannot happen $!\n"; } =head1 if ($strand eq '+') { for my $index (0..$#methylation_calls) { ### Returning as soon as the methylation calls start overlapping #if ($start+$index >= $end_read_1) { # return; #} if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { #$counting{total_meCHH_count}++; if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.'){} else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } } elsif ($strand eq '-') { for my $index (0..$#methylation_calls) { ### Returning as soon as the methylation calls start overlapping #if ($start-$index <= $end_read_1) { # return; #} if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { # $counting{total_meCHH_count}++; if (defined $bases{$start-$=cutindex}{"+"}{$methylation_calls[$index]}){ $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.') {} else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } } else { die "This cannot happen $!\n"; } =cut } sub hash_individual_C_methylation_states_single_end{ my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index) = @_; my @methylation_calls = split(//,$meth_call); ################################################################# ### . for bases not involving cytosines ### ### X for methylated C in CHG context (was protected) ### ### x for not methylated C in CHG context (was converted) ### ### H for methylated C in CHH context (was protected) ### ### h for not methylated C in CHH context (was converted) ### ### Z for methylated C in CpG context (was protected) ### ### z for not methylated C in CpG context (was converted) ### ################################################################# #my $methyl_CHG_count = 0; #my $methyl_CHH_count = 0; #my $methyl_CpG_count = 0; #my $unmethylated_CHG_count = 0; #my $unmethylated_CHH_count = 0; #my $unmethylated_CpG_count = 0; ### single-file CpG and other-context output if ($strand eq '+') { for my $index (0..$#methylation_calls) { ### methylated Cs (any context) will receive a forward (+) orientation ### not methylated Cs (any context) will receive a reverse (-) orientation if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { #$counting{total_meCHH_count}++; if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.') { } else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } } elsif ($strand eq '-') { unless ($ignore){ ### if --ignore was specified the start position has already been corrected $start += length($meth_call)-1; } for my $index (0..$#methylation_calls) { ### methylated Cs (any context) will receive a forward (+) orientation ### not methylated Cs (any context) will receive a reverse (-) orientation if ($methylation_calls[$index] eq 'X') { #$counting{total_meCHG_count}++; if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'x') { #$counting{total_unmethylated_CHG_count}++; if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'Z') { #$counting{total_meCpG_count}++; if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'z') { #$counting{total_unmethylated_CpG_count}++; if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'H') { #$counting{total_meCHH_count}++; if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq 'h') { #$counting{total_unmethylated_CHH_count}++; if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; }else{ $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; } } elsif ($methylation_calls[$index] eq '.'){ } else{ die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; } } } else { die "This cannot happen (or at least it shouldn't....)\n"; } } sub write_hash(){ my ($chrom) = @_; for my $k1 (sort {$a <=> $b} (keys %bases)){ my $coverage = 0; my $methylated = 0; my $context; for my $k2 (keys %{$bases{$k1}}){ for my $k3 (keys %{$bases{$k1}{$k2}}){ if ($k3 eq "Z" or $k3 eq "z"){ $context = "CPG"; }elsif ($k3 eq "H" or $k3 eq "h"){ $context = "CHH"; }elsif ($k3 eq "X" or $k3 eq "x"){ $context = "CHG"; } #test, if $k3 is upper-case (==methylated base) if ($k3 eq "\U$k3"){ $methylated+=$bases{$k1}{$k2}{$k3}; } $coverage+=$bases{$k1}{$k2}{$k3}; } my $percentage = ($methylated / $coverage)*100; ##################################################################################### ### BED file format tabs: ### ### 1. Chromosome ### ### 2. Start position ### ### 3. Strand ### ### 4. Context (CpG,CHH,CHG) ### ### 5. Coverage ### ### 6. Methylated Cs ### ### 7. Percentage methylated Cs ### ##################################################################################### printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); } } %bases = (); } sub write_hash_back(){ my ($chrom) = @_; for my $k1 (sort (keys %bases)){ my $coverage = 0; my $methylated = 0; my $context; my $k2 = (keys %{$bases{$k1}})[0]; for my $k3 (keys %{$bases{$k1}{$k2}}){ if ($k3 eq "Z" or $k3 eq "z"){ $context = "CPG"; }elsif ($k3 eq "H" or $k3 eq "h"){ $context = "CHH"; }elsif ($k3 eq "X" or $k3 eq "x"){ $context = "CHG"; } #test, if $k3 is upper-case (==methylated base) if ($k3 eq "\U$k3"){ $methylated+=$bases{$k1}{$k2}{$k3}; } $coverage+=$bases{$k1}{$k2}{$k3}; } my $percentage = ($methylated / $coverage)*100; ################################################################################### ### BED file format tabs: ### ### 1. Chromosome ### ### 2. Start position ### ### 3. Strand ### ### 4. Context (CpG,CHH,CHG) ### ### 5. Coverage ### ### 6. Methylated Cs ### ### 7. Percentage methylated Cs ### ################################################################################### printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); } %bases = (); }