Mercurial > repos > eugen > bs_seq_test_1
changeset 17:59a61c1d5aee draft default tip
Uploaded
author | eugen |
---|---|
date | Thu, 16 Aug 2012 08:11:06 -0400 |
parents | bfda928ca5d8 |
children | |
files | methylation_extractor |
diffstat | 1 files changed, 850 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/methylation_extractor Thu Aug 16 08:11:06 2012 -0400 @@ -0,0 +1,850 @@ +#!/usr/bin/perl +use Getopt::Long; +use Cwd; + +my @filenames; +my %counting; + +my %fhs; +my %bases; +my ($single,$paired,$output_file) = process_commandline(); +my $parent_dir = getcwd; + +process_Bismark_results_file(); + +sub process_commandline{ + my $help; + my $single_end; + my $paired_end; + my $ignore; + my $genomic_fasta; + my $full; + my $report; + my $extractor_version; + my $no_overlap; + my $merge_non_CpG; + my $vanilla; + my $output_file; + + my $command_line = GetOptions ('p|paired-end' => \$paired_end, + 's|single-end' => \$single_end, + 'o|output=s' => \$output_file + ); + + ### EXIT ON ERROR if there were errors with any of the supplied options + unless ($command_line){ + die "Please respecify command line options\n"; + } + + + ### no files provided + unless (@ARGV){ + die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n"; + } + @filenames = @ARGV; + + if ($single_end){ + $paired_end = 0; ### SINGLE END ALIGNMENTS + } + elsif ($paired_end){ + $single_end = 0; ### PAIRED-END ALIGNMENTS + } + else{ + die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n"; + } + + return ($single_end,$paired_end, $output_file); +} + +sub process_Bismark_results_file{ + chdir $parent_dir or die "Unable to move to initial working directory $!\n"; + foreach my $filename (@filenames){ + %fhs = (); + %counting =( + total_meCHG_count => 0, + total_meCHH_count => 0, + total_meCpG_count => 0, + total_unmethylated_CHG_count => 0, + total_unmethylated_CHH_count => 0, + total_unmethylated_CpG_count => 0, + sequences_count => 0, + ); + warn "\nNow reading in Bismark result file $filename\n\n"; + open (IN,$filename) or die "Can't open file $filename: $!\n"; + + if($output_file){ + $full_output=$output_file; + } + else{ + #print "lbubb"; + my($dir, $output_filename, $extension) = $filename =~ m/(.*\/)(.*)(\..*)$/; + my $cpg_output = my $other_c_output = my $full_output = $output_filename; + $full_output =~ s/^/Full_/; + $full_output = $dir."/".$full_output.".bed"; + } + + ### OPENING OUTPUT-FILEHANDLES + open ($fhs{full_output},'>',$full_output) or die "Failed to write to $full_output $!\n"; + + ### Write header + printf {$fhs{full_output}} ("chr\tpos\tstrand\tcontext\tcoverage\tmethylated\tpercentage methylated\n"); + + my $methylation_call_strings_processed = 0; + my $line_count = 0; + + + ### PROCESSING SINGLE-END RESULT FILES + if ($single){ + + while (<IN>){ + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/){ # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + warn "Processed lines: $line_count\n" if ($line_count%500000==0); + + my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + + $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); + $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); + + if ("\U$chrom" !~ /CHR.*/){ + $chrom = "chr".$chrom; + } + + if (!defined $last_chrom){ + print "Now parsing data from $chrom\n"; + } + + if (defined($last_chrom) and $last_chrom ne $chrom){ + print "Parsed: $last_chrom\n"; + write_hash($last_chrom); + print "Now parsing data from $chrom\n"; + } + + $meth_call =~ s/^XM:Z://; + $read_conversion =~ s/^XR:Z://; + $genome_conversion =~ s/^XG:Z://; + my $strand; + chomp $genome_conversion; + + + my $index; + if ($meth_call){ + if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand + $index = 0; + $strand = '+'; + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand + $index = 1; + $strand = '-'; + } + elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand + $index = 2; + $strand = '+'; + } + elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand + $index = 3; + $strand = '-'; + } + else { + die "Unexpected combination of read and genome conversion: $read_conversion / $genome_conversion\n"; + } + + ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output + if ($strand eq '-'){ + $meth_call = reverse $meth_call; + } + } + + + ### printing out the methylation state of every C in the read + hash_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index); + + ++$methylation_call_strings_processed; # 1 per single-end result + + $last_chrom = $chrom; + } + write_hash($last_chrom); + print "\n\nDone!"; + } + + ### PROCESSING PAIRED-END RESULT FILES + elsif ($paired){ + while (<IN>){ + ### SAM format can either start with header lines (starting with @) or start with alignments directly + if (/^\@/){ # skipping header lines (starting with @) + warn "skipping SAM header line:\t$_"; + next; + } + + ++$line_count; + # example paired-end reads in SAM format (2 consecutive lines) + # 1_R1/1 67 5 103172224 255 40M = 103172417 233 AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:4 XX:Z:4T1T24TT7 XM:Z:....h.h........................hh....... XR:Z:CT XG:Z:CT + # 1_R1/2 131 5 103172417 255 40M = 103172224 -233 TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII NM:i:6 XX:Z:T5T1T9T9T7T3 XM:Z:h.....h.h.........h.........h.......h... XR:Z:GA XG:Z:CT + + + my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15]; + $_ = <IN>; # reading in the paired read + my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14]; + + + $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/); + $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/); + if ("\U$chrom" !~ /CHR.*/){ + $chrom = "chr".$chrom; + } + if (!defined $last_chrom){ + print "Now parsing data from $chrom\n"; + } + if (defined($last_chrom) and $last_chrom ne $chrom){ + print "Parsed: $last_chrom\n"; + write_hash($last_chrom); + print "Now parsing data from $chrom\n"; + } + $meth_call_1 =~ s/^XM:Z://; + $meth_call_2 =~ s/^XM:Z://; + $first_read_conversion =~ s/^XR:Z://; + $second_read_conversion =~ s/^XR:Z://; + $genome_conversion =~ s/^XG:Z://; + chomp $genome_conversion; + + + my $index; + my $strand; + + if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){ + $index = 0; ## this is OT + $strand = '+'; + } + elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){ + $index = 1; ## this is CTOT + $strand = '-'; + } + elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){ + $index = 2; ## this is CTOB + $strand = '+'; + } + elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){ + $index = 3; ## this is OB + $strand = '-'; + } + else { + die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n"; + } + + ### reversing the methylation call of the read that was reverse-complemented + if ($strand eq '+'){ + # $meth_call_2 = reverse $meth_call_2; + } + else{ + #$meth_call_1 = reverse $meth_call_1; + } + + if ($meth_call_1 and $meth_call_2){ + ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>' + my $end_read_1; + + if ($strand eq '+'){ + $end_read_1 = $start_read_1+length($meth_call_1)-1; + #$start_read_2 += length($meth_call_2)-1; ## $meth_call_2 is already shortened! Passing on the start position on the reverse strand + } + else{ + $end_read_1 = $start_read_1+length($meth_call_1)-1; + #$end_read_1 = $start_read_1; + #$start_read_1 += length($meth_call_1)-1; ## $meth_call_1 is already shortened! Passing on the start position on the reverse strand + } + + if ($strand eq '+'){ + ## we first pass the first read which is in + orientation on the forward strand + hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0); + + # we next pass the second read which is in - orientation on the reverse strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2 + hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$end_read_1); + } + + else{ + ## we first pass the first read which is in - orientation on the reverse strand + hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0); + + # we next pass the second read which is in + orientation on the forward strand + ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2 + hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$end_read_1); + } + + $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls + } + $last_chrom = $chrom; + } + write_hash($last_chrom); + print "\n\nDone!"; + } + else{ + die "Single-end or paired-end reads not specified properly\n"; + } + + print "\n\nProcessed $line_count lines from $filename in total\n"; + print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n"; + } +} + + +sub hash_individual_C_methylation_states_paired_end_files{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$end_read_1) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + #my $methyl_CHG_count = 0; + #my $methyl_CHH_count = 0; + #my $methyl_CpG_count = 0; + #my $unmethylated_CHG_count = 0; + #my $unmethylated_CHH_count = 0; + #my $unmethylated_CpG_count = 0; + + ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and $methylation_calls[$indexnon-CpG) SECTION IF --merge_non_CpG was specified + +if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + ### Returning as soon as the methylation calls start overlapping + #if ($start+$index >= $end_read_1) { + # return; + #} + + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'H') { + #$counting{total_meCHH_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + ### Returning as soon as the methylation calls start overlapping + #if ($start-$index <= $end_read_1) { + # return; + #} +=head1 + if ($start-$index == 100){ + print "\nStrand: - ID: $id\n"; + } + + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'H') { + # $counting{total_meCHH_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } +=cut + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'H') { + # $counting{total_meCHH_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "This cannot happen $!\n"; + } + +=head1 + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + ### Returning as soon as the methylation calls start overlapping + #if ($start+$index >= $end_read_1) { + # return; + #} + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'H') { + #$counting{total_meCHH_count}++; + if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.'){} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } elsif ($strand eq '-') { + for my $index (0..$#methylation_calls) { + + ### Returning as soon as the methylation calls start overlapping + #if ($start-$index <= $end_read_1) { + # return; + #} + + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'H') { + # $counting{total_meCHH_count}++; + if (defined $bases{$start-$=cutindex}{"+"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"+"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1; + } + } elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.') {} + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } else { + die "This cannot happen $!\n"; + } +=cut +} + + + +sub hash_individual_C_methylation_states_single_end{ + + my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index) = @_; + my @methylation_calls = split(//,$meth_call); + + ################################################################# + ### . for bases not involving cytosines ### + ### X for methylated C in CHG context (was protected) ### + ### x for not methylated C in CHG context (was converted) ### + ### H for methylated C in CHH context (was protected) ### + ### h for not methylated C in CHH context (was converted) ### + ### Z for methylated C in CpG context (was protected) ### + ### z for not methylated C in CpG context (was converted) ### + ################################################################# + + #my $methyl_CHG_count = 0; + #my $methyl_CHH_count = 0; + #my $methyl_CpG_count = 0; + #my $unmethylated_CHG_count = 0; + #my $unmethylated_CHH_count = 0; + #my $unmethylated_CpG_count = 0; + +### single-file CpG and other-context output + if ($strand eq '+') { + for my $index (0..$#methylation_calls) { + + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'H') { + #$counting{total_meCHH_count}++; + if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.') { + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + elsif ($strand eq '-') { + + unless ($ignore){ ### if --ignore was specified the start position has already been corrected + $start += length($meth_call)-1; + } + + for my $index (0..$#methylation_calls) { + ### methylated Cs (any context) will receive a forward (+) orientation + ### not methylated Cs (any context) will receive a reverse (-) orientation + if ($methylation_calls[$index] eq 'X') { + #$counting{total_meCHG_count}++; + if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'x') { + #$counting{total_unmethylated_CHG_count}++; + if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'Z') { + #$counting{total_meCpG_count}++; + if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'z') { + #$counting{total_unmethylated_CpG_count}++; + if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'H') { + #$counting{total_meCHH_count}++; + if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq 'h') { + #$counting{total_unmethylated_CHH_count}++; + if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}++; + }else{ + $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1; + } + } + elsif ($methylation_calls[$index] eq '.'){ + } + else{ + die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n"; + } + } + } + else { + die "This cannot happen (or at least it shouldn't....)\n"; + } +} + +sub write_hash(){ + my ($chrom) = @_; + + for my $k1 (sort {$a <=> $b} (keys %bases)){ + my $coverage = 0; + my $methylated = 0; + my $context; + for my $k2 (keys %{$bases{$k1}}){ + + for my $k3 (keys %{$bases{$k1}{$k2}}){ + if ($k3 eq "Z" or $k3 eq "z"){ + $context = "CPG"; + }elsif ($k3 eq "H" or $k3 eq "h"){ + $context = "CHH"; + }elsif ($k3 eq "X" or $k3 eq "x"){ + $context = "CHG"; + } + + #test, if $k3 is upper-case (==methylated base) + if ($k3 eq "\U$k3"){ + $methylated+=$bases{$k1}{$k2}{$k3}; + } + $coverage+=$bases{$k1}{$k2}{$k3}; + + } + my $percentage = ($methylated / $coverage)*100; + + ##################################################################################### + ### BED file format tabs: ### + ### 1. Chromosome ### + ### 2. Start position ### + ### 3. Strand ### + ### 4. Context (CpG,CHH,CHG) ### + ### 5. Coverage ### + ### 6. Methylated Cs ### + ### 7. Percentage methylated Cs ### + ##################################################################################### + + printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); + } + } + %bases = (); +} + + +sub write_hash_back(){ + my ($chrom) = @_; + + for my $k1 (sort (keys %bases)){ + my $coverage = 0; + my $methylated = 0; + my $context; + my $k2 = (keys %{$bases{$k1}})[0]; + for my $k3 (keys %{$bases{$k1}{$k2}}){ + if ($k3 eq "Z" or $k3 eq "z"){ + $context = "CPG"; + }elsif ($k3 eq "H" or $k3 eq "h"){ + $context = "CHH"; + }elsif ($k3 eq "X" or $k3 eq "x"){ + $context = "CHG"; + } + + #test, if $k3 is upper-case (==methylated base) + if ($k3 eq "\U$k3"){ + $methylated+=$bases{$k1}{$k2}{$k3}; + } + $coverage+=$bases{$k1}{$k2}{$k3}; + + } + my $percentage = ($methylated / $coverage)*100; + + ################################################################################### + ### BED file format tabs: ### + ### 1. Chromosome ### + ### 2. Start position ### + ### 3. Strand ### + ### 4. Context (CpG,CHH,CHG) ### + ### 5. Coverage ### + ### 6. Methylated Cs ### + ### 7. Percentage methylated Cs ### + ################################################################################### + + printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage ); + + } + %bases = (); +} + + +