view methylation_extractor @ 17:59a61c1d5aee draft default tip

Uploaded
author eugen
date Thu, 16 Aug 2012 08:11:06 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl
use Getopt::Long;
use Cwd;

my @filenames;
my %counting;

my %fhs;
my %bases;
my ($single,$paired,$output_file) = process_commandline();
my $parent_dir = getcwd;

process_Bismark_results_file();

sub process_commandline{
  my $help;
  my $single_end;
  my $paired_end;
  my $ignore;
  my $genomic_fasta;
  my $full;
  my $report;
  my $extractor_version;
  my $no_overlap;
  my $merge_non_CpG;
  my $vanilla;
  my $output_file;

  my $command_line = GetOptions ('p|paired-end' => \$paired_end,
				 's|single-end' => \$single_end,
				 'o|output=s' => \$output_file
				);

  ### EXIT ON ERROR if there were errors with any of the supplied options
  unless ($command_line){
    die "Please respecify command line options\n";
  }

 
  ### no files provided
  unless (@ARGV){
    die "You need to provide one or more Bismark files to create an individual C methylation output. Please respecify!\n";
  }
  @filenames = @ARGV;

  if ($single_end){
    $paired_end = 0;   ### SINGLE END ALIGNMENTS
  }
  elsif ($paired_end){
    $single_end = 0;   ### PAIRED-END ALIGNMENTS
  }
  else{
    die "Please specify whether the supplied file(s) are in Bismark single-end or paired-end format\n\n";
  }

  return ($single_end,$paired_end, $output_file);
}

sub process_Bismark_results_file{
  chdir $parent_dir or die "Unable to move to initial working directory $!\n";
  foreach my $filename (@filenames){
    %fhs = ();
    %counting =(
		total_meCHG_count => 0,
		total_meCHH_count => 0,
		total_meCpG_count => 0,
		total_unmethylated_CHG_count => 0,
		total_unmethylated_CHH_count => 0,
		total_unmethylated_CpG_count => 0,
		sequences_count => 0,
	       );
    warn "\nNow reading in Bismark result file $filename\n\n";
    open (IN,$filename) or die "Can't open file $filename: $!\n";

    if($output_file){
      $full_output=$output_file;
    }
    else{
      #print "lbubb";
      my($dir, $output_filename, $extension) = $filename =~ m/(.*\/)(.*)(\..*)$/;  
      my $cpg_output = my $other_c_output = my $full_output = $output_filename;
      $full_output =~ s/^/Full_/;
      $full_output = $dir."/".$full_output.".bed";
    }
    
    ### OPENING OUTPUT-FILEHANDLES
    open ($fhs{full_output},'>',$full_output) or die "Failed to write to $full_output $!\n";
  
    ### Write header
    printf {$fhs{full_output}} ("chr\tpos\tstrand\tcontext\tcoverage\tmethylated\tpercentage methylated\n");
 
    my $methylation_call_strings_processed = 0;
    my $line_count = 0;


    ### PROCESSING SINGLE-END RESULT FILES
    if ($single){
   
		while (<IN>){
	 		 ### SAM format can either start with header lines (starting with @) or start with alignments directly
		  if (/^\@/){ # skipping header lines (starting with @)
		    warn "skipping SAM header line:\t$_";
		    next;
		  }

		  ++$line_count;
		  warn "Processed lines: $line_count\n" if ($line_count%500000==0);
	
		  my ($id,$chrom,$start,$meth_call,$read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
	  
		  $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/);
		  $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/);

		  if ("\U$chrom" !~ /CHR.*/){
		    $chrom = "chr".$chrom;
		  }

		  if (!defined $last_chrom){
		    print "Now parsing data from $chrom\n";
		  }		

		  if (defined($last_chrom) and $last_chrom ne $chrom){
		    print "Parsed: $last_chrom\n";
		    write_hash($last_chrom);
		    print "Now parsing data from $chrom\n";
		  }

		  $meth_call =~ s/^XM:Z://;
		  $read_conversion =~ s/^XR:Z://;
		  $genome_conversion =~ s/^XG:Z://;	
		  my $strand;
		  chomp $genome_conversion;

	
	  	my $index;
		  if ($meth_call){
		    if ($read_conversion eq 'CT' and $genome_conversion eq 'CT'){ ## original top strand
		      $index = 0;
		      $strand = '+';
		    }	
		    elsif ($read_conversion eq 'GA' and $genome_conversion eq 'CT'){ ## complementary to original top strand
		      $index = 1;
		      $strand = '-';
		    }
		    elsif ($read_conversion eq 'GA' and $genome_conversion eq 'GA'){ ## complementary to original bottom strand
		      $index = 2;
		      $strand = '+';
		    }
		    elsif ($read_conversion eq 'CT' and $genome_conversion eq 'GA'){ ## original bottom strand
		      $index = 3;
		      $strand = '-';
		    }
		    else {
		      die "Unexpected combination of read and genome conversion: $read_conversion / $genome_conversion\n";
		    }
	
		    ### If the read is in SAM format we need to reverse the methylation call if the read has been reverse-complemented for the output
		    if ($strand eq '-'){
		      $meth_call = reverse $meth_call;
		    }
		  }


		  ### printing out the methylation state of every C in the read
		  hash_individual_C_methylation_states_single_end($meth_call,$chrom,$start,$id,$strand,$index);
		
		  ++$methylation_call_strings_processed; # 1 per single-end result
	
		  $last_chrom = $chrom;
    }
      write_hash($last_chrom);
      print "\n\nDone!";
  }

    ### PROCESSING PAIRED-END RESULT FILES
    elsif ($paired){
	while (<IN>){
	  ### SAM format can either start with header lines (starting with @) or start with alignments directly
	  if (/^\@/){ # skipping header lines (starting with @)
	    warn "skipping SAM header line:\t$_";
	    next;
	  }
	
	  ++$line_count;
	  # example paired-end reads in SAM format (2 consecutive lines)
	  # 1_R1/1	67	5	103172224	255	40M	=	103172417	233	AATATTTTTTTTATTTTAAAATGTGTATTGATTTAAATTT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:4	XX:Z:4T1T24TT7	XM:Z:....h.h........................hh.......	XR:Z:CT	XG:Z:CT
	  # 1_R1/2	131	5	103172417	255	40M	=	103172224	-233	TATTTTTTTTTAGAGTATTTTTTAATGGTTATTAGATTTT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:6	XX:Z:T5T1T9T9T7T3	XM:Z:h.....h.h.........h.........h.......h...	XR:Z:GA	XG:Z:CT
	
	
	  my ($id_1,$chrom,$start_read_1,$meth_call_1,$first_read_conversion,$genome_conversion) = (split("\t"))[0,2,3,13,14,15];
	  $_ = <IN>; # reading in the paired read
	  my ($id_2,$start_read_2,$meth_call_2,$second_read_conversion) = (split("\t"))[0,3,13,14];
		

	  $chrom = "chrC" if ("\U$chrom" =~ /CHLOR.*/);
	  $chrom = "chrM" if ("\U$chrom" =~ /MITOCH.*/);
	  if ("\U$chrom" !~ /CHR.*/){
	    $chrom = "chr".$chrom;
	  }
	  if (!defined $last_chrom){
	    print "Now parsing data from $chrom\n";
	  }		
	  if (defined($last_chrom) and $last_chrom ne $chrom){
	    print "Parsed: $last_chrom\n";
	    write_hash($last_chrom);
	    print "Now parsing data from $chrom\n";
	  }
	  $meth_call_1 =~ s/^XM:Z://;
	  $meth_call_2 =~ s/^XM:Z://;
	  $first_read_conversion =~ s/^XR:Z://;
	  $second_read_conversion =~ s/^XR:Z://;
	  $genome_conversion =~ s/^XG:Z://;	
	  chomp $genome_conversion;	
	 
		
	  my $index;
	  my $strand;

	  if ($first_read_conversion eq 'CT' and $genome_conversion eq 'CT'){
	    $index = 0; ## this is OT
	    $strand = '+';
	  }	
	  elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'CT'){
	    $index = 1; ## this is CTOT
	    $strand = '-';
	  }
	  elsif ($first_read_conversion eq 'GA' and $genome_conversion eq 'GA'){
	    $index = 2; ## this is CTOB
	    $strand = '+';
	  }
	  elsif ($first_read_conversion eq 'CT' and $genome_conversion eq 'GA'){
	    $index = 3; ## this is OB
	    $strand = '-';
	  }
	  else {
	    die "Unexpected combination of read and genome conversion: $first_read_conversion / $genome_conversion\n";
	  }

	  ### reversing the methylation call of the read that was reverse-complemented
	  if ($strand eq '+'){
	   # $meth_call_2 = reverse $meth_call_2;
	  }
	  else{
	    #$meth_call_1 = reverse $meth_call_1;
	  }

	  if ($meth_call_1 and $meth_call_2){
	    ### Clipping off the first <int> number of bases from the methylation call strings as specified with '--ignore <int>'
	    my $end_read_1;
	
	    if ($strand eq '+'){
	      $end_read_1 = $start_read_1+length($meth_call_1)-1;
	      #$start_read_2 += length($meth_call_2)-1; ## $meth_call_2 is already shortened! Passing on the start position on the reverse strand
	    }
	    else{
				$end_read_1 = $start_read_1+length($meth_call_1)-1;
	      #$end_read_1 = $start_read_1;	
	      #$start_read_1 += length($meth_call_1)-1; ## $meth_call_1 is already shortened! Passing on the start position on the reverse strand
	    }

	    if ($strand eq '+'){
	      ## we first pass the first read which is in + orientation on the forward strand
	      hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'+',$index,0);
	
	      # we next pass the second read which is in - orientation on the reverse strand
	      ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we can stop extracting methylation calls from read 2
	      hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'+',$index,$end_read_1);
	    }
	
	    else{
	      ## we first pass the first read which is in - orientation on the reverse strand
	      hash_individual_C_methylation_states_paired_end_files($meth_call_1,$chrom,$start_read_1,$id_1,'-',$index,0);
	
	      # we next pass the second read which is in + orientation on the forward strand
	      ### if --no_overlap was specified we also pass the end of read 1. If read 2 starts to overlap with read 1 we will stop extracting methylation calls from read 2
	      hash_individual_C_methylation_states_paired_end_files($meth_call_2,$chrom,$start_read_2,$id_2,'-',$index,$end_read_1);
	    }
	
	    $methylation_call_strings_processed += 2; # paired-end = 2 methylation calls
	  }	
	  $last_chrom = $chrom;
      }
      write_hash($last_chrom);
      print "\n\nDone!";
    }
    else{
      die "Single-end or paired-end reads not specified properly\n";
    }

    print "\n\nProcessed $line_count lines from $filename in total\n";
    print "Total number of methylation call strings processed: $methylation_call_strings_processed\n\n";
  }
}


sub hash_individual_C_methylation_states_paired_end_files{

  my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index,$end_read_1) = @_;
  my @methylation_calls = split(//,$meth_call);

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  #################################################################

  #my $methyl_CHG_count = 0;
  #my $methyl_CHH_count = 0;
  #my $methyl_CpG_count = 0;
  #my $unmethylated_CHG_count = 0;
  #my $unmethylated_CHH_count = 0;
  #my $unmethylated_CpG_count = 0;

  ### THIS IS AN OPTIONAL 2-CONTEXT (CpG and $methylation_calls[$indexnon-CpG) SECTION IF --merge_non_CpG was specified

if ($strand eq '+') {		      
	  for my $index (0..$#methylation_calls) {
	
	    ### Returning as soon as the methylation calls start overlapping
	    #if ($start+$index >= $end_read_1) {
	    #  return;
	    #}

	  if ($methylation_calls[$index] eq 'X') {
	      #$counting{total_meCHG_count}++;
		if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'x') {
	      #$counting{total_unmethylated_CHG_count}++;
		if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'Z') {
	      #$counting{total_meCpG_count}++;
		if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'z') {
	      #$counting{total_unmethylated_CpG_count}++;
	      	if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'H') {
	      #$counting{total_meCHH_count}++;
	      	if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'h') {
	      #$counting{total_unmethylated_CHH_count}++;
	      	if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    }
	    elsif ($methylation_calls[$index] eq '.'){}
	    else{
	      die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	    }
	  }
	} elsif ($strand eq '-') {
	  for my $index (0..$#methylation_calls) {
	
	    ### Returning as soon as the methylation calls start overlapping
	    #if ($start-$index <= $end_read_1) {
	    #  return;
	    #}
=head1
			if ($start-$index == 100){
				print "\nStrand: -   ID: $id\n";
			}

	    if ($methylation_calls[$index] eq 'X') {
	      #$counting{total_meCHG_count}++;
	  if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'x') {
	      #$counting{total_unmethylated_CHG_count}++;
	      	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'Z') {
	      #$counting{total_meCpG_count}++;
	  if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'z') {
	      #$counting{total_unmethylated_CpG_count}++;
	     	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'H') {
	     # $counting{total_meCHH_count}++;
	      	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'h') {
	      #$counting{total_unmethylated_CHH_count}++;
	      	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    }
	    elsif ($methylation_calls[$index] eq '.') {}
	    else{
	      die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	    }
	  }
=cut
	  if ($methylation_calls[$index] eq 'X') {
	      #$counting{total_meCHG_count}++;
	  if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'x') {
	      #$counting{total_unmethylated_CHG_count}++;
	      	if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'Z') {
	      #$counting{total_meCpG_count}++;
	  if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'z') {
	      #$counting{total_unmethylated_CpG_count}++;
	     	if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'H') {
	     # $counting{total_meCHH_count}++;
	      	if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'h') {
	      #$counting{total_unmethylated_CHH_count}++;
	      	if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    }
	    elsif ($methylation_calls[$index] eq '.') {}
	    else{
	      die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	    }
	  }
	} else {
	  die "This cannot happen $!\n";
	}

=head1
	if ($strand eq '+') {		      
	  for my $index (0..$#methylation_calls) {
	
	    ### Returning as soon as the methylation calls start overlapping
	    #if ($start+$index >= $end_read_1) {
	    #  return;
	    #}
	    if ($methylation_calls[$index] eq 'X') {
	      #$counting{total_meCHG_count}++;
		if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'x') {
	      #$counting{total_unmethylated_CHG_count}++;
		if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'Z') {
	      #$counting{total_meCpG_count}++;
		if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'z') {
	      #$counting{total_unmethylated_CpG_count}++;
	      	if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'H') {
	      #$counting{total_meCHH_count}++;
	      	if (defined $bases{$start+$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'h') {
	      #$counting{total_unmethylated_CHH_count}++;
	      	if (defined $bases{$start+$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start+$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    }
	    elsif ($methylation_calls[$index] eq '.'){}
	    else{
	      die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	    }
	  }
	} elsif ($strand eq '-') {
	  for my $index (0..$#methylation_calls) {
	
	    ### Returning as soon as the methylation calls start overlapping
	    #if ($start-$index <= $end_read_1) {
	    #  return;
	    #}

	    if ($methylation_calls[$index] eq 'X') {
	      #$counting{total_meCHG_count}++;
	      	if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'x') {
	      #$counting{total_unmethylated_CHG_count}++;
	      	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'Z') {
	      #$counting{total_meCpG_count}++;
	      	if (defined $bases{$start-$index}{"+"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'z') {
	      #$counting{total_unmethylated_CpG_count}++;
	     	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'H') {
	     # $counting{total_meCHH_count}++;
	      	if (defined $bases{$start-$=cutindex}{"+"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"+"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"+"}{$methylation_calls[$index]}=1;
		}
	    } elsif ($methylation_calls[$index] eq 'h') {
	      #$counting{total_unmethylated_CHH_count}++;
	      	if (defined $bases{$start-$index}{"-"}{$methylation_calls[$index]}){
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}++;
		}else{
		  $bases{$start-$index}{"-"}{$methylation_calls[$index]}=1;
		}
	    }
	    elsif ($methylation_calls[$index] eq '.') {}
	    else{
	      die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	    }
	  }
	} else {
	  die "This cannot happen $!\n";
	}
=cut
} 



sub hash_individual_C_methylation_states_single_end{

  my ($meth_call,$chrom,$start,$id,$strand,$filehandle_index) = @_;
  my @methylation_calls = split(//,$meth_call);

  #################################################################
  ### . for bases not involving cytosines                       ###
  ### X for methylated C in CHG context (was protected)         ###
  ### x for not methylated C in CHG context (was converted)     ###
  ### H for methylated C in CHH context (was protected)         ###
  ### h for not methylated C in CHH context (was converted)     ###
  ### Z for methylated C in CpG context (was protected)         ###
  ### z for not methylated C in CpG context (was converted)     ###
  #################################################################

  #my $methyl_CHG_count = 0;
  #my $methyl_CHH_count = 0;
  #my $methyl_CpG_count = 0;
  #my $unmethylated_CHG_count = 0;
  #my $unmethylated_CHH_count = 0;
  #my $unmethylated_CpG_count = 0;

### single-file CpG and other-context output
    if ($strand eq '+') {
      for my $index (0..$#methylation_calls) {

	### methylated Cs (any context) will receive a forward (+) orientation
	### not methylated Cs (any context) will receive a reverse (-) orientation
	if ($methylation_calls[$index] eq 'X') {
	  #$counting{total_meCHG_count}++;
	  if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'x') {
	  #$counting{total_unmethylated_CHG_count}++;
	  if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'Z') {
	  #$counting{total_meCpG_count}++;
	  if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'z') {
	  #$counting{total_unmethylated_CpG_count}++;
	 if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'H') {
	  #$counting{total_meCHH_count}++;
	  if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'h') {
	  #$counting{total_unmethylated_CHH_count}++;
	  if (defined $bases{$start+$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start+$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq '.') {
	}
	else{
	  die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	}
      }
    }
    elsif ($strand eq '-') {

      unless ($ignore){  ### if --ignore was specified the start position has already been corrected
	$start += length($meth_call)-1;
      }

      for my $index (0..$#methylation_calls) {
	### methylated Cs (any context) will receive a forward (+) orientation
	### not methylated Cs (any context) will receive a reverse (-) orientation
	if ($methylation_calls[$index] eq 'X') {
	  #$counting{total_meCHG_count}++;
	  if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'x') {
	  #$counting{total_unmethylated_CHG_count}++;
	  if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'Z') {
	  #$counting{total_meCpG_count}++;
	  if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'z') {
	  #$counting{total_unmethylated_CpG_count}++;
	  if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'H') {
	  #$counting{total_meCHH_count}++;
	  if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq 'h') {
	  #$counting{total_unmethylated_CHH_count}++;
	  if (defined $bases{$start-$index}{$strand}{$methylation_calls[$index]}){
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}++;
	  }else{
		  $bases{$start-$index}{$strand}{$methylation_calls[$index]}=1;
	  }
	}
	elsif ($methylation_calls[$index] eq '.'){
	}
	else{
	  die "The methylation call string contained the following unrecognised character: $methylation_calls[$index]\n";
	}
      }
    }
    else {
      die "This cannot happen (or at least it shouldn't....)\n";
    }
}

sub write_hash(){
	my ($chrom) = @_;
	
	for my $k1 (sort {$a <=> $b} (keys %bases)){
		my $coverage = 0;
		my $methylated = 0;
		my $context;
		for my $k2 (keys %{$bases{$k1}}){
		   
		      for my $k3 (keys %{$bases{$k1}{$k2}}){
			      if ($k3 eq "Z" or $k3 eq "z"){
				      $context = "CPG";
			      }elsif ($k3 eq "H" or $k3 eq "h"){
				      $context = "CHH";
			      }elsif ($k3 eq "X" or $k3 eq "x"){
				      $context = "CHG";
			      }

			      #test, if $k3 is upper-case (==methylated base)			
			      if ($k3 eq "\U$k3"){
				      $methylated+=$bases{$k1}{$k2}{$k3};
			      }
			      $coverage+=$bases{$k1}{$k2}{$k3};

		      }
		      my $percentage = ($methylated / $coverage)*100;

		      #####################################################################################
		      ### BED file format tabs:								###
		      ### 1. Chromosome									###
		      ### 2. Start position								###
		      ### 3. Strand									###
		      ### 4. Context (CpG,CHH,CHG)							###
		      ### 5. Coverage									###
		      ### 6. Methylated Cs								###
		      ### 7. Percentage methylated Cs							###
		      #####################################################################################

		      printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage );	
		}
	}
	%bases = ();	
}


sub write_hash_back(){
	my ($chrom) = @_;
	
	for my $k1 (sort (keys %bases)){
		my $coverage = 0;
		my $methylated = 0;
		my $context;
		my $k2 = (keys %{$bases{$k1}})[0];
		for my $k3 (keys %{$bases{$k1}{$k2}}){
			if ($k3 eq "Z" or $k3 eq "z"){
				$context = "CPG";
			}elsif ($k3 eq "H" or $k3 eq "h"){
				$context = "CHH";
			}elsif ($k3 eq "X" or $k3 eq "x"){
				$context = "CHG";
			}

			#test, if $k3 is upper-case (==methylated base)			
			if ($k3 eq "\U$k3"){
				$methylated+=$bases{$k1}{$k2}{$k3};
			}
			$coverage+=$bases{$k1}{$k2}{$k3};

		}
		my $percentage = ($methylated / $coverage)*100;

		###################################################################################
		### BED file format tabs:							###
		### 1. Chromosome								###
		### 2. Start position								###
		### 3. Strand									###
		### 4. Context (CpG,CHH,CHG)							###
		### 5. Coverage									###
		### 6. Methylated Cs								###
		### 7. Percentage methylated Cs							###
		###################################################################################

		printf {$fhs{full_output}} ("%s\t%u\t%s\t%s\t%u\t%u\t%.2f\n", $chrom, $k1, $k2, $context, $coverage, $methylated, $percentage );	

	}
	%bases = ();	
}